<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Multi_Modal_Deep_Learning_for_Integrating_Diverse_Data_Types.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

# Define multi-modal model (e.g., image and text)
class MultiModalNet(nn.Module):
    """Late-fusion network joining BERT text features with image features.

    The text branch is the pretrained ``bert-base-uncased`` encoder; its
    pooled output is concatenated with the output of a ``Linear -> ReLU``
    image branch, and one linear layer maps the result to ``output_dim``.

    Args:
        text_dim: Width of the text features entering the fusion layer.
            Must equal the hidden size of the loaded BERT model.
        image_dim: Width of each incoming image feature vector.
        hidden_dim: Width of the projected image features.
        output_dim: Number of values produced per example.

    Raises:
        ValueError: If ``text_dim`` does not match the BERT hidden size.
    """

    def __init__(self, text_dim, image_dim, hidden_dim, output_dim):
        super().__init__()
        self.text_model = BertModel.from_pretrained("bert-base-uncased")  # Pretrained BERT model

        # Fail fast: the pooled BERT output has a fixed width, so a wrong
        # text_dim would otherwise only surface as a shape error in forward().
        bert_dim = self.text_model.config.hidden_size
        if text_dim != bert_dim:
            raise ValueError(
                f"text_dim={text_dim} does not match the BERT hidden size ({bert_dim})"
            )

        self.image_model = nn.Sequential(nn.Linear(image_dim, hidden_dim), nn.ReLU())
        self.fc = nn.Linear(text_dim + hidden_dim, output_dim)

    def forward(self, text, image, attention_mask=None):
        """Fuse one batch of text and image inputs.

        Args:
            text: Token IDs passed to the BERT encoder as ``input_ids``.
            image: Image feature tensor whose last dimension is ``image_dim``.
            attention_mask: Optional mask forwarded to the BERT encoder so
                padded positions can be ignored. ``None`` keeps the encoder's
                default behaviour.

        Returns:
            The output of the fusion layer applied to the concatenated
            text and image features.
        """
        encoded = self.text_model(input_ids=text, attention_mask=attention_mask)
        text_features = encoded.pooler_output            # Text embeddings
        image_features = self.image_model(image)         # Image features
        # Concatenate along the feature axis (dim=1), keeping the batch axis.
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined)

# Smoke test: push one random (text, image) pair through the network.
# One sequence of 16 random token IDs drawn from [0, 30522); the upper bound
# presumably mirrors the bert-base-uncased vocabulary size.
text_input = torch.randint(low=0, high=30522, size=(1, 16))
# One 512-dimensional random vector standing in for image features.
image_input = torch.randn(1, 512)
model = MultiModalNet(
    text_dim=768,
    image_dim=512,
    hidden_dim=256,
    output_dim=2,
)
output = model(text_input, image_input)