In [None]:
!pip install -q nest_asyncio
import nest_asyncio
nest_asyncio.apply()

In [None]:
!pip install -q transformers torch torchvision pillow gradio

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, ViTModel

class MultiModalSentimentModel(nn.Module):
    """Sentiment classifier that fuses one text encoder with one image encoder.

    The first-position hidden state of each encoder is taken as that modality's
    feature vector; the two vectors are concatenated and passed through a small
    MLP head that produces ``num_labels`` logits.

    Args:
        num_labels: Number of output classes of the classification head.
        text_model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
        image_model_name: Checkpoint passed to ``ViTModel.from_pretrained``.
    """

    def __init__(
        self,
        num_labels=3,
        text_model_name="distilbert-base-uncased",
        image_model_name="google/vit-base-patch16-224-in21k",
    ):
        super().__init__()

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.image_encoder = ViTModel.from_pretrained(image_model_name)

        # Size the fusion layer from the encoders themselves instead of assuming
        # 768 + 768, so checkpoints with other hidden widths work as well.
        fused_dim = (
            self.text_encoder.config.hidden_size
            + self.image_encoder.config.hidden_size
        )

        self.classifier = nn.Sequential(
            nn.Linear(fused_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_labels),
        )

    def forward(self, text_input, image_input):
        """Return classification logits for a batch of (text, image) pairs.

        Args:
            text_input: Mapping of keyword arguments that is unpacked into the
                text encoder call (e.g. the output of a tokenizer).
            image_input: Tensor passed to the image encoder as ``pixel_values``.

        Returns:
            The output of the classification head for the concatenated features;
            its last dimension has ``num_labels`` entries.
        """
        # Sequence position 0 of the last hidden state is used as the summary
        # vector for each modality.
        text_outputs = self.text_encoder(**text_input)
        text_feats = text_outputs.last_hidden_state[:, 0, :]

        image_outputs = self.image_encoder(pixel_values=image_input)
        image_feats = image_outputs.last_hidden_state[:, 0, :]

        # Fusion by concatenation along the feature dimension.
        combined_feats = torch.cat((text_feats, image_feats), dim=1)

        return self.classifier(combined_feats)

# Select the compute device: CUDA when PyTorch reports one, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier with its default settings and place it on that device.
model = MultiModalSentimentModel()
model = model.to(device)
print(f"Model loaded on {device}")

In [None]:
from transformers import AutoTokenizer, ViTImageProcessor

# Image-side preprocessor for the ViT checkpoint.
feature_extractor = ViTImageProcessor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)

# Text-side tokenizer for the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased"
)

def preprocess_data(text, image):
    """Turn one (text, image) pair into model-ready inputs on ``device``.

    Args:
        text: Text handed to the module-level ``tokenizer``.
        image: Image handed to the module-level ``feature_extractor``. Objects
            that expose ``convert``/``mode`` (as PIL images do) are converted to
            RGB first when they are in another mode.

    Returns:
        tuple: ``(inputs_text, inputs_image)`` where ``inputs_text`` is the
        tokenizer output moved to ``device`` and ``inputs_image`` is the
        ``pixel_values`` tensor moved to ``device``.

    Raises:
        ValueError: If ``text`` or ``image`` is ``None`` (for example when a UI
            form is submitted without an image).
    """
    # Fail early with a clear message instead of an opaque error from deep
    # inside the tokenizer / image processor.
    if text is None:
        raise ValueError("preprocess_data: `text` must not be None")
    if image is None:
        raise ValueError("preprocess_data: `image` must not be None")

    # Grayscale / RGBA / palette images are normalised to three channels before
    # the image processor sees them. NOTE(review): assumes the ViT checkpoint
    # expects 3-channel input — confirm against its config.
    if hasattr(image, "convert") and getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    inputs_text = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    inputs_image = feature_extractor(images=image, return_tensors="pt")["pixel_values"].to(device)
    return inputs_text, inputs_image

In [None]:
def predict_sentiment(text, image):
    """Score one (text, image) pair with the module-level ``model``.

    Puts ``model`` into eval mode, runs a forward pass without gradient
    tracking and applies a softmax over the class dimension.

    Args:
        text: Text forwarded to ``preprocess_data``.
        image: Image forwarded to ``preprocess_data``.

    Returns:
        dict[str, float]: Probability for each of "Negative", "Neutral" and
        "Positive", in that order.

    Raises:
        ValueError: If the model does not emit exactly one logit per label.
    """
    labels = ["Negative", "Neutral", "Positive"]

    model.eval()
    with torch.no_grad():
        txt_in, img_in = preprocess_data(text, image)
        outputs = model(txt_in, img_in)
        # Only the first (and, for this single-example call, only) row is used.
        probs = torch.nn.functional.softmax(outputs, dim=1)[0].tolist()

    # Guard against a head whose class count differs from the label list;
    # otherwise probabilities would be dropped or an IndexError raised.
    if len(probs) != len(labels):
        raise ValueError(
            f"model produced {len(probs)} class scores but {len(labels)} labels are defined"
        )

    return dict(zip(labels, probs))

In [None]:
import gradio as gr

# Gradio form around predict_sentiment: a caption box and an image upload in,
# a label widget showing up to three class scores out.
demo = gr.Interface(
    predict_sentiment,
    [
        gr.Textbox(placeholder="Enter text (e.g., 'I love this new phone design!')", lines=2),
        gr.Image(label="Upload an associated image", type="pil"),
    ],
    gr.Label(num_top_classes=3),
    description="Upload an image and its caption to see the combined sentiment score.",
    title="Multi-Modal Sentiment Analyzer",
)

# Start the app with Gradio's debug flag switched on.
demo.launch(debug=True)

In [None]:
import torch.optim as optim

def train_to_correct(model, text_list, image_tensors, labels, epochs=10, lr=5e-5):
    """Fine-tune ``model`` on a small set of labelled (text, image) examples.

    Every example is its own optimisation step: it is preprocessed with
    ``preprocess_data``, scored by ``model``, and the cross-entropy loss against
    its label is back-propagated with AdamW. The average loss is printed every
    second epoch.

    Args:
        model: Module called as ``model(text_inputs, image_inputs)``; its
            parameters are updated in place.
        text_list: Sequence of texts, one per example.
        image_tensors: Sequence of images, one per example, each passed as the
            ``image`` argument of ``preprocess_data``.
        labels: Sequence of class indices, one per example.
        epochs: Number of passes over the examples.
        lr: Learning rate for AdamW.

    Returns:
        list[float]: Mean training loss of each epoch, in order. Empty when
        there are no examples.

    Raises:
        ValueError: If the three input sequences differ in length.
    """
    n_examples = len(text_list)
    if not (n_examples == len(image_tensors) == len(labels)):
        raise ValueError(
            "text_list, image_tensors and labels must have the same length "
            f"(got {n_examples}, {len(image_tensors)}, {len(labels)})"
        )
    if n_examples == 0:
        # Nothing to train on; returning here also avoids a division by zero
        # when the average loss is computed.
        return []

    model.train()
    optimizer = optim.AdamW(model.parameters(), lr=lr)  # AdamW is standard for Transformers
    criterion = nn.CrossEntropyLoss()

    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        for text, image, label in zip(text_list, image_tensors, labels):
            optimizer.zero_grad()

            txt_in, img_in = preprocess_data(text, image)
            # Wrap the single label in a list so the target has a batch axis.
            target = torch.tensor([label]).to(device)

            outputs = model(txt_in, img_in)
            loss = criterion(outputs, target)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / n_examples
        history.append(avg_loss)

        if (epoch + 1) % 2 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.4f}")

    return history



In [None]:
import nest_asyncio

# Apply the nest_asyncio patch again before starting another Gradio server
# from this kernel.
nest_asyncio.apply()

# Minimal variant of the interface: default widgets, no title or description.
demo = gr.Interface(
    predict_sentiment,
    [gr.Textbox(), gr.Image(type="pil")],
    gr.Label(),
)

# Render inline in the notebook and request a shareable link; debug is off.
demo.launch(inline=True, share=True, debug=False)

In [None]:
import matplotlib.pyplot as plt


# Loss curve to display. These values are a hard-coded literal.
# NOTE(review): nothing in the visible cells feeds real training output into
# this list — confirm whether it should come from an actual training run.
history_loss = [0.9, 0.7, 0.5, 0.35, 0.2, 0.15, 0.12, 0.1, 0.08, 0.05]

# Draw through an explicit Axes object rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history_loss, marker='o', color='b', linestyle='-')
ax.set_title('Model Correction: Training Loss Over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss (Categorical Cross-Entropy)')
ax.grid(True)
plt.show()