In [2]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import BertTokenizer, VisualBertModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import numpy as np
from PIL import Image

# Load the Excel file and check for NaN values
file_path = 'train_data.xlsx'
df = pd.read_excel(file_path)
df = df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
df['Label_Sentiment'] = df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class MemeDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
        except FileNotFoundError:
            return None
        
        if self.transform:
            image = self.transform(image)
        
        text = self.dataframe.iloc[idx, 1]
        tokens = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'image': image,
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Transformations for the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define the dataset
dataset = MemeDataset(dataframe=df, img_dir='images/', transform=transform)

# Custom collate function to filter out None samples
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

# Define the VisualBERT-based Multimodal Model
class VisualBERTMultimodalModel(nn.Module):
    def __init__(self):
        super(VisualBERTMultimodalModel, self).__init__()
        self.visual_model = models.vgg16(weights='IMAGENET1K_V1')
        self.visual_model.classifier = nn.Sequential(*list(self.visual_model.classifier.children())[:-1])  # Remove last layer
        self.text_model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(4096 + 768, 2)

    def forward(self, input_ids, attention_mask, images):
        visual_features = self.visual_model(images)
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        combined = torch.cat((visual_features, text_outputs), dim=1)
        combined = self.dropout(combined)
        logits = self.classifier(combined)
        return logits

# Initialize model, loss, and optimizer
model = VisualBERTMultimodalModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Early stopping parameters
patience = 2
best_loss = float('inf')
early_stop_counter = 0

# K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(skf.split(dataset, df['Label_Sentiment'])):
    print(f'Fold {fold + 1}')
    
    train_subsampler = Subset(dataset, train_idx)
    test_subsampler = Subset(dataset, test_idx)
    
    train_dataloader = DataLoader(train_subsampler, batch_size=16, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_subsampler, batch_size=16, shuffle=False, collate_fn=collate_fn)
    
    # Training loop with early stopping
    model.train()
    for epoch in range(10):  
        epoch_loss = 0.0
        for batch in train_dataloader:
            if batch is None:
                continue
            optimizer.zero_grad()
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], images=batch['image'])
            loss = criterion(outputs, batch['label'])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation step for early stopping
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_dataloader:
                if batch is None:
                    continue
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], images=batch['image'])
                loss = criterion(outputs, batch['label'])
                val_loss += loss.item()
        
        print(f'Epoch {epoch + 1} - Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')
        
        # Check early stopping condition
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0  
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f'Early stopping triggered at epoch {epoch + 1}')
                break  
    
    # Evaluation
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            if batch is None:
                continue
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], images=batch['image'])
            _, preds = torch.max(outputs, 1)
            all_labels.extend(batch['label'].numpy())
            all_preds.extend(preds.numpy())

    # Calculate metrics with zero_division parameter
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, zero_division=1)
    precision = precision_score(all_labels, all_preds, zero_division=1)
    recall = recall_score(all_labels, all_preds, zero_division=1)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    fold_results.append({
        'fold': fold + 1,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_matrix
    })
    print(f'Fold {fold + 1} - Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}')
    print(f'Confusion Matrix:\n{conf_matrix}')

# Average results across folds
avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
avg_f1 = np.mean([result['f1'] for result in fold_results])
avg_precision = np.mean([result['precision'] for result in fold_results])
avg_recall = np.mean([result['recall'] for result in fold_results])

print(f'Average Accuracy: {avg_accuracy}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')

Fold 1
Epoch 1 - Train Loss: 247.3033, Val Loss: 61.9655
Epoch 2 - Train Loss: 248.1031, Val Loss: 61.4468
Epoch 3 - Train Loss: 245.9011, Val Loss: 61.8740
Epoch 4 - Train Loss: 246.1299, Val Loss: 61.6801
Early stopping triggered at epoch 4
Fold 1 - Accuracy: 0.5042432814710043, F1: 0.5311036789297658, Precision: 0.5012626262626263, Recall: 0.5647226173541963
Confusion Matrix:
[[316 395]
 [306 397]]
Fold 2
Epoch 1 - Train Loss: 246.6512, Val Loss: 61.7636
Early stopping triggered at epoch 1
Fold 2 - Accuracy: 0.5074309978768577, F1: 0.6663470757430489, Precision: 0.5021676300578035, Recall: 0.99002849002849
Confusion Matrix:
[[ 22 689]
 [  7 695]]
Fold 3
Epoch 1 - Train Loss: 246.8698, Val Loss: 61.6137
Early stopping triggered at epoch 1
Fold 3 - Accuracy: 0.502476999292286, F1: 0.6650786088613625, Precision: 0.49964209019327127, Recall: 0.9943019943019943
Confusion Matrix:
[[ 12 699]
 [  4 698]]
Fold 4
Epoch 1 - Train Loss: 245.7830, Val Loss: 61.5667
Early stopping triggered at ep

In [5]:
torch.save(model.state_dict(), 'visualbert_model.pth')


In [7]:
import pandas as pd
import os
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from PIL import Image

# Load the test dataset
test_file_path = 'test_data.xlsx'
test_df = pd.read_excel(test_file_path)
test_df = test_df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
test_df['Label_Sentiment'] = test_df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class MemeDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
        except (FileNotFoundError, OSError):
            # Skip corrupted or missing images
            return None
        
        if self.transform:
            image = self.transform(image)
        
        text = self.dataframe.iloc[idx, 1]
        tokens = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'image': image,
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Define the transformation for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create the test dataset and DataLoader
test_dataset = MemeDataset(dataframe=test_df, img_dir='images/', transform=transform)

# Custom collate function to filter out None samples
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisualBERTMultimodalModel()
model.load_state_dict(torch.load('visualbert_model.pth'))  # Replace with the correct path to your trained model
model.to(device)
model.eval()

# Evaluate the model on the test dataset
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_dataloader:
        if batch is None:
            continue
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
        _, preds = torch.max(outputs, 1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')


  model.load_state_dict(torch.load('visualbert_model.pth'))  # Replace with the correct path to your trained model


Accuracy: 0.5100240577385726
F1 Score: 0.6140240050536956
Precision: 0.5164718384697131
Recall: 0.7570093457943925
Confusion Matrix:
[[150 455]
 [156 486]]
