In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import ViltProcessor, ViltModel  # ViLT import
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import numpy as np
from PIL import Image

# Load the Excel file and check for NaN values
file_path = 'train_data.xlsx'
df = pd.read_excel(file_path)
df = df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
df['Label_Sentiment'] = df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class MemeDataset(Dataset):
    def __init__(self, dataframe, img_dir, max_length=40):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.max_length = max_length
        self.processor = ViltProcessor.from_pretrained('dandelin/vilt-b32-mlm')  # ViLT processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])

        try:
            image = Image.open(img_name).convert("RGB")
            image = image.resize((224, 224))  # Resize image to 224x224
        except FileNotFoundError:
            return None

        text = self.dataframe.iloc[idx, 1]
        encoding = self.processor(images=image, text=text, return_tensors="pt", padding="max_length", 
                                  truncation=True, max_length=self.max_length)

        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)

        sample = {
            'pixel_values': encoding['pixel_values'].squeeze(),
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Define the dataset
dataset = MemeDataset(dataframe=df, img_dir='images/')

# Custom collate function to filter out None samples
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

# Define the ViLT-based Multimodal Model
class VILTHybridModel(nn.Module):
    def __init__(self):
        super(VILTHybridModel, self).__init__()
        self.vilt = ViltModel.from_pretrained('dandelin/vilt-b32-mlm')  # Load ViLT
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(self.vilt.config.hidden_size, 2)  # Adjusted for 2-class classification

    def forward(self, input_ids, attention_mask, pixel_values):
        outputs = self.vilt(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        pooled_output = outputs.pooler_output  # Get the pooled output
        logits = self.classifier(self.dropout(pooled_output))
        return logits

# Initialize model, loss, and optimizer
model = VILTHybridModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Early stopping parameters
patience = 2
best_loss = float('inf')
early_stop_counter = 0

# K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(skf.split(dataset, df['Label_Sentiment'])):
    print(f'Fold {fold + 1}')
    
    train_subsampler = Subset(dataset, train_idx)
    test_subsampler = Subset(dataset, test_idx)
    
    train_dataloader = DataLoader(train_subsampler, batch_size=16, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_subsampler, batch_size=16, shuffle=False, collate_fn=collate_fn)
    
    # Training loop with early stopping
    model.train()
    for epoch in range(10):  
        epoch_loss = 0.0
        for batch in train_dataloader:
            if batch is None:
                continue
            optimizer.zero_grad()
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], pixel_values=batch['pixel_values'])
            loss = criterion(outputs, batch['label'])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation step for early stopping
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_dataloader:
                if batch is None:
                    continue
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], pixel_values=batch['pixel_values'])
                loss = criterion(outputs, batch['label'])
                val_loss += loss.item()
        
        print(f'Epoch {epoch + 1} - Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')
        
        # Check early stopping condition
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0  
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f'Early stopping triggered at epoch {epoch + 1}')
                break  
    
    # Evaluation
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            if batch is None:
                continue
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], pixel_values=batch['pixel_values'])
            _, preds = torch.max(outputs, 1)
            all_labels.extend(batch['label'].numpy())
            all_preds.extend(preds.numpy())

    # Calculate metrics with zero_division parameter
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, zero_division=1)
    precision = precision_score(all_labels, all_preds, zero_division=1)
    recall = recall_score(all_labels, all_preds, zero_division=1)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    fold_results.append({
        'fold': fold + 1,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_matrix
    })
    print(f'Fold {fold + 1} - Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}')
    print(f'Confusion Matrix:\n{conf_matrix}')

# Average results across folds
avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
avg_f1 = np.mean([result['f1'] for result in fold_results])
avg_precision = np.mean([result['precision'] for result in fold_results])
avg_recall = np.mean([result['recall'] for result in fold_results])

print(f'Average Accuracy: {avg_accuracy}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')

# Save the trained model weights
torch.save(model.state_dict(), 'vilt_model.pth')
print("Model weights saved to 'vilt_model.pth'")


  from .autonotebook import tqdm as notebook_tqdm
2024-11-15 11:05:16.767687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-15 11:05:16.851595: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 11:05:16.854958: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-15 11:05:16.854969: I tensorflow/stream_executor/cuda/cudart_stub.c

Fold 1
Epoch 1 - Train Loss: 210.0282, Val Loss: 33.2878
Epoch 2 - Train Loss: 49.8789, Val Loss: 22.9893
Epoch 3 - Train Loss: 17.5621, Val Loss: 15.9480
Epoch 4 - Train Loss: 9.0595, Val Loss: 14.6643
Epoch 5 - Train Loss: 7.4195, Val Loss: 26.6437
Epoch 6 - Train Loss: 8.0047, Val Loss: 15.3654
Early stopping triggered at epoch 6
Fold 1 - Accuracy: 0.9653465346534653, F1: 0.9649749821300929, Precision: 0.9698275862068966, Recall: 0.9601706970128022
Confusion Matrix:
[[690  21]
 [ 28 675]]
Fold 2
Epoch 1 - Train Loss: 21.0239, Val Loss: 1.5489
Epoch 2 - Train Loss: 4.0703, Val Loss: 1.0181
Epoch 3 - Train Loss: 2.4121, Val Loss: 0.9140
Epoch 4 - Train Loss: 1.2019, Val Loss: 0.9317
Epoch 5 - Train Loss: 1.1836, Val Loss: 1.2029
Early stopping triggered at epoch 5
Fold 2 - Accuracy: 0.9978768577494692, F1: 0.9978586723768736, Precision: 1.0, Recall: 0.9957264957264957
Confusion Matrix:
[[711   0]
 [  3 699]]
Fold 3
Epoch 1 - Train Loss: 8.4650, Val Loss: 3.8405
Early stopping triggere

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from PIL import Image
import os
from transformers import ViltProcessor

# Load the test data
test_df = pd.read_excel('test_data.xlsx')
test_df = test_df.dropna(subset=['Label_Sentiment'])
test_df['Label_Sentiment'] = test_df['Label_Sentiment'].astype(int)

# Updated MemeDataset class for testing
class MemeDataset(Dataset):
    def __init__(self, dataframe, img_dir, max_length=40):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.max_length = max_length
        self.processor = ViltProcessor.from_pretrained('dandelin/vilt-b32-mlm')  # Use the same processor as training

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])

        try:
            image = Image.open(img_name).convert("RGB")
            image = image.resize((224, 224))  # Ensure image size matches training
        except (FileNotFoundError, OSError):
            return None  # Skip this sample if the image is not found or is corrupted

        text = self.dataframe.iloc[idx, 1]
        encoding = self.processor(images=image, text=text, return_tensors="pt", padding="max_length", 
                                  truncation=True, max_length=self.max_length)

        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)

        sample = {
            'pixel_values': encoding['pixel_values'].squeeze(),
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Custom collate function to filter out None samples
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

# Define the test dataset and DataLoader
test_dataset = MemeDataset(dataframe=test_df, img_dir='images/')
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Load the trained model
model = VILTHybridModel()  # Ensure the model definition matches the training
model.load_state_dict(torch.load('vilt_model.pth'))  # Load the saved model weights
model.eval()

# Evaluate the model on the test set
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_dataloader:
        if batch is None:
            continue
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            pixel_values=batch['pixel_values']
        )
        _, preds = torch.max(outputs, 1)
        all_labels.extend(batch['label'].numpy())
        all_preds.extend(preds.numpy())

# Calculate evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

# Print results
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test F1 Score: {f1:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')


  model.load_state_dict(torch.load('vilt_model.pth'))  # Load the saved model weights


Test Accuracy: 0.9679
Test F1 Score: 0.9688
Test Precision: 0.9688
Test Recall: 0.9688
Confusion Matrix:
[[585  20]
 [ 20 622]]
