In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import numpy as np
from PIL import Image

# Load the Excel file and check for NaN values
file_path = 'multi-sent-p.xlsx'
df = pd.read_excel(file_path)
df = df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
df['Label_Sentiment'] = df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class ImageOnlyDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
        except FileNotFoundError:
            return None
        
        if self.transform:
            image = self.transform(image)
        
        # Use existing label as it is already numeric (0 = negative, 1 = positive)
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'image': image,
            'label': label
        }
        return sample

# Transformations for the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Define the dataset
dataset = ImageOnlyDataset(dataframe=df, img_dir='Memes/', transform=transform)

# Custom collate function to filter out None samples
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

class VGG16Model(nn.Module):
    def __init__(self):
        super(VGG16Model, self).__init__()
        # Load a pre-trained VGG-16 model
        self.vision_model = models.vgg16(weights='IMAGENET1K_V1')
        
        # Modify the classifier to fit 2 classes (binary classification)
        num_features = self.vision_model.classifier[6].in_features
        self.vision_model.classifier[6] = nn.Linear(num_features, 2)

    def forward(self, images):
        return self.vision_model(images)

# Initialize model, loss, and optimizer
model = VGG16Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Early stopping parameters
patience = 2
best_loss = float('inf')
early_stop_counter = 0

# K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(skf.split(dataset, df['Label_Sentiment'])):
    print(f'Fold {fold + 1}')
    
    train_subsampler = Subset(dataset, train_idx)
    test_subsampler = Subset(dataset, test_idx)
    
    train_dataloader = DataLoader(train_subsampler, batch_size=16, shuffle=True, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_subsampler, batch_size=16, shuffle=False, collate_fn=collate_fn)
    
    # Training loop with early stopping
    model.train()
    for epoch in range(10):  
        epoch_loss = 0.0
        for batch in train_dataloader:
            if batch is None:
                continue
            optimizer.zero_grad()
            outputs = model(images=batch['image'])
            loss = criterion(outputs, batch['label'])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation step for early stopping
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_dataloader:
                if batch is None:
                    continue
                outputs = model(images=batch['image'])
                loss = criterion(outputs, batch['label'])
                val_loss += loss.item()
        
        print(f'Epoch {epoch + 1} - Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')
        
        # Check early stopping condition
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0  
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f'Early stopping triggered at epoch {epoch + 1}')
                break  
    
    # Evaluation
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            if batch is None:
                continue
            outputs = model(images=batch['image'])
            _, preds = torch.max(outputs, 1)
            all_labels.extend(batch['label'].numpy())
            all_preds.extend(preds.numpy())

    # Calculate metrics with zero_division parameter
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, zero_division=1)
    precision = precision_score(all_labels, all_preds, zero_division=1)
    recall = recall_score(all_labels, all_preds, zero_division=1)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    fold_results.append({
        'fold': fold + 1,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_matrix
    })
    print(f'Fold {fold + 1} - Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}')
    print(f'Confusion Matrix:\n{conf_matrix}')

# Average results across foldsimport pandas as pd
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import os

# Load new data
new_data_path = 'new_data.xlsx'
new_df = pd.read_excel(new_data_path)
new_df = new_df.dropna(subset=['Label_Sentiment'])
new_df['Label_Sentiment'] = new_df['Label_Sentiment'].astype(int)

# Define the dataset for new data
class NewImageDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
        except FileNotFoundError:
            return None
        
        if self.transform:
            image = self.transform(image)
        
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        return {'image': image, 'label': label}

# Transformations for the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the dataset
new_dataset = NewImageDataset(dataframe=new_df, img_dir='images/', transform=transform)

# Define DataLoader for new data
new_dataloader = DataLoader(new_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Testing the model
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for batch in new_dataloader:
        if batch is None:
            continue
        outputs = model(images=batch['image'])
        _, preds = torch.max(outputs, 1)
        all_labels.extend(batch['label'].numpy())
        all_preds.extend(preds.numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

print(f'Test Accuracy: {accuracy}')
print(f'Test F1 Score: {f1}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')

avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
avg_f1 = np.mean([result['f1'] for result in fold_results])
avg_precision = np.mean([result['precision'] for result in fold_results])
avg_recall = np.mean([result['recall'] for result in fold_results])

print(f'Average Accuracy: {avg_accuracy}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')




Fold 1
Epoch 1 - Train Loss: 149.4269, Val Loss: 35.6813
Epoch 2 - Train Loss: 136.1621, Val Loss: 34.5640
Epoch 3 - Train Loss: 119.9639, Val Loss: 34.8257
Epoch 4 - Train Loss: 104.2669, Val Loss: 34.1229
Epoch 5 - Train Loss: 81.2899, Val Loss: 47.0427
Epoch 6 - Train Loss: 54.3221, Val Loss: 47.6713
Early stopping triggered at epoch 6
Fold 1 - Accuracy: 0.7489711934156379, F1: 0.7365010799136069, Precision: 0.7785388127853882, Recall: 0.6987704918032787
Confusion Matrix:
[[387  97]
 [147 341]]
Fold 2
Epoch 1 - Train Loss: 73.3339, Val Loss: 11.5404
Epoch 2 - Train Loss: 32.9538, Val Loss: 9.2521
Epoch 3 - Train Loss: 28.2416, Val Loss: 12.3118
Epoch 4 - Train Loss: 20.1035, Val Loss: 19.9979
Early stopping triggered at epoch 4
Fold 2 - Accuracy: 0.86302780638517, F1: 0.8767377201112141, Precision: 0.8003384094754653, Recall: 0.9692622950819673
Confusion Matrix:
[[365 118]
 [ 15 473]]
Fold 3
Epoch 1 - Train Loss: 33.9958, Val Loss: 6.5002
Epoch 2 - Train Loss: 13.0518, Val Loss: 4.1

In [8]:
import pandas as pd
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Load the test dataset
test_file_path = 'test_data.xlsx'
test_df = pd.read_excel(test_file_path)
test_df = test_df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
test_df['Label_Sentiment'] = test_df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class TestImageDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
        except FileNotFoundError:
            return None
        
        if self.transform:
            image = self.transform(image)
        
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'image': image,
            'label': label
        }
        return sample

# Define transformations for the images
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create the test dataset and data loader
test_dataset = TestImageDataset(dataframe=test_df, img_dir='Memes-test/', transform=test_transform)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Evaluate the model on the test data
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_dataloader:
        if batch is None:
            continue
        outputs = model(images=batch['image'])
        _, preds = torch.max(outputs, 1)
        all_labels.extend(batch['label'].numpy())
        all_preds.extend(preds.numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

# Print results
print(f'Test Accuracy: {accuracy}')
print(f'Test F1 Score: {f1}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')


Test Accuracy: 0.7555555555555555
Test F1 Score: 0.7528089887640449
Test Precision: 0.730909090909091
Test Recall: 0.7760617760617761
Confusion Matrix:
[[207  74]
 [ 58 201]]


In [7]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import os

# Load new data
new_data_path = 'new_data_numeric.xlsx'
new_df = pd.read_excel(new_data_path)
new_df = new_df.dropna(subset=['Label_Sentiment'])
new_df['Label_Sentiment'] = new_df['Label_Sentiment'].astype(int)

# Define the dataset for new data with improved error handling
class NewImageDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])
        
        try:
            image = Image.open(img_name).convert("RGB")
            if self.transform:
                image = self.transform(image)
            label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
            return {'image': image, 'label': label}
        
        except (FileNotFoundError, OSError):
            # Skip this file if it cannot be opened
            print(f"Warning: Unable to load image {img_name}. Skipping.")
            return None  # Return None if image loading fails

# Update the DataLoader to handle skipped images
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return torch.utils.data.dataloader.default_collate(batch)

# Define DataLoader for new data with collate_fn
new_dataloader = DataLoader(new_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


# Transformations for the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the dataset
new_dataset = NewImageDataset(dataframe=new_df, img_dir='images/', transform=transform)

# Define DataLoader for new data
new_dataloader = DataLoader(new_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Testing the model
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for batch in new_dataloader:
        if batch is None:
            continue
        outputs = model(images=batch['image'])
        _, preds = torch.max(outputs, 1)
        all_labels.extend(batch['label'].numpy())
        all_preds.extend(preds.numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

print(f'Test Accuracy: {accuracy}')
print(f'Test F1 Score: {f1}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')


Test Accuracy: 0.49960348929421095
Test F1 Score: 0.5089494163424124
Test Precision: 0.49923664122137407
Test Recall: 0.5190476190476191
Confusion Matrix:
[[303 328]
 [303 327]]


In [4]:
import pandas as pd

# Load the dataset
file_path = 'new_data.xlsx'  # Replace with the path to your file
data = pd.read_excel(file_path)

# Convert the 'Label_Sentiment' column to numeric
# Assuming 'positive' as 1 and 'negative' as 0
data['Label_Sentiment'] = data['Label_Sentiment'].apply(lambda x: 1 if x.strip().lower() == 'positive' else 0)

# Save the updated data back to an Excel file
data.to_excel('new_data_numeric.xlsx', index=False)

print("Converted dataset saved as 'new_data_numeric.xlsx'")


Converted dataset saved as 'new_data_numeric.xlsx'
