In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import f1_score

In [2]:
class EMOTICDataset(Dataset):
    def __init__(self, csv_path, npy_folder, transform=None):
        self.annotations = pd.read_csv(csv_path)
        self.npy_folder = npy_folder
        self.transform = transform
        self.categories = [
            'Peace', 'Affection', 'Esteem', 'Anticipation', 'Engagement', 'Confidence', 
            'Happiness', 'Pleasure', 'Excitement', 'Surprise', 'Sympathy', 'Doubt/Confusion',
            'Disconnection', 'Fatigue', 'Embarrassment', 'Yearning', 'Disapproval', 'Aversion',
            'Annoyance', 'Anger', 'Sensitivity', 'Sadness', 'Disquietment', 'Fear', 'Pain', 'Suffering'
        ]

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        row = self.annotations.iloc[idx]
        img_array = np.load(f"{self.npy_folder}/{row['Arr_name']}")
        img_tensor = torch.tensor(img_array, dtype=torch.float32).permute(2, 0, 1)
        
        if self.transform:
            img_tensor = self.transform(img_tensor)

        labels = torch.tensor(row[self.categories].values.astype(float), dtype=torch.float32)
        text = row['Filename']

        return {
            'image': img_tensor,
            'text': text,
            'labels': labels
        }


In [3]:
class MultiModalNet(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.img_encoder = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
        self.text_encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.text_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        
        self.fusion = nn.Sequential(
            nn.Linear(2048 + 768, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, img_input, text_input):
        img_features = self.img_encoder(img_input).squeeze()
        text_features = self.text_encoder(**text_input).last_hidden_state.mean(dim=1)
        combined = torch.cat((img_features, text_features), dim=1)
        return self.fusion(combined)

In [4]:
def train_model(model, train_loader, val_loader, num_epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            optimizer.zero_grad()
            images = batch['image'].to(device)
            text_inputs = model.text_tokenizer(
                batch['text'], 
                padding=True, 
                truncation=True, 
                return_tensors='pt'
            ).to(device)
            
            outputs = model(images, text_inputs)
            loss = criterion(outputs, batch['labels'].to(device))
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}')
        
        if val_loader:
            validate_model(model, val_loader)

def validate_model(model, val_loader):
    model.eval()
    device = next(model.parameters()).device
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            images = batch['image'].to(device)
            text_inputs = model.text_tokenizer(
                batch['text'], 
                padding=True, 
                truncation=True, 
                return_tensors='pt'
            ).to(device)
            
            outputs = torch.sigmoid(model(images, text_inputs))
            preds = (outputs > 0.5).float().cpu()
            labels = batch['labels'].cpu()
            
            all_preds.append(preds)
            all_labels.append(labels)
    
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    accuracy = (all_preds == all_labels).float().mean().item()
    f1 = f1_score(all_labels.numpy(), all_preds.numpy(), average='samples')
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")


In [7]:
# Main execution
if __name__ == "__main__":
    image_transform = transforms.Compose([
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = NPYMultiModalDataset(
        npy_folder='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/img_arrs',
        mat_path='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/annots_arrs/annot_arrs_train.csv',
        split='train',
        transform=image_transform
    )

    val_dataset = NPYMultiModalDataset(
        npy_folder='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/img_arrs',
        mat_path='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/annots_arrs/annot_arrs_train.csv',
        split='val',
        transform=image_transform
    )

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2)

    model = MultiModalNet(num_classes=26)
    train_model(model, train_loader, val_loader, num_epochs=10)

ValueError: Unknown mat file type, version 115, 115

In [None]:






if __name__ == "__main__":
    image_transform = transforms.Compose([
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = EMOTICDataset(
        csv_path='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/annots_arrs/annot_arrs_train.csv',
        npy_folder='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/img_arrs',
        transform=image_transform
    )

    val_dataset = EMOTICDataset(
        csv_path='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/annots_arrs/annot_arrs_val.csv',
        npy_folder='D:/Ruturaj/New folder (3)/Ruturaj/Smart Systems/MANAV Experiment/img_arrs',
        transform=image_transform
    )

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

    model = MultiModalNet(num_classes=26)
    train_model(model, train_loader, val_loader, num_epochs=10)


