In [1]:
pip install torchvision

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install PIL

[31mERROR: Could not find a version that satisfies the requirement PIL (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for PIL[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
import torch.nn as nn

class AudioLSTM(nn.Module):
    def __init__(self):
        super(AudioLSTM, self).__init__()
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=256, batch_first=True, dropout=0.2, num_layers=1)
        self.lstm2 = nn.LSTM(input_size=256, hidden_size=128, batch_first=True, dropout=0.2, num_layers=1)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(128, 64)

    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x[:, -1, :]  # Get last time step
        x = self.dropout(torch.relu(self.fc1(x)))
        return x  # feature vector


In [16]:
class ImageResNet(nn.Module):
    def __init__(self):
        super(ImageResNet, self).__init__()
        base_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.feature_extractor = nn.Sequential(*list(base_model.children())[:-1])  # Remove FC layer

    def forward(self, x):
        x = self.feature_extractor(x)
        return x.view(x.size(0), -1)  # Flatten (B, 2048)


In [3]:
import pandas as pd
from torchvision import transforms
from PIL import Image
import torch

# Load FER2013
fer_df = pd.read_csv('/Users/prithvipandey/Documents/dataset/fer2013.csv')

# Filter only 'Training' data
train_fer = fer_df[fer_df['Usage'] == 'Training']

# Transform
image_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((48, 48)),
    transforms.ToTensor()
])

# Convert pixel strings to image tensors
fer_images = []
fer_labels = []

for idx, row in train_fer.iterrows():
    pixels = list(map(int, row['pixels'].split()))
    img = Image.fromarray(
        torch.tensor(pixels).reshape(48, 48).numpy().astype('uint8')
    )
    img_tensor = image_transform(img)
    fer_images.append(img_tensor)
    fer_labels.append(int(row['emotion']))


In [4]:
import os
import librosa
import pandas as pd
import numpy as np

# Path to TESS dataset folder
tess_path = '/Users/prithvipandey/Downloads/archive/TESS Toronto emotional speech set data'

# Emotions to keep (mapping for FER2013 if needed)
emotion_map = {
    'angry': 'angry',
    'disgust': 'disgust',
    'fear': 'fear',
    'happy': 'happy',
    'neutral': 'neutral',
    'ps': 'surprise',  # Sometimes "pleasant surprise" is written as "ps"
    'sad': 'sad'
}

data = []

for root, dirs, files in os.walk(tess_path):
    for file in files:
        if file.endswith(".wav"):
            emotion_label = None
            for key in emotion_map:
                if key in file.lower():
                    emotion_label = emotion_map[key]
                    break
            if emotion_label is None:
                continue

            file_path = os.path.join(root, file)

            # Load audio
            try:
                y, sr = librosa.load(file_path, sr=None)
                # Extract MFCCs
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
                mfcc_mean = np.mean(mfcc.T, axis=0)  # Average across time axis
                row = [file_path, emotion_label] + mfcc_mean.tolist()
                data.append(row)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

# Create DataFrame
columns = ['filename', 'emotion'] + [f'mfcc_{i}' for i in range(40)]
df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv('tess_features.csv', index=False)

print("Saved TESS MFCC features to tess_features.csv")


Saved TESS MFCC features to tess_features.csv


In [6]:
import numpy as np

# Load audio features from CSV
tess_df = pd.read_csv('tess_features.csv')

# Map emotion strings to same label format as FER2013 (0–6 or 0–7)
emotion_map = {
    'angry': 0,
    'disgust': 1,
    'fear': 2,
    'happy': 3,
    'sad': 4,
    'surprise': 5,
    'neutral': 6
}
# Filter only matching emotions
tess_df = tess_df[tess_df['emotion'].isin(emotion_map.keys())]

audio_features = []
audio_labels = []

for _, row in tess_df.iterrows():
    # Select only numeric columns for features
    features = row.drop(['filename', 'emotion']).values.astype(np.float32)
    tensor = torch.tensor(features)
    audio_features.append(tensor)
    audio_labels.append(emotion_map[row['emotion']])


In [7]:
from torch.utils.data import Dataset

class FusionDataset(Dataset):
    def __init__(self, image_data, audio_data, labels):
        self.image_data = image_data
        self.audio_data = audio_data
        self.labels = labels

    def __len__(self):
        return min(len(self.image_data), len(self.audio_data), len(self.labels))

    def __getitem__(self, idx):
        return self.image_data[idx], self.audio_data[idx], self.labels[idx]


In [8]:
# Combine only matching label data
min_len = min(len(fer_labels), len(audio_labels))
fusion_images = fer_images[:min_len]
fusion_audio = audio_features[:min_len]
fusion_labels = fer_labels[:min_len]  # assuming same mapping as audio

# Create dataset
fusion_dataset = FusionDataset(fusion_images, fusion_audio, fusion_labels)


In [9]:
from torch.utils.data import DataLoader

train_loader = DataLoader(fusion_dataset, batch_size=32, shuffle=True)


In [12]:
import torch.nn as nn


In [17]:
class MultimodalFusion(nn.Module):
    def __init__(self):
        super(MultimodalFusion, self).__init__()
        self.image_model = ImageResNet()
        self.audio_model = AudioLSTM()

        self.fusion_fc = nn.Sequential(
            nn.Linear(2048 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 7)  # Or 8 depending on your target class setup
        )

    def forward(self, image_input, audio_input):
        img_feat = self.image_model(image_input)
        aud_feat = self.audio_model(audio_input)
        combined = torch.cat((img_feat, aud_feat), dim=1)
        output = self.fusion_fc(combined)
        return output


In [18]:
def train_fusion_model(model, dataloader, optimizer, loss_fn, device, num_epochs=10):
    model.train()
    train_losses = []
    train_accuracies = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct, total = 0, 0

        for image_inputs, audio_inputs, labels in dataloader:
            image_inputs = image_inputs.to(device)
            audio_inputs = audio_inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(image_inputs, audio_inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        epoch_loss = running_loss / len(dataloader)
        epoch_acc = 100 * correct / total
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_acc)

        print(f"Epoch {epoch+1}/{num_epochs}: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.2f}%")

    return train_losses, train_accuracies


In [19]:
model = MultimodalFusion().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

# Train
train_losses, train_accuracies = train_fusion_model(
    model, train_loader, optimizer, loss_fn, device, num_epochs=10
)


NameError: name 'models' is not defined

In [20]:
import torch
import torch.nn as nn
import torchvision.models as models

class ImageResNet(nn.Module):
    def __init__(self):
        super(ImageResNet, self).__init__()
        base_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.feature_extractor = nn.Sequential(*list(base_model.children())[:-1])

    def forward(self, x):
        x = self.feature_extractor(x)
        x = torch.flatten(x, 1)
        return x

class AudioLSTM(nn.Module):
  #...your audio LSTM code...
    def __init__(self):
        super(AudioLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=40, hidden_size=32, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(32 * 2, 64)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the last time step output
        out = self.fc(lstm_out)
        return out

class MultimodalFusion(nn.Module):
    def __init__(self):
        super(MultimodalFusion, self).__init__()
        self.image_model = ImageResNet()
        self.audio_model = AudioLSTM()

        self.fusion_fc = nn.Sequential(
            nn.Linear(2048 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 7)  # Or 8 depending on your target class setup
        )

    def forward(self, image_input, audio_input):
        image_features = self.image_model(image_input)
        audio_features = self.audio_model(audio_input)
        combined_features = torch.cat((image_features, audio_features), dim=1)
        output = self.fusion_fc(combined_features)
        return output

# Example usage (assuming you have 'device' defined)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalFusion().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

#Example input.
image_input = torch.randn(1,3,224,224).to(device)
audio_input = torch.randn(1,10,40).to(device)

output = model(image_input,audio_input)
print(output.shape)

torch.Size([1, 7])


In [21]:
import torch
import torch.nn as nn
import torchvision.models as models

# Determine the device (MPS if available, otherwise CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

class ImageResNet(nn.Module):
    def __init__(self):
        super(ImageResNet, self).__init__()
        base_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.feature_extractor = nn.Sequential(*list(base_model.children())[:-1])

    def forward(self, x):
        x = self.feature_extractor(x)
        x = torch.flatten(x, 1)
        return x

class AudioLSTM(nn.Module):
    def __init__(self):
        super(AudioLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=40, hidden_size=32, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(32 * 2, 64)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the last time step output
        out = self.fc(lstm_out)
        return out

class MultimodalFusion(nn.Module):
    def __init__(self):
        super(MultimodalFusion, self).__init__()
        self.image_model = ImageResNet()
        self.audio_model = AudioLSTM()

        self.fusion_fc = nn.Sequential(
            nn.Linear(2048 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 7)  # Or 8 depending on your target class setup
        )

    def forward(self, image_input, audio_input):
        image_features = self.image_model(image_input)
        audio_features = self.audio_model(audio_input)
        combined_features = torch.cat((image_features, audio_features), dim=1)
        output = self.fusion_fc(combined_features)
        return output

# Instantiate the model and move it to the MPS device
model = MultimodalFusion().to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

# Example input (move inputs to the MPS device)
image_input = torch.randn(1, 3, 224, 224).to(device)
audio_input = torch.randn(1, 10, 40).to(device)

# Forward pass
output = model(image_input, audio_input)
print(output.shape)

# Example of a loss calculation.
target = torch.randint(0, 7, (1,)).to(device) # Example target
loss = loss_fn(output, target)
print(f"Example loss: {loss}")

Using device: mps
torch.Size([1, 7])
Example loss: 2.013303756713867


In [23]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# Use MPS on Mac if available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load FER2013 CSV (Image Data)
fer_df = pd.read_csv("/Users/prithvipandey/Documents/dataset/fer2013.csv")
fer_df = fer_df[fer_df['Usage'] == 'Training']
fer_images = fer_df['pixels'].apply(lambda x: np.fromstring(x, sep=' ').reshape(48, 48).astype(np.uint8))
fer_labels = fer_df['emotion'].values

# Resize & preprocess images for ResNet
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])

fer_image_tensors = torch.stack([image_transform(img) for img in fer_images])
fer_labels = torch.tensor(fer_labels)

# Load TESS CSV (Audio Features)
tess_df = pd.read_csv("tess_features.csv")  # CSV with MFCC or other audio features
tess_features = tess_df.drop("label", axis=1).values.reshape(-1, 10, 40).astype(np.float32)
tess_labels = tess_df['label'].values

# Encode TESS labels to numeric to match FER2013 (map emotions to same index!)
emotion_map = {'neutral': 0, 'happy': 1, 'sad': 2, 'angry': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}
tess_labels = np.array([emotion_map[label] for label in tess_labels])

# Ensure same length by trimming or balancing
min_len = min(len(fer_labels), len(tess_labels))
fer_image_tensors = fer_image_tensors[:min_len]
fer_labels = fer_labels[:min_len]
tess_features = tess_features[:min_len]
tess_labels = tess_labels[:min_len]

# Final labels must match for multimodal fusion
assert all(fer_labels.numpy() == tess_labels)

# Train/Val Split
X_img_train, X_img_val, X_aud_train, X_aud_val, y_train, y_val = train_test_split(
    fer_image_tensors, tess_features, fer_labels, test_size=0.2, random_state=42
)

# Torch Dataset
class MultimodalDataset(Dataset):
    def __init__(self, image_data, audio_data, labels):
        self.image_data = image_data
        self.audio_data = torch.tensor(audio_data)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.image_data[idx], self.audio_data[idx], self.labels[idx]

train_dataset = MultimodalDataset(X_img_train, X_aud_train, y_train)
val_dataset = MultimodalDataset(X_img_val, X_aud_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Image model: ResNet-50
class ImageResNet(nn.Module):
    def __init__(self):
        super(ImageResNet, self).__init__()
        base_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.feature_extractor = nn.Sequential(*list(base_model.children())[:-1])

    def forward(self, x):
        x = self.feature_extractor(x)
        x = torch.flatten(x, 1)
        return x

# Audio model: BiLSTM
class AudioLSTM(nn.Module):
    def __init__(self):
        super(AudioLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=40, hidden_size=32, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(64, 64)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])
        return out

# Fusion Model
class MultimodalFusion(nn.Module):
    def __init__(self):
        super(MultimodalFusion, self).__init__()
        self.image_model = ImageResNet()
        self.audio_model = AudioLSTM()
        self.fusion_fc = nn.Sequential(
            nn.Linear(2048 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 7)
        )

    def forward(self, image_input, audio_input):
        img_feat = self.image_model(image_input)
        aud_feat = self.audio_model(audio_input)
        combined = torch.cat((img_feat, aud_feat), dim=1)
        return self.fusion_fc(combined)

# Training setup
model = MultimodalFusion().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    train_loss, preds, targets = 0, [], []
    for img, aud, label in train_loader:
        img, aud, label = img.to(device), aud.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(img, aud)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds.extend(torch.argmax(output, dim=1).cpu().numpy())
        targets.extend(label.cpu().numpy())

    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, average='weighted')
    print(f"Epoch {epoch+1} | Loss: {train_loss/len(train_loader):.4f} | Accuracy: {acc:.4f} | F1: {f1:.4f}")

    # Validation
    model.eval()
    val_preds, val_targets = [], []
    with torch.no_grad():
        for img, aud, label in val_loader:
            img, aud, label = img.to(device), aud.to(device), label.to(device)
            output = model(img, aud)
            val_preds.extend(torch.argmax(output, dim=1).cpu().numpy())
            val_targets.extend(label.cpu().numpy())
    val_acc = accuracy_score(val_targets, val_preds)
    val_f1 = f1_score(val_targets, val_preds, average='weighted')
    print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")


Using device: mps


: 

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# Use MPS on Mac if available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [2]:
# Load FER2013 CSV (Image Data)
fer_df = pd.read_csv("/Users/prithvipandey/Documents/dataset/fer2013.csv")
fer_df = fer_df[fer_df['Usage'] == 'Training']
fer_images = fer_df['pixels'].apply(lambda x: np.fromstring(x, sep=' ').reshape(48, 48).astype(np.uint8))
fer_labels = fer_df['emotion'].values

# Resize & preprocess images for ResNet
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])

fer_image_tensors = torch.stack([image_transform(img) for img in fer_images])
fer_labels = torch.tensor(fer_labels)

: 