<a href="https://colab.research.google.com/github/SaShukla090/450_DSA_Questions/blob/master/AudionTextFusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextStreamBlock(nn.Module):
    """One stage of the text stream: Linear -> BatchNorm1d -> ReLU.

    Args:
        input_dim: Size of the incoming feature vector.
        hidden_dim: Size of the feature vector produced by this stage.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Attribute names are kept as ``fc`` / ``bn`` so state_dict keys stay stable.
        self.fc = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.bn = nn.BatchNorm1d(num_features=hidden_dim)

    def forward(self, x):
        """Project ``x``, batch-normalise it and apply ReLU.

        ``x`` is presumably ``(batch, input_dim)`` -- confirm against the caller.
        """
        return F.relu(self.bn(self.fc(x)))

class AudioStreamBlock(nn.Module):
    """One stage of the audio stream.

    Applies a fully connected projection, then 1-D batch normalisation,
    then a ReLU non-linearity.

    Args:
        input_dim: Number of features entering the stage.
        hidden_dim: Number of features leaving the stage.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # ``fc`` and ``bn`` are the names that appear in the module's state_dict.
        self.fc = nn.Linear(input_dim, hidden_dim)
        self.bn = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        """Return ``relu(bn(fc(x)))``.

        ``x`` is presumably ``(batch, input_dim)`` -- confirm against the caller.
        """
        projected = self.fc(x)
        normalised = self.bn(projected)
        activated = torch.relu(normalised)
        return activated

class MultimodalNetwork(nn.Module):
    """Two-stream (text + audio) classifier with late feature fusion.

    Each modality passes through its own stack of stream blocks. The final
    features of both streams are concatenated along dim 1, projected by a
    linear fusion layer and mapped to class scores by a linear classifier.

    Args:
        text_input_dim: Feature size of the text input.
        audio_input_dim: Feature size of the audio input.
        num_classes: Number of output classes.
        hidden_dims: Output widths of the successive blocks in *each* stream.
            The default reproduces the original 128 -> 256 -> 512 -> 1024 stack.
        fusion_dim: Output width of the fusion layer (default 1024).

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, text_input_dim, audio_input_dim, num_classes, *,
                 hidden_dims=(128, 256, 512, 1024), fusion_dim=1024):
        super().__init__()

        hidden_dims = tuple(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        # Text stream: consecutive (in, out) pairs of the width chain.
        text_widths = (text_input_dim,) + hidden_dims
        self.text_blocks = nn.ModuleList([
            TextStreamBlock(d_in, d_out)
            for d_in, d_out in zip(text_widths[:-1], text_widths[1:])
        ])

        # Audio stream: same widths, different input size.
        audio_widths = (audio_input_dim,) + hidden_dims
        self.audio_blocks = nn.ModuleList([
            AudioStreamBlock(d_in, d_out)
            for d_in, d_out in zip(audio_widths[:-1], audio_widths[1:])
        ])

        # Fusion layer: its input is the concatenation of both streams' final
        # features, hence twice the last hidden width.
        self.fusion_layer = nn.Linear(2 * hidden_dims[-1], fusion_dim)

        # Classification layer
        self.classification_layer = nn.Linear(fusion_dim, num_classes)

    def forward(self, text_input, audio_input):
        """Compute class scores for a batch of paired text/audio features.

        Args:
            text_input: Text features; presumably ``(batch, text_input_dim)``
                -- confirm against the caller.
            audio_input: Audio features; presumably ``(batch, audio_input_dim)``
                -- confirm against the caller.

        Returns:
            The output of the classification layer (raw scores; no softmax is
            applied here).
        """
        # Forward pass for the text stream
        text_features = text_input
        for block in self.text_blocks:
            text_features = block(text_features)

        # Forward pass for the audio stream
        audio_features = audio_input
        for block in self.audio_blocks:
            audio_features = block(audio_features)

        # Concatenate text and audio features along the feature dimension
        combined_features = torch.cat((text_features, audio_features), dim=1)

        # NOTE: no non-linearity is applied between the fusion and the
        # classification layer, so the two compose into a single affine map.
        fused_features = self.fusion_layer(combined_features)

        # Classification layer
        output = self.classification_layer(fused_features)

        return output

# Example usage: 300-d text features, 128-d audio features, 10 target classes.
text_input_dim, audio_input_dim, num_classes = 300, 128, 10

model = MultimodalNetwork(
    text_input_dim=text_input_dim,
    audio_input_dim=audio_input_dim,
    num_classes=num_classes,
)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Assuming you have your own data loading code for text and audio data
# Make sure to preprocess your data, including tokenization for text and feature extraction for audio

# Dimensions are defined once and shared by the synthetic data and the model,
# so the two cannot drift apart.
text_input_dim = 300  # Example text input dimension
audio_input_dim = 128  # Example audio input dimension
num_classes = 10  # Number of classes for classification
num_samples = 100  # Number of synthetic training examples

# Example data (replace with your own data loading logic)
text_data = torch.randn(num_samples, text_input_dim)  # Replace with your text data
audio_data = torch.randn(num_samples, audio_input_dim)  # Replace with your audio data
labels = torch.randint(0, num_classes, (num_samples,))  # Replace with your labels

# Create DataLoader for training data
dataset = TensorDataset(text_data, audio_data, labels)
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model
model = MultimodalNetwork(text_input_dim, audio_input_dim, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # The per-batch targets get their own name so the full ``labels`` tensor
    # above is not rebound to the last mini-batch.
    for text_inputs, audio_inputs, batch_labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(text_inputs, audio_inputs)

        # Compute loss (mean over the samples of this batch)
        loss = criterion(outputs, batch_labels)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Weight by batch size: the last batch may be smaller than the others,
        # so a plain mean of batch means would over-weight its samples.
        running_loss += loss.item() * batch_labels.size(0)

    # Print average per-sample loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(dataset)}")

print("Training finished!")

# You can now use the trained model for inference or evaluation.


Epoch 1/10, Loss: 3.4539296627044678
Epoch 2/10, Loss: 2.0454811453819275
Epoch 3/10, Loss: 1.6332950592041016
Epoch 4/10, Loss: 0.19038929278030992
Epoch 5/10, Loss: 1.1737539768218994
Epoch 6/10, Loss: 0.6450975574553013
Epoch 7/10, Loss: 0.8802142746280879
Epoch 8/10, Loss: 0.09826144529506564
Epoch 9/10, Loss: 0.332902230322361
Epoch 10/10, Loss: 0.4054521534126252
Training finished!
