In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
from PIL import Image
from torch.utils.data import DataLoader,Dataset,random_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from os import listdir
from sklearn.model_selection import train_test_split
import pandas as pd
from os.path import isfile, join

In [2]:
import warnings
warnings.filterwarnings("ignore")
t.cuda.set_device(7)

In [3]:
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [4]:
audio_dir = "/raid/amana/lavish_multi_model/emotion_detection/data/raw_audio/"

In [5]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

def normalize_vector(vector):
    vector = vector.detach().numpy()
    vector_min = np.min(vector)
    vector_max = np.max(vector)
    normalized_vector = (vector - vector_min) / (vector_max - vector_min)
    return t.tensor(normalized_vector)


def get_mel_spectogram(audio_file):
    audio_file = audio_dir+"dia0_utt0.wav"
    y, sr = librosa.load(audio_file)
    D = librosa.stft(y)
    S_mel = librosa.feature.melspectrogram(y=y, sr=sr)
    S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
    return S_mel_db

def get_image_input(audio_file):
    # Get the mel spectrogram and extract the clip embedding
    my_normalize = Compose([
				Resize([128,128], interpolation=Image.BICUBIC),
				Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
			])
    S_mel = get_mel_spectogram(audio_file)
    inputs = t.tensor(S_mel)
    res = t.rand(3,*inputs.shape)
    res[0] = res[1] = res[2] = inputs
    return my_normalize(res)

In [123]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPVisionModel

model_clip = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.3

In [6]:
def get_dataset_from_clip(audio_file,device):
    input = get_image_input(audio_file)
    input = normalize_vector(input)
    inputs = processor(images=input, return_tensors="pt").to(device)
    outputs = model_clip(**inputs)
    pooled_output = outputs.last_hidden_state
    return pooled_output.squeeze(0).cpu().detach().numpy()

In [130]:
get_dataset_from_clip(audio_dir+"dia0_utt0.wav",device).shape

(50, 768)

In [7]:
onlyfiles = [f for f in listdir(audio_dir) if isfile(join(audio_dir, f))]
data = pd.read_csv("/raid/amana/lavish_multi_model/emotion_detection/data/text_data.csv")
data['filename'] = [f'dia{a}_utt{b}' for a,b in zip(data['Dialogue_ID'],data['Utterance_ID'])]
labels = list(data['Emotion'])

In [132]:
dataset = []

for a,b in tqdm(zip(onlyfiles,labels)):
    x = get_dataset_from_clip(audio_dir+"/"+a,device)
    dataset.append([x,b])

1109it [10:26,  1.77it/s]


In [8]:
emotion_label = {'anger' : 1, 'disgust' : 2, 'fear' : 3,
                 'joy' : 4, 'neutral' : 5, 'sadness' : 6, 'surprise' : 0}

In [134]:
dataset1 = np.array([np.array([a[0],emotion_label[a[1]]]) for a in dataset])

In [9]:
dataset1 = np.load("dataset_clip_vectors.npy",allow_pickle=True)

In [135]:
np.save("dataset_clip_vectors.npy",dataset1)

In [10]:
X_train,X_test,Y_train,Y_test = train_test_split(dataset1[:,0],dataset1[:,1],test_size=0.2,random_state=42)

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class EmotionMLP(nn.Module):
    def softmax(self,x):
        e_x = t.exp(x - t.max(x))
        return e_x / e_x.sum()
    
    def __init__(self, num_classes):
        super(EmotionMLP, self).__init__()
        self.mat = nn.Parameter(t.rand(50,1))
        self.fc0 = nn.Linear(768,64)
        self.fc4 = nn.Linear(64, num_classes)
    
    def forward(self, x):
        x = t.relu(t.matmul(x.t(),self.mat))
        x = x.view(-1, 768)
        x = t.relu(self.fc0(x))
        x = t.relu(self.fc4(x))
        x = F.softmax(x)
        return x

In [12]:
import warnings
warnings.filterwarnings("ignore")
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [13]:
num_classes = 7
model = EmotionMLP(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.0001)

In [14]:
def train(model,X_train,Y_train,optimizer,criterion,num_epochs,device):
    loss_arr = []
    for epoch in tqdm(range(num_epochs)):
        # model.train()
        total_loss = 0
        for x,y in zip(X_train,Y_train):
            x = t.tensor(x)
            # Forward pass
            outputs = model(x.to(device))
            loss = criterion(outputs,t.tensor([y]).to(device)).to(device)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        loss_arr.append(total_loss/len(X_train))
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {total_loss/len(X_train)}')
    return loss_arr

num_epochs = 20
lossarr = train(model,X_train,Y_train,optimizer,criterion,num_epochs,device)

  5%|▌         | 1/20 [00:04<01:31,  4.80s/it]

Epoch 1/20, Training Loss: 1.9484052223981838


 10%|█         | 2/20 [00:06<00:55,  3.08s/it]

Epoch 2/20, Training Loss: 1.9459102153778076


 15%|█▌        | 3/20 [00:12<01:14,  4.37s/it]

Epoch 3/20, Training Loss: 1.9459102153778076


 20%|██        | 4/20 [00:29<02:28,  9.30s/it]

Epoch 4/20, Training Loss: 1.9459102153778076


 25%|██▌       | 5/20 [00:31<01:40,  6.73s/it]

Epoch 5/20, Training Loss: 1.9459102153778076


 30%|███       | 6/20 [01:04<03:36, 15.48s/it]

Epoch 6/20, Training Loss: 1.9459102153778076


 35%|███▌      | 7/20 [01:37<04:38, 21.44s/it]

Epoch 7/20, Training Loss: 1.9459102153778076


 40%|████      | 8/20 [02:10<05:01, 25.15s/it]

Epoch 8/20, Training Loss: 1.9459102153778076


 45%|████▌     | 9/20 [02:46<05:11, 28.33s/it]

Epoch 9/20, Training Loss: 1.9459102153778076


 50%|█████     | 10/20 [03:16<04:50, 29.03s/it]

Epoch 10/20, Training Loss: 1.9459102153778076


 55%|█████▌    | 11/20 [03:45<04:19, 28.85s/it]

Epoch 11/20, Training Loss: 1.9459102153778076


 60%|██████    | 12/20 [04:13<03:50, 28.81s/it]

Epoch 12/20, Training Loss: 1.9459102153778076


 65%|██████▌   | 13/20 [04:43<03:23, 29.02s/it]

Epoch 13/20, Training Loss: 1.9459102153778076


 70%|███████   | 14/20 [05:06<02:42, 27.11s/it]

Epoch 14/20, Training Loss: 1.9459102153778076


 75%|███████▌  | 15/20 [05:12<01:44, 20.85s/it]

Epoch 15/20, Training Loss: 1.9459102153778076


 80%|████████  | 16/20 [05:26<01:14, 18.65s/it]

Epoch 16/20, Training Loss: 1.9459102153778076


 85%|████████▌ | 17/20 [06:01<01:11, 23.74s/it]

Epoch 17/20, Training Loss: 1.9459102153778076


 90%|█████████ | 18/20 [06:37<00:54, 27.49s/it]

Epoch 18/20, Training Loss: 1.9459102153778076


 95%|█████████▌| 19/20 [07:13<00:29, 29.92s/it]

Epoch 19/20, Training Loss: 1.9459102153778076


100%|██████████| 20/20 [07:48<00:00, 23.42s/it]

Epoch 20/20, Training Loss: 1.9459102153778076





In [21]:
def test(model,X_test,Y_test,device):
    acc = 0
    def accuracy(y_true, y_pred):
        eq = t.eq(y_true, y_pred).int()
        return sum(eq)/len(eq)

    with t.no_grad():
        model.eval()
        for x,y in zip(X_test,Y_test):
            outputs = model(t.tensor(x).to(device))
            outputs1 = outputs.detach().cpu()
            outputs1 = outputs1.argmax(dim=0)
            acc += accuracy(t.tensor([y]),outputs1)
        print(f"accuracy: {(acc/len(X_test))*100: 0.5f}%")
        
test(model,X_test,Y_test,device)

accuray : 30.52685%
