In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
from PIL import Image
from torch.utils.data import DataLoader,Dataset,random_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from os import listdir
from sklearn.model_selection import train_test_split
import pandas as pd
from os.path import isfile, join

In [3]:
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [4]:
audio_dir = "/raid/amana/lavish_multi_model/emotion_detection/data/raw_audio/"
image_dir = "/raid/amana/lavish_multi_model/emotion_detection/data/video_frames/"

In [5]:
import torchvision
import glob


def get_image_from_file(image_dir,file_name):
    my_normalize = Compose([
				Resize([224,224], interpolation=Image.BICUBIC),
				Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
			])
    total_num_frames = len(glob.glob(image_dir+'/'+file_name+'/*.jpg'))
    sample_indx = np.linspace(1, total_num_frames , num=10, dtype=int)
    total_img = []
    for vis_idx in range(10):
        tmp_idx = sample_indx[vis_idx]
        tmp_img = torchvision.io.read_image(image_dir+'/'+file_name+'/'+ str("{:08d}".format(tmp_idx))+ '.jpg')/255
        tmp_img = my_normalize(tmp_img)
        total_img.append(tmp_img)
    total_img = t.stack(total_img)
    total_img = t.mean(total_img,dim=0)
    return total_img

In [31]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPVisionModel

import warnings
warnings.filterwarnings("ignore")
t.cuda.set_device(7)
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

model_clip = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight',

In [35]:
def normalize_vector(vector):
    vector = vector.detach().numpy()
    vector_min = np.min(vector)
    vector_max = np.max(vector)
    normalized_vector = (vector - vector_min) / (vector_max - vector_min)
    return t.tensor(normalized_vector)

def get_dataset_from_clip(image_folder,file_name,device):
    input = get_image_from_file(image_folder,file_name)
    input = normalize_vector(input)
    inputs = processor(images=input, return_tensors="pt").to(device)
    outputs = model_clip(**inputs)
    pooled_output = outputs.last_hidden_state
    return pooled_output.squeeze(0).cpu().detach().numpy()

In [36]:
onlyfiles = [f for f in listdir(image_dir)]
data = pd.read_csv("/raid/amana/lavish_multi_model/emotion_detection/data/text_data.csv")
data['filename'] = [f'dia{a}_utt{b}' for a,b in zip(data['Dialogue_ID'],data['Utterance_ID'])]
labels = list(data['Emotion'])

In [37]:
dataset = []

for a,b in tqdm(zip(onlyfiles,labels)):
    x = get_dataset_from_clip(image_dir,a,device)
    dataset.append([x,b])

1109it [48:49,  2.64s/it]


In [40]:
emotion_label = {'anger' : 1, 'disgust' : 2, 'fear' : 3,
                 'joy' : 4, 'neutral' : 5, 'sadness' : 6, 'surprise' : 0}

In [41]:
dataset1 = np.array([np.array([a[0],emotion_label[a[1]]]) for a in dataset])

In [43]:
np.save("dataset_clip_image_vectors1.npy",dataset1)

In [44]:
X_train,X_test,Y_train,Y_test = train_test_split(dataset1[:,0],dataset1[:,1],test_size=0.2,random_state=42)

In [46]:
import torch.nn as nn
import torch.nn.functional as F

class EmotionMLP(nn.Module):
    def softmax(self,x):
        e_x = t.exp(x - t.max(x))
        return e_x / e_x.sum()
    
    def __init__(self, num_classes):
        super(EmotionMLP, self).__init__()
        self.mat = nn.Parameter(t.rand(50,1))
        self.fc0 = nn.Linear(768,64)
        self.fc4 = nn.Linear(64, num_classes)
    
    def forward(self, x):
        x = t.relu(t.matmul(x.t(),self.mat))
        x = x.view(-1, 768)
        x = t.relu(self.fc0(x))
        x = t.relu(self.fc4(x))
        return x

In [47]:
import warnings
warnings.filterwarnings("ignore")
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [48]:
num_classes = 7
model = EmotionMLP(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.0001)

In [49]:
def train(model,X_train,Y_train,optimizer,criterion,num_epochs,device):
    loss_arr = []
    for epoch in tqdm(range(num_epochs)):
        # model.train()
        total_loss = 0
        for x,y in zip(X_train,Y_train):
            x = t.tensor(x)
            # Forward pass
            outputs = model(x.to(device))
            loss = criterion(outputs,t.tensor([y]).to(device)).to(device)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        loss_arr.append(total_loss/len(X_train))
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {total_loss/len(X_train)}')
    return loss_arr

num_epochs = 20
lossarr = train(model,X_train,Y_train,optimizer,criterion,num_epochs,device)

  5%|▌         | 1/20 [00:07<02:31,  7.97s/it]

Epoch 1/20, Training Loss: 1.7472953045448831


 10%|█         | 2/20 [00:17<02:35,  8.66s/it]

Epoch 2/20, Training Loss: 1.7368239916970254


 15%|█▌        | 3/20 [00:25<02:25,  8.56s/it]

Epoch 3/20, Training Loss: 1.7348943842572855


 20%|██        | 4/20 [00:33<02:11,  8.22s/it]

Epoch 4/20, Training Loss: 1.7330014184873295


 25%|██▌       | 5/20 [00:35<01:32,  6.15s/it]

Epoch 5/20, Training Loss: 1.7311757119223392


 30%|███       | 6/20 [00:40<01:19,  5.69s/it]

Epoch 6/20, Training Loss: 1.7287293471639562


 35%|███▌      | 7/20 [00:44<01:04,  4.99s/it]

Epoch 7/20, Training Loss: 1.7261072393536165


 40%|████      | 8/20 [00:46<00:49,  4.10s/it]

Epoch 8/20, Training Loss: 1.723993826510538


 45%|████▌     | 9/20 [00:49<00:40,  3.70s/it]

Epoch 9/20, Training Loss: 1.7227014555199978


 50%|█████     | 10/20 [00:54<00:43,  4.31s/it]

Epoch 10/20, Training Loss: 1.719993760580409


 55%|█████▌    | 11/20 [01:04<00:54,  6.11s/it]

Epoch 11/20, Training Loss: 1.7194270033645522


 60%|██████    | 12/20 [01:06<00:38,  4.83s/it]

Epoch 12/20, Training Loss: 1.7188433098618314


 65%|██████▌   | 13/20 [01:08<00:27,  3.89s/it]

Epoch 13/20, Training Loss: 1.7182366111501632


 70%|███████   | 14/20 [01:10<00:19,  3.23s/it]

Epoch 14/20, Training Loss: 1.7177167839907201


 75%|███████▌  | 15/20 [01:26<00:36,  7.23s/it]

Epoch 15/20, Training Loss: 1.7161424615270255


 80%|████████  | 16/20 [01:53<00:52, 13.05s/it]

Epoch 16/20, Training Loss: 1.7153092138931205


 85%|████████▌ | 17/20 [02:28<00:59, 19.75s/it]

Epoch 17/20, Training Loss: 1.7149510637614478


 90%|█████████ | 18/20 [03:04<00:49, 24.64s/it]

Epoch 18/20, Training Loss: 1.7145708174420586


 95%|█████████▌| 19/20 [03:23<00:22, 22.86s/it]

Epoch 19/20, Training Loss: 1.7142838453359508


100%|██████████| 20/20 [03:38<00:00, 10.94s/it]

Epoch 20/20, Training Loss: 1.7139769481429925





In [52]:
def test(model,X_test,Y_test,device):
    acc = 0
    def accuracy(y_true, y_pred):
        eq = t.eq(y_true, y_pred).int()
        return sum(eq)/len(eq)

    with t.no_grad():
        model.eval()
        for x,y in zip(X_test,Y_test):
            outputs = model(t.tensor(x).to(device))
            outputs1 = outputs.detach().cpu()
            outputs1 = outputs1.argmax(dim=0)
            acc += accuracy(t.tensor([y]),outputs1)
        print(f"accuracy: {(acc/len(X_test))*100: 0.2f}%")
        
test(model,X_test,Y_test,device)

accuracy:  30.61%
