In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn import preprocessing

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing

import torch
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable

# Notebook-level settings. Neither name is referenced again in the visible cells
# (the 2048 feature width is written as a literal where it is used).
desired_frames = 1
desired_features = 2048

In [2]:
# Load the scene label table. NOTE(review): absolute, machine-specific path —
# adjust before running on another machine.
labels = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/data/scene_labels.csv")

In [3]:
# Preview the first five rows of the label table.
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


In [4]:
# Distinct scene identifiers, in order of first appearance in the label table.
scenes = list(labels["SCENE"].unique())

In [5]:
# Number of distinct scenes.
len(scenes)

1202

In [6]:
parent_dir = "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/data/features/utterances_final"
max_frames = 15     # every scene ends up with exactly this many frame rows
feature_dim = 2048  # width of each pooled frame vector

# Maps scene id -> (max_frames, feature_dim) array of pooled per-frame features.
visual_features = {}
for scene in scenes:
    feature_path = os.path.join(parent_dir, "resnet_pool5_" + scene + ".npy")
    try:
        vf = np.load(feature_path)
        # Global average pooling over axes 2 and 3, then drop the pooled axes.
        vf_p = np.apply_over_axes(np.mean, vf, [2, 3])
        vf_p = np.reshape(vf_p, (vf_p.shape[0], feature_dim))
        if vf_p.shape[0] != max_frames:
            # Zero-pad short scenes at the end. A scene with MORE than
            # max_frames frames makes np.pad raise ValueError (negative pad
            # width) and therefore takes the all-zero fallback below.
            vf_p = np.pad(vf_p, [(0, max_frames - vf_p.shape[0]), (0, 0)], mode='constant')
    except (OSError, ValueError):
        # Missing/unreadable file or unexpected array shape: use all-zero
        # features for this scene. Only these error types are caught so that
        # interrupts and programming errors are no longer silently swallowed.
        vf_p = np.zeros((max_frames, feature_dim))

    visual_features[scene] = vf_p

In [7]:
def get_model_data(video_features, label_frame=None):
    """Attach each scene's video features to its label row.

    Rows are collected as records and the DataFrame is built once, instead of
    calling ``DataFrame.append`` per row (removed in pandas 2.0 and quadratic
    in the number of rows).

    Args:
        video_features: Mapping from scene id to that scene's feature array.
        label_frame: DataFrame with ``SCENE``, ``Sarcasm``, ``Sarcasm_Type``
            and ``SPEAKER`` columns. Defaults to the notebook-level ``labels``
            table when omitted.

    Returns:
        DataFrame with one row per label row and the columns
        ``video_feature``, ``sarcasm``, ``sarcasm_type``, ``speaker``, ``SCENE``
        (in that order).

    Raises:
        KeyError: If a scene in the label table has no entry in
            ``video_features``.
    """
    if label_frame is None:
        label_frame = labels

    label_columns = zip(
        label_frame["SCENE"],
        label_frame["Sarcasm"],
        label_frame["Sarcasm_Type"],
        label_frame["SPEAKER"],
    )
    records = [
        {
            'video_feature': video_features[scene],
            'sarcasm': sarcasm,
            'sarcasm_type': sarcasm_type,
            'speaker': speaker,
            'SCENE': scene,
        }
        for scene, sarcasm, sarcasm_type, speaker in label_columns
    ]
    # Explicit column list keeps the original column order and also yields a
    # correctly shaped (empty) frame when there are no label rows.
    return pd.DataFrame(
        records,
        columns=['video_feature', 'sarcasm', 'sarcasm_type', 'speaker', 'SCENE'],
    )

In [8]:
# Blanket filter: suppresses every Python warning from here on in this session.
warnings.filterwarnings("ignore")
# One row per label row, with that scene's visual feature array attached.
model_data = get_model_data(visual_features)

In [9]:
# Preview the assembled feature/label table.
model_data.head()

Unnamed: 0,video_feature,sarcasm,sarcasm_type,speaker,SCENE
0,"[[0.13835047, 0.2704592, 0.44648886, 0.1415337...",0.0,NONE,SHELDON,1_10004
1,"[[0.46479157, 0.1813915, 0.22123067, 0.5245148...",0.0,NONE,PENNY,1_10009
2,"[[0.253619, 0.25664786, 0.6646118, 0.4821793, ...",0.0,NONE,RAJ,1_1001
3,"[[0.55624646, 0.16990338, 0.62457716, 0.209021...",1.0,PRO,HOWARD,1_1003
4,"[[0.6140023, 0.4846397, 0.79425097, 0.13518682...",0.0,NONE,SHELDON,1_10190


In [10]:
# Encode each speaker name as an integer id, stored in `speaker_encode`.
le = preprocessing.LabelEncoder()
le.fit(model_data['speaker'])
model_data['speaker_encode'] = le.transform(model_data['speaker'])
model_data.head()

Unnamed: 0,video_feature,sarcasm,sarcasm_type,speaker,SCENE,speaker_encode
0,"[[0.13835047, 0.2704592, 0.44648886, 0.1415337...",0.0,NONE,SHELDON,1_10004,25
1,"[[0.46479157, 0.1813915, 0.22123067, 0.5245148...",0.0,NONE,PENNY,1_10009,15
2,"[[0.253619, 0.25664786, 0.6646118, 0.4821793, ...",0.0,NONE,RAJ,1_1001,21
3,"[[0.55624646, 0.16990338, 0.62457716, 0.209021...",1.0,PRO,HOWARD,1_1003,7
4,"[[0.6140023, 0.4846397, 0.79425097, 0.13518682...",0.0,NONE,SHELDON,1_10190,25


In [11]:
# Load the table of test scenes (presumably the held-out split, per the file
# name). NOTE(review): absolute, machine-specific path.
y_test = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/MultiModal/final_train_test/y_test_final.csv")
y_test.head()


Unnamed: 0,scene,sarcasm
0,2_388,0.0
1,1_5058,1.0
2,1_S11E21_080,1.0
3,1_S11E12_038,1.0
4,1_S11E01_337,0.0


In [12]:
# Keep only rows whose scene appears in the test table, renumbered from 0.
test_data = (
    model_data[model_data["SCENE"].isin(y_test.scene)]
    .reset_index(drop=True)
)

In [13]:
# Preview the filtered test rows.
test_data.head()

Unnamed: 0,video_feature,sarcasm,sarcasm_type,speaker,SCENE,speaker_encode
0,"[[0.46479157, 0.1813915, 0.22123067, 0.5245148...",0.0,NONE,PENNY,1_10009,15
1,"[[0.55624646, 0.16990338, 0.62457716, 0.209021...",1.0,PRO,HOWARD,1_1003,7
2,"[[0.32759786, 0.4366611, 0.5403976, 0.48374075...",1.0,EMB,SHELDON,1_105,25
3,"[[0.31870434, 0.48313776, 0.79392844, 0.464412...",0.0,NONE,AMY,1_10797,0
4,"[[0.20445585, 0.6441803, 0.8992094, 0.28929886...",1.0,PRO,RAJ,1_10890,21


In [14]:
desired_length = 15

# The loaded features are already padded, so the "padded" column is a plain
# copy of `video_feature`.
test_data['padded_video_feature'] = test_data['video_feature']

# Store the sarcasm label as an integer class index.
test_data["sarcasm"] = test_data["sarcasm"].astype('int').to_numpy()

In [15]:
# Preview the test rows after adding the padded-feature column and integer labels.
test_data.head()

Unnamed: 0,video_feature,sarcasm,sarcasm_type,speaker,SCENE,speaker_encode,padded_video_feature
0,"[[0.46479157, 0.1813915, 0.22123067, 0.5245148...",0,NONE,PENNY,1_10009,15,"[[0.46479157, 0.1813915, 0.22123067, 0.5245148..."
1,"[[0.55624646, 0.16990338, 0.62457716, 0.209021...",1,PRO,HOWARD,1_1003,7,"[[0.55624646, 0.16990338, 0.62457716, 0.209021..."
2,"[[0.32759786, 0.4366611, 0.5403976, 0.48374075...",1,EMB,SHELDON,1_105,25,"[[0.32759786, 0.4366611, 0.5403976, 0.48374075..."
3,"[[0.31870434, 0.48313776, 0.79392844, 0.464412...",0,NONE,AMY,1_10797,0,"[[0.31870434, 0.48313776, 0.79392844, 0.464412..."
4,"[[0.20445585, 0.6441803, 0.8992094, 0.28929886...",1,PRO,RAJ,1_10890,21,"[[0.20445585, 0.6441803, 0.8992094, 0.28929886..."


In [16]:
class RNNTensorDataset(Dataset):
    """Dataset yielding ``(feature_tensor, label)`` pairs from a DataFrame.

    The frame must provide ``padded_video_feature`` (a 2-D array per row) and
    ``sarcasm`` columns, plus ``speaker_encode`` when ``speaker`` is truthy.

    Rows are looked up by index *label* (``.loc``), so a sampler must draw
    from ``__getindexlist__()`` or the frame must carry a 0..n-1 index.
    """

    def __init__(self, dataframe, speaker):
        """Store the source frame and whether to append the speaker id.

        Args:
            dataframe: Source DataFrame (see class docstring for columns).
            speaker: If truthy, each frame's feature vector gets one extra
                trailing column holding the row's ``speaker_encode`` value.
        """
        self.data = dataframe
        self.speaker = speaker

    def __len__(self):
        """Return the number of rows in the source frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(float32 feature tensor, label)`` for the row labelled ``index``."""
        features = self.data.loc[index, 'padded_video_feature']
        label = self.data.loc[index, 'sarcasm']
        if self.speaker:
            # Size the speaker column from the feature array itself rather
            # than assuming a fixed number of frames.
            speaker_column = np.full(
                (features.shape[0], 1),
                self.data.loc[index, 'speaker_encode'],
                dtype=float,
            )
            features = np.hstack((features, speaker_column))
        return torch.from_numpy(features).float(), label

    def __getindexlist__(self):
        """Return the frame's index labels as a list."""
        return list(self.data.index.values)
    
class RNNetSD(nn.Module):
    """Multi-layer ReLU RNN classifier over a sequence of feature vectors.

    The hidden state of the top layer at the last time step is passed through
    a linear layer and a softmax, giving one probability row per sequence.

    NOTE: the attribute names (``hidden_dim``, ``layers``, ``rnn``, ``fc``) are
    kept unchanged because this notebook loads a whole pickled model object,
    which restores attributes by name.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, layers):
        """Build the network.

        Args:
            input_dim: Size of each per-step feature vector.
            hidden_dim: Size of the RNN hidden state.
            output_dim: Number of output classes.
            layers: Number of stacked RNN layers.
        """
        super(RNNetSD, self).__init__()
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True, nonlinearity="relu")
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, output_dim)``.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)`` (batch first).
        """
        # Zero initial hidden state, created on the input's device and dtype
        # so the model also works on GPU or with non-float32 inputs.
        h0 = torch.zeros(
            self.layers, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.rnn(x, h0)
        # Classify from the last time step. `dim=1` is stated explicitly: the
        # implicit choice is deprecated, and for this 2-D input it was also 1.
        return F.softmax(self.fc(out[:, -1, :]), dim=1)
    
def evaluateRNN(rnn, review, size):
    """Call ``rnn`` on ``review`` and return its output.

    ``size`` is accepted for call compatibility but is not used.
    """
    return rnn(review)

def categoryFromOutput(output):
    """Return, for each row of ``output``, the index of its largest value (along dim 1)."""
    return torch.max(output, dim=1).indices

def test_accuracy(rnn, loader, size):
    """Run ``rnn`` over every batch in ``loader`` and collect the results.

    Results are accumulated across all batches. Previously each batch
    overwrote the lists, so only the final batch was returned; that was
    correct only for a loader that yields a single batch.

    Args:
        rnn: Model called on each batch of inputs.
        loader: Iterable of ``(data, target)`` batches.
        size: Passed through to ``evaluateRNN`` (unused there).

    Returns:
        ``(predictions, actuals)``: two lists with one predicted class index
        and one true label per sample, in the order the loader yields them.
    """
    predictions = []
    actuals = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for data, target in loader:
            output = evaluateRNN(rnn, data, size)
            predictions.extend(categoryFromOutput(output).tolist())
            actuals.extend(target.tolist())
    return predictions, actuals

# Wrap the test frame in a dataset that appends the speaker id to every frame.
rnn_test_tensor = RNNTensorDataset(
    test_data[['padded_video_feature', 'speaker_encode', 'sarcasm']],
    True,
)

test_indices = list(range(len(rnn_test_tensor)))

num_of_workers = 0
batch_size = 44
valid_size = 0.1

# Mini-batch loader that visits the test rows in random order.
test_loader = DataLoader(
    rnn_test_tensor,
    batch_size=batch_size,
    sampler=SubsetRandomSampler(test_indices),
)

# Single-batch loader: one batch containing the whole test set, in row order.
test_loader_epoch = DataLoader(rnn_test_tensor, batch_size=len(rnn_test_tensor))

In [17]:
# Load the saved model object (the cell output shows an RNNetSD instance, so the
# RNNetSD class must already be defined in this session).
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# from a trusted source. Absolute, machine-specific path.
rnn = torch.load("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/visual/Models/video_rnn_speaker_dependent_model.pt")
# Switch the module to evaluation mode.
rnn.eval()

RNNetSD(
  (rnn): RNN(2049, 20, num_layers=2, batch_first=True)
  (fc): Linear(in_features=20, out_features=2, bias=True)
)

In [18]:
# Score the whole test set in one batch and report per-class metrics.
predictions, actuals = test_accuracy(rnn, test_loader_epoch, len(rnn_test_tensor))
report = classification_report(actuals, predictions, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.6495    0.5207    0.5780       121
           1     0.5972    0.7167    0.6515       120

    accuracy                         0.6183       241
   macro avg     0.6234    0.6187    0.6147       241
weighted avg     0.6235    0.6183    0.6146       241



In [19]:
# Attach the predicted class to each test row. This assumes `predictions` is in
# test_data row order, which holds for the unshuffled single-batch loader.
test_data["Predictions"] = predictions

In [20]:
# Keep only the scene id, the predicted class and the true label.
final_predictions = test_data[['SCENE', 'Predictions', 'sarcasm']]

In [21]:
# Rename the three columns positionally: SCENE, Predictions, sarcasm ->
# scene, visual_predictions, actuals.
final_predictions.columns = ['scene', 'visual_predictions', 'actuals']

In [None]:
# The original cell ended in a dangling attribute access (a SyntaxError).
# Preview the prediction table instead; replace with an export call if the
# intent was to save it.
final_predictions.head()