In [1]:
import pandas as pd
import torch
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing

import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F

In [2]:
# Scene-level annotations; the merge below joins on this file's SCENE column.
labels = pd.read_csv("../data/scene_labels.csv")
# Attach the annotations to the test-split scenes. The inner join keeps only
# scenes present in both files, and both key columns ('scene' and 'SCENE')
# survive the merge.
test_labels = pd.read_csv("final_train_test/y_test_final.csv").merge(
    labels, how="inner", left_on="scene", right_on="SCENE"
)

In [3]:
def get_model_data(context_audio_features, audio_features, labels=None):
    """Build one row per labelled scene with its utterance and context audio features.

    Parameters
    ----------
    context_audio_features : mapping
        Context features, looked up with the key ``"<SCENE>_c.wav"``.
    audio_features : mapping
        Utterance features, looked up with the key ``"<SCENE>_u.wav"``.
    labels : pandas.DataFrame, optional
        Frame providing ``SCENE``, ``Sarcasm`` and ``SPEAKER`` columns. When
        omitted, the notebook-level ``test_labels`` frame is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``context_audio_feature``, ``audio_feature``, ``sarcasm``,
        ``speaker`` and ``scene`` (in that order), one row per label row in
        label order, indexed 0..n-1.

    Raises
    ------
    KeyError
        If a scene has no entry in one of the feature mappings.
    """
    if labels is None:
        labels = test_labels

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = [
        {
            'context_audio_feature': context_audio_features[scene + "_c.wav"],
            'audio_feature': audio_features[scene + "_u.wav"],
            'sarcasm': sarcasm,
            'speaker': speaker,
            'scene': scene,
        }
        for scene, sarcasm, speaker in zip(labels["SCENE"], labels["Sarcasm"], labels["SPEAKER"])
    ]
    # Explicit column list keeps the column order stable and yields a
    # correctly-shaped empty frame when there are no labels.
    return pd.DataFrame(
        records,
        columns=['context_audio_feature', 'audio_feature', 'sarcasm', 'speaker', 'scene'],
    )

def get_test_split(model_data, labels=None):
    """Return the rows of ``model_data`` whose ``scene`` appears in the label frame.

    The previous implementation computed this filter but discarded the result,
    so it always returned ``model_data`` unchanged.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Frame with a ``scene`` column.
    labels : pandas.DataFrame, optional
        Frame with a ``SCENE`` column listing the scenes to keep. When omitted,
        the notebook-level ``test_labels`` frame is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels; ``model_data``
        itself is not modified.
    """
    if labels is None:
        labels = test_labels
    in_split = model_data['scene'].isin(labels["SCENE"])
    return model_data.loc[in_split]

In [4]:
# NOTE(review): pickle.load can execute arbitrary code; only load feature
# files that come from a trusted source.
with open('../audio_features/feat_dict_librosa_lld.pickle', 'rb') as utterance_file:
    librosa_audio_features = pickle.load(utterance_file, encoding='latin1')
with open('../audio_features/feat_dict_context_librosa_lld.pickle', 'rb') as context_file:
    librosa_context_audio_features = pickle.load(context_file, encoding='latin1')

# Pair every labelled scene with its context and utterance features.
model_data = get_model_data(librosa_context_audio_features, librosa_audio_features)

# Integer-encode the speaker names.
# NOTE(review): the encoder is fitted on this frame only; if the checkpoint was
# trained with an encoder fitted on a different set of speakers, the integer
# codes will not line up - confirm against the training notebook.
le = preprocessing.LabelEncoder()
speaker_codes = le.fit_transform(model_data['speaker'])
model_data['speaker_encode'] = speaker_codes
model_data.head(5)

Unnamed: 0,context_audio_feature,audio_feature,sarcasm,speaker,scene,speaker_encode
0,"[[-464.8975524902344, -504.45113845098587, -46...","[[-623.8641967773438, -565.6373408390925, -632...",0.0,CHANDLER,2_388,2
1,"[[-640.9610595703125, -503.1803544786241, -506...","[[-421.7781066894531, -454.1127905594675, -529...",1.0,SHELDON,1_5058,21
2,"[[-308.763671875, -489.3131433603715, -457.306...","[[-785.21826171875, -522.0255177815756, -390.1...",1.0,HOWARD,1_S11E21_080,6
3,"[[-432.1380615234375, -560.211009058459, -419....","[[-537.4343872070312, -504.6135768890381, -602...",1.0,RAJ,1_S11E12_038,18
4,"[[-707.3914794921875, -705.6019348144531, -591...","[[-240.57118225097656, -309.30991155450994, -3...",0.0,PENNY,1_S11E01_337,13


In [5]:
# Keep only the columns the feed-forward model consumes.
test_data = get_test_split(model_data)[
    ["context_audio_feature", "audio_feature", "speaker_encode", "sarcasm"]
]

# Work on an independent copy with a clean 0..n-1 index.
fnn_test = test_data.copy().reset_index(drop=True)

# Collapse each feature matrix with a mean along axis 1 and keep the result as
# a single-row 2-D array.
# NOTE(review): assumes axis 1 is the time/frame axis - confirm against the
# feature extraction code.
fnn_test['averaged_audio_feature'] = fnn_test['audio_feature'].map(
    lambda matrix: np.array([np.mean(matrix, axis=1)])
)
fnn_test['averaged_context_audio_feature'] = fnn_test['context_audio_feature'].map(
    lambda matrix: np.array([np.mean(matrix, axis=1)])
)

# Labels as integers (they come out of the label frame as floats/objects).
fnn_test["sarcasm"] = fnn_test["sarcasm"].astype('int')

In [6]:
class FNNTensorDataset(Dataset):
    """Dataset over a frame of averaged audio features.

    Each item is ``(features, label)`` where ``features`` is a float32 tensor
    made of the context features followed by the utterance features and, when
    ``speaker`` is truthy, one extra column holding the speaker code.
    Rows are looked up by index *label* (``.loc``).
    """

    def __init__(self, dataframe, speaker):
        # dataframe: frame with the averaged feature, speaker_encode and sarcasm columns
        # speaker: whether to append the speaker code to the feature vector
        self.data = dataframe
        self.speaker = speaker

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(float32 feature tensor, label)`` for the row labelled ``index``."""
        pieces = [
            self.data.loc[index, 'averaged_context_audio_feature'],
            self.data.loc[index, 'averaged_audio_feature'],
        ]
        if self.speaker:
            # A 1x1 float array so the speaker code can be stacked next to the features.
            speaker_cell = np.empty((1, 1))
            speaker_cell.fill(self.data.loc[index, 'speaker_encode'])
            pieces.append(speaker_cell)
        stacked = np.hstack(pieces)
        return torch.from_numpy(stacked).float(), self.data.loc[index, 'sarcasm']

    def __getindexlist__(self):
        """Index labels of the wrapped frame, as a list."""
        return [label for label in self.data.index.values]

desired_frames = 1
desired_features = 690
class FNNNetSD(nn.Module):
    """Three-layer feed-forward classifier producing two class probabilities.

    The input width is ``desired_frames * (2 * desired_features + 1)``: two
    feature blocks of ``desired_features`` values plus one extra scalar per
    frame (the dataset in this notebook supplies context features, utterance
    features and the speaker code, in that order).
    """

    def __init__(self):
        super().__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(desired_frames*(desired_features + desired_features + 1), hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)

    def forward(self, x):
        """Flatten ``x`` to ``(batch, input_width)`` and return per-class probabilities.

        Returns a ``(batch, 2)`` tensor whose rows sum to 1.
        """
        # Reuse the width fc1 was built with instead of repeating the formula.
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # dim=1 is what the deprecated implicit-dim call resolved to for 2-D
        # input; stating it removes the deprecation warning without changing
        # the result.
        x = F.softmax(self.fc3(x), dim=1)
        return x

# Speaker-dependent dataset: the second argument (True) appends the speaker
# code to every feature vector.
fnn_test_tensor = FNNTensorDataset(
    fnn_test[['averaged_context_audio_feature', 'averaged_audio_feature', 'speaker_encode', 'sarcasm']],
    True,
)
# A single batch holding the whole dataset, served in dataset order.
test_loader = DataLoader(fnn_test_tensor, batch_size=len(fnn_test_tensor))

In [7]:
model = FNNNetSD()
print(model)
# map_location="cpu" lets a checkpoint that was saved from a CUDA device load
# on a CPU-only machine; the model itself is never moved off the CPU here.
audio_best_model = model.load_state_dict(
    torch.load("fnn_audio_best_model80.pt", map_location="cpu")
)
# This notebook only runs inference, so put the model in evaluation mode.
model.eval()
# load_state_dict returns the key-matching report, not a model.
print(audio_best_model)

FNNNetSD(
  (fc1): Linear(in_features=1381, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)
<All keys matched successfully>


In [8]:
def predict_fnn(fnn_model, dataloader):
    """Run ``fnn_model`` over every batch and collect predicted and actual labels.

    Inference runs with autograd disabled and, for ``torch.nn.Module`` models,
    in evaluation mode; the model's previous training mode is restored before
    returning.

    Parameters
    ----------
    fnn_model : callable
        Maps a batch of inputs to a ``(batch, n_classes)`` score tensor.
    dataloader : iterable
        Yields ``(data, target)`` pairs.

    Returns
    -------
    tuple of (list, list)
        ``prediction_list`` holds one CPU tensor of arg-max class indices per
        batch; ``actual_list`` holds the corresponding ``target`` objects.
    """
    prediction_list = []
    actual_list = []

    # Only touch the mode of real modules that are currently in training mode.
    restore_training = isinstance(fnn_model, torch.nn.Module) and fnn_model.training
    if restore_training:
        fnn_model.eval()
    try:
        # No gradients are needed for prediction; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for data, target in dataloader:
                outputs = fnn_model(data)
                _, predicted = torch.max(outputs, 1)
                prediction_list.append(predicted.cpu())
                actual_list.append(target)
    finally:
        if restore_training:
            fnn_model.train()
    return prediction_list, actual_list
# Run the loaded model over the test loader; each returned list holds one entry per batch.
predictions, actuals = predict_fnn(model, test_loader)

In [9]:
# Inspect the predicted class indices (the list holds one tensor per batch).
predictions

[tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
         1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
         1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
         0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
         0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
         1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
         1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
         1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
         1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
         1])]

In [10]:
# Flatten every batch (not just the first) so the table stays complete if the
# loader is ever configured with a smaller batch size. Building the frame from
# a dict makes pandas raise when the scene list and the predictions differ in
# length, instead of zip() silently truncating to the shortest.
# Row order relies on the loader serving samples in test_labels order.
audio_predictions = pd.DataFrame({
    'scene': test_labels['scene'].tolist(),
    'audio_predictions': torch.cat(predictions).tolist(),
    'actuals': torch.cat(actuals).tolist(),
})

In [11]:
# Preview the per-scene table of predicted vs. actual labels.
audio_predictions

Unnamed: 0,scene,audio_predictions,actuals
0,2_388,1,0
1,1_5058,1,1
2,1_S11E21_080,1,1
3,1_S11E12_038,1,1
4,1_S11E01_337,1,0
...,...,...,...
236,1_2423,0,0
237,2_242,1,1
238,2_168,1,1
239,2_270,0,0


In [12]:
# Persist the per-scene predictions; index=False keeps the row index out of the CSV.
audio_predictions.to_csv("audio_predictions.csv", index = False)