In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing

import torch
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable

In [2]:
labels = pd.read_csv("../../data/scene_labels.csv")

In [3]:
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


#### Perform mean, median, max, min and sum pooling on audio feature data

In [4]:
def get_model_data(context_audio_features, audio_features):
    model_data = pd.DataFrame(columns=['context_audio_feature', 'audio_feature','sarcasm','sarcasm_type', 'speaker'])
    for index, row in labels.iterrows():
        audio_key = row["SCENE"] + "_u.wav"
        context_audio_key = row["SCENE"] + "_c.wav"
        model_data = model_data.append({'context_audio_feature': context_audio_features[context_audio_key],
                                    'audio_feature': audio_features[audio_key],
                                    'sarcasm' : row["Sarcasm"],
                                    'sarcasm_type' : row["Sarcasm_Type"],
                                    'speaker' : row["SPEAKER"]},
                                  ignore_index=True)
    return model_data

In [5]:
def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    X_train, X_test, Y_train, Y_test = train_test_split(
        model_data[x_columns],
        model_data[y_column],
        train_size=0.8, 
        test_size=0.2, 
        random_state=42, 
        shuffle=True,
        stratify=model_data[stratify_column])
    
    print("Train: ",X_train.shape, Y_train.shape,
      "Test: ",(X_test.shape, Y_test.shape))
    print(type(X_train))
    train_data = pd.merge(X_train, Y_train, left_index=True, right_index=True)
    test_data = pd.merge(X_test, Y_test, left_index=True, right_index=True)
    return train_data, test_data

In [15]:
class RNNTensorDataset(Dataset):
    def __init__(self, dataframe, speaker):
        self.data = dataframe
        self.speaker = speaker

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.speaker:
            features = self.data.loc[index, 'padded_audio_feature']
            ctxt_features = self.data.loc[index, 'padded_context_audio_feature']
            final_features = np.vstack((ctxt_features, features))
            a=np.empty((36,1))
            a.fill(self.data.loc[index, 'speaker_encode'])
            final_features = np.hstack((final_features, a))
            label = self.data.loc[index, 'sarcasm']
            return torch.from_numpy(final_features).float(), label
        else:
            features = self.data.loc[index, 'padded_audio_feature']
            ctxt_features = self.data.loc[index, 'padded_context_audio_feature']
            final_features = np.vstack((ctxt_features, features))
            label = self.data.loc[index, 'sarcasm']
            return torch.from_numpy(final_features).float(), label
    
    def __getindexlist__(self):
        return list(self.data.index.values)
    
class RNNetSD(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, layers):
        super(RNNetSD, self).__init__()
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True, nonlinearity="relu")
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = Variable(torch.zeros(self.layers, x.size(0), self.hidden_dim))
        out, hn = self.rnn(x, h0)
        out = F.softmax(self.fc(out[:, -1, :]))
        return out
    
class RNNetSID(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, layers):
        super(RNNetSID, self).__init__()
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True, nonlinearity="relu")
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = Variable(torch.zeros(self.layers, x.size(0), self.hidden_dim))
        out, hn = self.rnn(x, h0)
        out = F.softmax(self.fc(out[:, -1, :]))
        return out
    
def evaluateRNN(rnn, review, size):
    output = rnn(review)
    return output

def categoryFromOutput(output):
    top_n, top_i = torch.max(output,dim=1)
    return top_i

def test_accuracy(rnn, loader, size):
    actuals = []
    predictions = []
    for data, target in loader:
        output = evaluateRNN(rnn, data, size)
        prediction_index = categoryFromOutput(output)
        predictions = prediction_index.tolist()
        actuals = target.tolist()
    return predictions, actuals

In [7]:
with open('../../audio_features/feat_dict_librosa_lld.pickle', 'rb') as f:
    librosa_audio_features = pickle.load(f, encoding='latin1')
with open('../../audio_features/feat_dict_context_librosa_lld.pickle', 'rb') as f:
    librosa_context_audio_features = pickle.load(f, encoding='latin1')

model_data = get_model_data(librosa_context_audio_features, librosa_audio_features)  
le = preprocessing.LabelEncoder()
model_data['speaker_encode'] = le.fit_transform(model_data['speaker'])
model_data.head(5)

Unnamed: 0,context_audio_feature,audio_feature,sarcasm,sarcasm_type,speaker,speaker_encode
0,"[[-637.1869506835938, -527.0845538153164, -543...","[[-386.6164855957031, -649.6673512776692, -633...",0.0,NONE,SHELDON,25
1,"[[-625.8624267578125, -403.14568843397984, -38...","[[-255.5221405029297, -484.69307309105284, -52...",0.0,NONE,PENNY,15
2,"[[-500.3988952636719, -396.36312383413315, -22...","[[-569.0548095703125, -381.4147456242488, -221...",0.0,NONE,RAJ,21
3,"[[-313.1777038574219, -228.85994769251624, -35...","[[-237.61074829101562, -211.002773845897, -382...",1.0,PRO,HOWARD,7
4,"[[-337.88116455078125, -420.55431980762665, -3...","[[-530.5701293945312, -374.83951552370763, -42...",0.0,NONE,SHELDON,25


In [8]:
train_data, test_data = get_train_test_split(model_data, ['context_audio_feature', 'audio_feature', 'speaker_encode'], 'sarcasm', 'sarcasm_type')
rnn_train = train_data.copy()
rnn_test = test_data.copy()
rnn_train.reset_index(drop=True, inplace = True)
rnn_test.reset_index(drop=True, inplace = True)

Train:  (961, 3) (961,) Test:  ((241, 3), (241,))
<class 'pandas.core.frame.DataFrame'>


In [9]:
desired_length = 18
desired_length_ctxt = 18

rnn_train['padded_audio_feature'] = rnn_train.loc[:, 'audio_feature']
for index, row in rnn_train.iterrows():
    data_array = row['padded_audio_feature']
    new_array = []
    for arr in data_array:
        if arr.shape[0] < desired_length:
            arr = np.pad(arr, (0, desired_length - arr.shape[0]), 'constant')
            new_array.append(arr)
        else:
            new_array.append(arr)
    rnn_train.at[index, "padded_audio_feature"] = np.transpose(np.array(new_array))

rnn_train['padded_context_audio_feature'] = rnn_train.loc[:, 'context_audio_feature']
for index, row in rnn_train.iterrows():
    data_array = row['padded_context_audio_feature']
    new_array = []
    for arr in data_array:
        if arr.shape[0] < desired_length_ctxt:
            arr = np.pad(arr, (0, desired_length_ctxt - arr.shape[0]), 'constant')
            new_array.append(arr)
        else:
            new_array.append(arr)
    rnn_train.at[index, "padded_context_audio_feature"] = np.transpose(np.array(new_array))

rnn_test['padded_audio_feature'] = rnn_test.loc[:, 'audio_feature']
for index, row in rnn_test.iterrows():
    data_array = row['padded_audio_feature']
    new_array = []
    for arr in data_array:
        if arr.shape[0] < desired_length:
            arr = np.pad(arr, (0, desired_length - arr.shape[0]), 'constant')
            new_array.append(arr)
        else:
            new_array.append(arr)
    rnn_test.at[index, "padded_audio_feature"] = np.transpose(np.array(new_array))
    
rnn_test['padded_context_audio_feature'] = rnn_test.loc[:, 'audio_feature']
for index, row in rnn_test.iterrows():
    data_array = row['padded_context_audio_feature']
    new_array = []
    for arr in data_array:
        if arr.shape[0] < desired_length_ctxt:
            arr = np.pad(arr, (0, desired_length_ctxt - arr.shape[0]), 'constant')
            new_array.append(arr)
        else:
            new_array.append(arr)
    rnn_test.at[index, "padded_context_audio_feature"] = np.transpose(np.array(new_array))

rnn_train["sarcasm"] = rnn_train["sarcasm"].astype('int').to_numpy()
rnn_test["sarcasm"] = rnn_test["sarcasm"].astype('int').to_numpy()

In [10]:
EMBEDDING_DIM_sid = 690
EMBEDDING_DIM_sd = 691
HIDDEN_DIM = 20
OUTPUT_DIM = 2
layers = 2
criterion = nn.NLLLoss()

### Speaker InDependent

In [11]:
rnn_train_tensor = RNNTensorDataset(rnn_train[['padded_context_audio_feature', 'padded_audio_feature', 'sarcasm']], False)
rnn_test_tensor = RNNTensorDataset(rnn_test[['padded_context_audio_feature', 'padded_audio_feature', 'sarcasm']], False)

num_of_workers = 0
batch_size = 31
valid_size = 0.1

train_indices = list(range(len(rnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(rnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    rnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    rnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

test_loader_epoch = torch.utils.data.DataLoader(
    rnn_test_tensor, batch_size=rnn_test_tensor.__len__())

In [12]:
rnn = RNNetSID(EMBEDDING_DIM_sid, HIDDEN_DIM, OUTPUT_DIM, layers)
print(rnn)

optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

n_epochs = 501
    
test_min_loss = np.inf

for epoch in range(n_epochs):
    torch.manual_seed(42)
    train_loss = 0.0
    test_loss = 0.0
    rnn.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = rnn(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*data.size(0)

    rnn.eval()
    for data, target in test_loader:
        if data.shape[1] < 31:
            continue
        output = rnn(data)
        loss = criterion(output, target)
        test_loss += loss.item()*data.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)

    if(epoch%20 == 0):
        print("Epoch: " + str(epoch))
        predictions, actuals = test_accuracy(rnn, test_loader_epoch, rnn_test_tensor.__len__())
        print(classification_report(actuals, predictions, digits=4))

RNNetSID(
  (rnn): RNN(690, 20, num_layers=2, batch_first=True)
  (fc): Linear(in_features=20, out_features=2, bias=True)
)
Epoch: 0
              precision    recall  f1-score   support

           0     0.4979    1.0000    0.6648       120
           1     0.0000    0.0000    0.0000       121

    accuracy                         0.4979       241
   macro avg     0.2490    0.5000    0.3324       241
weighted avg     0.2479    0.4979    0.3310       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.7209    0.5167    0.6019       120
           1     0.6258    0.8017    0.7029       121

    accuracy                         0.6598       241
   macro avg     0.6734    0.6592    0.6524       241
weighted avg     0.6732    0.6598    0.6526       241

Epoch: 40
              precision    recall  f1-score   support

           0     0.6771    0.5417    0.6019       120
           1     0.6207    0.7438    0.6767       121

    accuracy                 

Epoch: 480
              precision    recall  f1-score   support

           0     0.6259    0.7250    0.6718       120
           1     0.6765    0.5702    0.6188       121

    accuracy                         0.6473       241
   macro avg     0.6512    0.6476    0.6453       241
weighted avg     0.6513    0.6473    0.6452       241

Epoch: 500
              precision    recall  f1-score   support

           0     0.6181    0.7417    0.6742       120
           1     0.6804    0.5455    0.6055       121

    accuracy                         0.6432       241
   macro avg     0.6492    0.6436    0.6399       241
weighted avg     0.6494    0.6432    0.6397       241



### Speaker Dependent

In [16]:
rnn_train_tensor = RNNTensorDataset(rnn_train[['padded_context_audio_feature', 'padded_audio_feature', 'speaker_encode','sarcasm']], True)
rnn_test_tensor = RNNTensorDataset(rnn_test[['padded_context_audio_feature', 'padded_audio_feature', 'speaker_encode', 'sarcasm']], True)

num_of_workers = 0
batch_size = 31
valid_size = 0.1

train_indices = list(range(len(rnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(rnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    rnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    rnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

test_loader_epoch = torch.utils.data.DataLoader(
    rnn_test_tensor, batch_size=rnn_test_tensor.__len__())

In [17]:
rnn = RNNetSD(EMBEDDING_DIM_sd, HIDDEN_DIM, OUTPUT_DIM, layers)
print(rnn)

optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

n_epochs = 501
    
test_min_loss = np.inf

for epoch in range(n_epochs):
    torch.manual_seed(42)
    train_loss = 0.0
    test_loss = 0.0
    rnn.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = rnn(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*data.size(0)

    rnn.eval()
    for data, target in test_loader:
        if data.shape[1] < 31:
            continue
        output = rnn(data)
        loss = criterion(output, target)
        test_loss += loss.item()*data.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)

    if(epoch%10 == 0):
        print("Epoch: " + str(epoch))
        predictions, actuals = test_accuracy(rnn, test_loader_epoch, rnn_test_tensor.__len__())
        print(classification_report(actuals, predictions, digits=4))

RNNetSD(
  (rnn): RNN(691, 20, num_layers=2, batch_first=True)
  (fc): Linear(in_features=20, out_features=2, bias=True)
)
Epoch: 0
              precision    recall  f1-score   support

           0     0.5063    1.0000    0.6723       120
           1     1.0000    0.0331    0.0640       121

    accuracy                         0.5145       241
   macro avg     0.7532    0.5165    0.3681       241
weighted avg     0.7542    0.5145    0.3669       241

Epoch: 10
              precision    recall  f1-score   support

           0     0.7273    0.4667    0.5685       120
           1     0.6098    0.8264    0.7018       121

    accuracy                         0.6473       241
   macro avg     0.6685    0.6466    0.6351       241
weighted avg     0.6683    0.6473    0.6354       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.7129    0.6000    0.6516       120
           1     0.6571    0.7603    0.7050       121

    accuracy                  

Epoch: 240
              precision    recall  f1-score   support

           0     0.5944    0.7083    0.6464       120
           1     0.6429    0.5207    0.5753       121

    accuracy                         0.6141       241
   macro avg     0.6186    0.6145    0.6109       241
weighted avg     0.6187    0.6141    0.6107       241

Epoch: 250
              precision    recall  f1-score   support

           0     0.5899    0.6833    0.6332       120
           1     0.6275    0.5289    0.5740       121

    accuracy                         0.6058       241
   macro avg     0.6087    0.6061    0.6036       241
weighted avg     0.6088    0.6058    0.6035       241

Epoch: 260
              precision    recall  f1-score   support

           0     0.5828    0.7917    0.6714       120
           1     0.6795    0.4380    0.5327       121

    accuracy                         0.6141       241
   macro avg     0.6312    0.6148    0.6020       241
weighted avg     0.6314    0.6141    0.60

Epoch: 490
              precision    recall  f1-score   support

           0     0.5873    0.6167    0.6016       120
           1     0.6000    0.5702    0.5847       121

    accuracy                         0.5934       241
   macro avg     0.5937    0.5935    0.5932       241
weighted avg     0.5937    0.5934    0.5932       241

Epoch: 500
              precision    recall  f1-score   support

           0     0.5872    0.5333    0.5590       120
           1     0.5758    0.6281    0.6008       121

    accuracy                         0.5809       241
   macro avg     0.5815    0.5807    0.5799       241
weighted avg     0.5814    0.5809    0.5800       241

