In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing

import torch
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

desired_frames = 1
desired_features = 1024

In [2]:
labels = pd.read_csv("../../../data/scene_labels.csv")

In [3]:
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


#### Perform mean, median, max, min and sum pooling on audio feature data

In [4]:
def get_model_data(audio_features):
    model_data = pd.DataFrame(columns=['audio_feature','sarcasm','sarcasm_type', 'speaker'])
    for index, row in labels.iterrows():
        audio_key = row["SCENE"] + "_u.wav"
        model_data = model_data.append({'audio_feature': audio_features[audio_key],
                                    'sarcasm' : row["Sarcasm"],
                                    'sarcasm_type' : row["Sarcasm_Type"],
                                    'speaker' : row["SPEAKER"]},
                                  ignore_index=True)
    return model_data

In [5]:
def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    X_train, X_test, Y_train, Y_test = train_test_split(
        model_data[x_columns],
        model_data[y_column],
        train_size=0.8, 
        test_size=0.2, 
        random_state=42, 
        shuffle=True,
        stratify=model_data[stratify_column])
    
    print("Train: ",X_train.shape, Y_train.shape,
      "Test: ",(X_test.shape, Y_test.shape))
    train_data = pd.merge(X_train, Y_train, left_index=True, right_index=True)
    test_data = pd.merge(X_test, Y_test, left_index=True, right_index=True)
    return train_data, test_data

In [6]:
class FNNTensorDataset(Dataset):
    def __init__(self, dataframe, speaker):
        self.data = dataframe
        self.speaker = speaker

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.speaker:
            features = self.data.loc[index, 'averaged_audio_feature']
            a=np.empty((1,1))
            a.fill(self.data.loc[index, 'speaker_encode'])
            final_features = np.hstack((features, a.reshape((1,))))        
            label = self.data.loc[index, 'sarcasm']
            return torch.from_numpy(final_features).float(), label
        else:
            features = self.data.loc[index, 'averaged_audio_feature']
            label = self.data.loc[index, 'sarcasm']
            return torch.from_numpy(features).float(), label
    
    def __getindexlist__(self):
        return list(self.data.index.values)

class FNNNetSID(nn.Module):
    def __init__(self):
        super(FNNNetSID, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(desired_frames*desired_features, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)

    def forward(self, x):
        x = x.view(-1, desired_frames*desired_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x
    
class FNNNetSD(nn.Module):
    def __init__(self):
        super(FNNNetSD, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(desired_frames*(desired_features+1), hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)

    def forward(self, x):
        x = x.view(-1, desired_frames*(desired_features+1))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x
    
def predict_fnn(fnn_model, dataloader):
    prediction_list = []
    actual_list = []
    for data, target in dataloader:
        outputs = fnn_model(data)
        _, predicted = torch.max(outputs.data, 1) 
        prediction_list.append(predicted.cpu())
        actual_list.append(target)
    return prediction_list, actual_list

In [7]:
with open('../../../audio_features/trill_features.pickle', 'rb') as f:
    librosa_audio_features = pickle.load(f, encoding='latin1')
    
model_data = get_model_data(librosa_audio_features)
le = preprocessing.LabelEncoder()
model_data['speaker_encode'] = le.fit_transform(model_data['speaker'])
model_data.head(5)

Unnamed: 0,audio_feature,sarcasm,sarcasm_type,speaker,speaker_encode
0,"[-1.406311, -0.46280488, -1.1319538, -1.194219...",0.0,NONE,SHELDON,25
1,"[-1.4183334, -0.36521277, -1.1331908, -1.17071...",0.0,NONE,PENNY,15
2,"[-1.3609562, -0.27142158, -0.63655925, -1.0188...",0.0,NONE,RAJ,21
3,"[-1.4286865, -0.31481665, -0.67340577, -0.9725...",1.0,PRO,HOWARD,7
4,"[-1.3821282, -0.4007631, -1.1102539, -1.178829...",0.0,NONE,SHELDON,25


In [8]:
train_data, test_data = get_train_test_split(model_data, ['audio_feature', 'speaker_encode'], 'sarcasm', 'sarcasm_type')
fnn_train = train_data.copy()
fnn_test = test_data.copy()
fnn_train.reset_index(drop=True, inplace = True)
fnn_test.reset_index(drop=True, inplace = True)

        
fnn_train['averaged_audio_feature'] = fnn_train.loc[:, 'audio_feature']

fnn_test['averaged_audio_feature'] = fnn_test.loc[:, 'audio_feature']

fnn_train["sarcasm"] = fnn_train["sarcasm"].astype('int').to_numpy()
fnn_test["sarcasm"] = fnn_test["sarcasm"].astype('int').to_numpy()

Train:  (961, 2) (961,) Test:  ((241, 2), (241,))


In [9]:
fnn_train.at[0, "averaged_audio_feature"].shape

(1024,)

### Speaker InDependent

In [10]:
fnn_train_tensor = FNNTensorDataset(fnn_train[['averaged_audio_feature', 'sarcasm']], False)
fnn_test_tensor = FNNTensorDataset(fnn_test[['averaged_audio_feature', 'sarcasm']], False)

num_of_workers = 0
batch_size = 100
valid_size = 0.2

train_indices = list(range(len(fnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(fnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    fnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    fnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

In [11]:
model = FNNNetSID()
print(model)

FNNNetSID(
  (fc1): Linear(in_features=1024, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)


In [12]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
test_min_loss = np.inf

for epoch in range(501):
    model.train()
    train_loss = 0.0
    test_loss = 0.0
    for inputs, target in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)

    model.eval()
    for inputs, target in test_loader:
        inputs, target = inputs, target
        output = model(inputs)
        loss = loss_fn(output, target)
        test_loss += loss.item()*inputs.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)
    
    if(epoch%20 == 0):
        print("Epoch: " + str(epoch))
        test_loader_test = torch.utils.data.DataLoader(fnn_test_tensor, batch_size=fnn_test_tensor.__len__())
        predictions, actuals = predict_fnn(model, test_loader_test)
        print(classification_report(actuals[0].tolist(), predictions[0].tolist(), digits=4))

Epoch: 0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       120
           1     0.5021    1.0000    0.6685       121

    accuracy                         0.5021       241
   macro avg     0.2510    0.5000    0.3343       241
weighted avg     0.2521    0.5021    0.3356       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.5271    0.5667    0.5462       120
           1     0.5357    0.4959    0.5150       121

    accuracy                         0.5311       241
   macro avg     0.5314    0.5313    0.5306       241
weighted avg     0.5314    0.5311    0.5305       241

Epoch: 40
              precision    recall  f1-score   support

           0     0.5208    0.6250    0.5682       120
           1     0.5361    0.4298    0.4771       121

    accuracy                         0.5270       241
   macro avg     0.5285    0.5274    0.5226       241
weighted avg     0.5285    0.5270    0.5224  

Epoch: 500
              precision    recall  f1-score   support

           0     0.5545    0.5083    0.5304       120
           1     0.5496    0.5950    0.5714       121

    accuracy                         0.5519       241
   macro avg     0.5521    0.5517    0.5509       241
weighted avg     0.5521    0.5519    0.5510       241



### Speaker Dependent

In [13]:
fnn_train_tensor = FNNTensorDataset(fnn_train[['averaged_audio_feature', 'speaker_encode', 'sarcasm']], True)
fnn_test_tensor = FNNTensorDataset(fnn_test[['averaged_audio_feature', 'speaker_encode', 'sarcasm']], True)

num_of_workers = 0
batch_size = 100
valid_size = 0.2

train_indices = list(range(len(fnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(fnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    fnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    fnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

In [14]:
model = FNNNetSD()
print(model)

FNNNetSD(
  (fc1): Linear(in_features=1025, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)


In [15]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
test_min_loss = np.inf

for epoch in range(501):
    model.train()
    train_loss = 0.0
    test_loss = 0.0
    for inputs, target in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)

    model.eval()
    for inputs, target in test_loader:
        inputs, target = inputs, target
        output = model(inputs)
        loss = loss_fn(output, target)
        test_loss += loss.item()*inputs.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)
    
    if(epoch%20 == 0):
        print("Epoch: " + str(epoch))
        test_loader_test = torch.utils.data.DataLoader(fnn_test_tensor, batch_size=fnn_test_tensor.__len__())
        predictions, actuals = predict_fnn(model, test_loader_test)
        print(classification_report(actuals[0].tolist(), predictions[0].tolist(), digits=4))

Epoch: 0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       120
           1     0.5021    1.0000    0.6685       121

    accuracy                         0.5021       241
   macro avg     0.2510    0.5000    0.3343       241
weighted avg     0.2521    0.5021    0.3356       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.5323    0.8250    0.6471       120
           1     0.6182    0.2810    0.3864       121

    accuracy                         0.5519       241
   macro avg     0.5752    0.5530    0.5167       241
weighted avg     0.5754    0.5519    0.5162       241

Epoch: 40
              precision    recall  f1-score   support

           0     0.6053    0.3833    0.4694       120
           1     0.5515    0.7521    0.6364       121

    accuracy                         0.5685       241
   macro avg     0.5784    0.5677    0.5529       241
weighted avg     0.5783    0.5685    0.5532  

Epoch: 500
              precision    recall  f1-score   support

           0     0.6300    0.5250    0.5727       120
           1     0.5957    0.6942    0.6412       121

    accuracy                         0.6100       241
   macro avg     0.6129    0.6096    0.6070       241
weighted avg     0.6128    0.6100    0.6071       241

