In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.metrics import classification_report
from sklearn import preprocessing
import torch
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
y_train = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_labels.csv")
y_test = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_labels.csv")


In [3]:
with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_roberta_embeddings_target_context.pkl', 'rb') as f:
    x_train = pickle.load(f, encoding='latin1')

with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_roberta_embeddings_target_context.pkl', 'rb') as f:
    x_test = pickle.load(f, encoding='latin1')

In [4]:
x_train_vals = []
for sample in x_train["embeddings"]:
    x_train_vals.append(sample[0].tolist())
    
x_test_vals = []
for sample in x_test["embeddings"]:
    x_test_vals.append(sample[0].tolist())

x_train_df = pd.DataFrame({'embeddings':x_train_vals})
x_test_df = pd.DataFrame({'embeddings':x_test_vals})

x_train_df["sarcasm"] = y_train["sarcasm"]
x_test_df["sarcasm"] = y_test["sarcasm"]
x_train_df["sarcasm"] = x_train_df["sarcasm"].astype('int').to_numpy()
x_test_df["sarcasm"] = x_test_df["sarcasm"].astype('int').to_numpy()

In [5]:
class FNNTensorDataset(Dataset):
    def __init__(self, dataframe, speaker):
        self.data = dataframe
        self.speaker = speaker

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.speaker:
            features = self.data.loc[index, 'embeddings']
            features = np.array(features)
            a=np.empty((1,1))
            a.fill(self.data.loc[index, 'speaker'])
            a = a.reshape((-1,))
            final_features = np.hstack((features, a))
            label = self.data.loc[index, 'sarcasm']
            return torch.Tensor(final_features).float(), label
        else:
            features = self.data.loc[index, 'embeddings']
            label = self.data.loc[index, 'sarcasm']
            return torch.Tensor(features).float(), label
    
    def __getindexlist__(self):
        return list(self.data.index.values)

class FNNNetSID(nn.Module):
    def __init__(self):
        super(FNNNetSID, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(768, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)

    def forward(self, x):
        x = x.view(-1, 768)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x
    
class FNNNetSD(nn.Module):
    def __init__(self):
        super(FNNNetSD, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(769, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)

    def forward(self, x):
        x = x.view(-1, 769)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x
    
def predict_fnn(fnn_model, dataloader):
    prediction_list = []
    actual_list = []
    for data, target in dataloader:
        outputs = fnn_model(data)
        _, predicted = torch.max(outputs.data, 1) 
        prediction_list.append(predicted.cpu())
        actual_list.append(target)
    return prediction_list, actual_list

### Speaker Independent and Context dependent

In [10]:
fnn_train_tensor = FNNTensorDataset(x_train_df[['embeddings', 'sarcasm']], False)
fnn_test_tensor = FNNTensorDataset(x_test_df[['embeddings', 'sarcasm']], False)

num_of_workers = 0
batch_size = 100
valid_size = 0.2

train_indices = list(range(len(fnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(fnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    fnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    fnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

In [11]:
model = FNNNetSID()
print(model)

FNNNetSID(
  (fc1): Linear(in_features=768, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)


In [12]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
test_min_loss = np.inf

for epoch in range(201):
    model.train()
    train_loss = 0.0
    test_loss = 0.0
    for inputs, target in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target.long())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)

    model.eval()
    for inputs, target in test_loader:
        inputs, target = inputs, target
        output = model(inputs)
        loss = loss_fn(output, target)
        test_loss += loss.item()*inputs.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)
    
    if(epoch%20 == 0):
        print("Epoch: " + str(epoch))
        test_loader_test = torch.utils.data.DataLoader(fnn_test_tensor, batch_size=fnn_test_tensor.__len__())
        predictions, actuals = predict_fnn(model, test_loader_test)
        print(classification_report(actuals[0].tolist(), predictions[0].tolist(), digits=4))

Epoch: 0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       121
           1     0.4979    1.0000    0.6648       120

    accuracy                         0.4979       241
   macro avg     0.2490    0.5000    0.3324       241
weighted avg     0.2479    0.4979    0.3310       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.5419    0.9091    0.6790       121
           1     0.7105    0.2250    0.3418       120

    accuracy                         0.5685       241
   macro avg     0.6262    0.5670    0.5104       241
weighted avg     0.6258    0.5685    0.5111       241

Epoch: 40
              precision    recall  f1-score   support

           0     0.6210    0.6364    0.6286       121
           1     0.6239    0.6083    0.6160       120

    accuracy                         0.6224       241
   macro avg     0.6224    0.6223    0.6223       241
weighted avg     0.6224    0.6224    0.6223  

### Speaker Dependent and Context dependent

In [13]:
x_train_speakers = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_data.csv")
x_test_speakers = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_data.csv")
x_train_speakers

Unnamed: 0,target_,target_context,speaker
0,<s> i have been told it is a good way to move...,<s> i have been told it is a good way to move...,25
1,"<s> yeah, sure. you slept with your husband. ...","<s> yeah, sure. you slept with your husband. ...",1
2,<s> when are you coming home? </s>,<s> when are you coming home? okay. alright....,16
3,<s> riveting. </s>,<s> riveting. bingo. then i lifted the cushi...,0
4,"<s> no, this is just part of a daredevil game...","<s> no, this is just part of a daredevil game...",2
...,...,...,...
956,"<s> oh, that is sweet, but today is all about...","<s> oh, that is sweet, but today is all about...",7
957,<s> if you want to put a label on it. </s>,<s> if you want to put a label on it. you me...,24
958,<s> that you are an alcoholic? </s>,<s> that you are an alcoholic? i realized so...,3
959,<s> all i see is a yellow smudge. </s>,<s> all i see is a yellow smudge. now go bac...,15


In [14]:
x_train_df["speaker"] = x_train_speakers["speaker"]
x_test_df["speaker"] = x_test_speakers["speaker"]
x_train_df

Unnamed: 0,embeddings,sarcasm,speaker
0,"[-0.038043130189180374, 0.020484089851379395, ...",0,25
1,"[-0.048003457486629486, 0.009103945456445217, ...",0,1
2,"[-0.044101618230342865, 0.03517680615186691, -...",0,16
3,"[-0.027050701901316643, 0.022058051079511642, ...",1,0
4,"[-0.04363812506198883, 0.004367925692349672, -...",1,2
...,...,...,...
956,"[-0.03931039199233055, 0.037066489458084106, -...",0,7
957,"[-0.050667256116867065, 0.03380413353443146, -...",1,24
958,"[-0.048101410269737244, 0.0004800709430128336,...",1,3
959,"[-0.048232078552246094, 0.011562079191207886, ...",0,15


In [15]:
fnn_train_tensor = FNNTensorDataset(x_train_df[['embeddings', 'speaker', 'sarcasm']], True)
fnn_test_tensor = FNNTensorDataset(x_test_df[['embeddings', 'speaker', 'sarcasm']], True)

num_of_workers = 0
batch_size = 100
valid_size = 0.2

train_indices = list(range(len(fnn_train_tensor)))
np.random.shuffle(train_indices)

test_indices = list(range(len(fnn_test_tensor)))
np.random.shuffle(test_indices)

train_loader = torch.utils.data.DataLoader(
    fnn_train_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(train_indices)
)

test_loader = torch.utils.data.DataLoader(
    fnn_test_tensor, 
    batch_size=batch_size, 
    sampler=SubsetRandomSampler(test_indices)
)

In [16]:
model = FNNNetSD()
print(model)

FNNNetSD(
  (fc1): Linear(in_features=769, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)


In [17]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
test_min_loss = np.inf

for epoch in range(101):
    model.train()
    train_loss = 0.0
    test_loss = 0.0
    for inputs, target in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)

    model.eval()
    for inputs, target in test_loader:
        inputs, target = inputs, target
        output = model(inputs)
        loss = loss_fn(output, target)
        test_loss += loss.item()*inputs.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    test_loss = test_loss / len(test_loader.dataset)
    
    if(epoch%20 == 0):
        print("Epoch: " + str(epoch))
        test_loader_test = torch.utils.data.DataLoader(fnn_test_tensor, batch_size=fnn_test_tensor.__len__())
        predictions, actuals = predict_fnn(model, test_loader_test)
        print(classification_report(actuals[0].tolist(), predictions[0].tolist(), digits=4))

Epoch: 0
              precision    recall  f1-score   support

           0     0.5814    0.8264    0.6826       121
           1     0.6957    0.4000    0.5079       120

    accuracy                         0.6141       241
   macro avg     0.6385    0.6132    0.5953       241
weighted avg     0.6383    0.6141    0.5956       241

Epoch: 20
              precision    recall  f1-score   support

           0     0.6396    0.5868    0.6121       121
           1     0.6154    0.6667    0.6400       120

    accuracy                         0.6266       241
   macro avg     0.6275    0.6267    0.6260       241
weighted avg     0.6276    0.6266    0.6260       241

Epoch: 40
              precision    recall  f1-score   support

           0     0.6515    0.7107    0.6798       121
           1     0.6789    0.6167    0.6463       120

    accuracy                         0.6639       241
   macro avg     0.6652    0.6637    0.6631       241
weighted avg     0.6652    0.6639    0.6631  