In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import torch

In [2]:
def svm_classifier(X_train, X_test, Y_train, Y_test):
    """Tune an RBF-kernel SVM over ``C`` with 10-fold CV and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices used for fitting and for the held-out evaluation.
    Y_train, Y_test : array-like
        Labels. A single-column 2-D input (for example a one-column
        DataFrame) is flattened to 1-D; any other input is passed through
        unchanged.

    Returns
    -------
    report : str
        Output of ``classification_report`` on the test split (4 digits).
    best_estimator : sklearn.svm.SVC
        ``best_estimator_`` of the grid search, i.e. the SVC refit on the
        whole of ``X_train`` with the best-scoring ``C``.
    """
    def _as_label_vector(y):
        # scikit-learn expects 1-D targets. A (n, 1) column vector only works
        # through an implicit conversion that raises DataConversionWarning,
        # so flatten that one shape explicitly and leave everything else alone.
        y_arr = np.asarray(y)
        if y_arr.ndim == 2 and y_arr.shape[1] == 1:
            return y_arr.ravel()
        return y

    Y_train = _as_label_vector(Y_train)
    Y_test = _as_label_vector(Y_test)

    svm_clf = svm.SVC(random_state=0, kernel="rbf", gamma="scale", class_weight="balanced")

    # Candidate regularisation strengths, from 1e-4 up to 10.
    C = [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
    gscv_clf = GridSearchCV(
        estimator=svm_clf,
        param_grid=dict(C=C),
        n_jobs=-1,
        cv=10,
        scoring='f1_weighted',
        refit=True)

    gscv_clf.fit(X_train, Y_train)

    # With refit=True, predict() delegates to the best estimator refit on all
    # of X_train, so the report below describes the returned model.
    Y_test_pred = gscv_clf.predict(X_test)
    report = classification_report(Y_test, Y_test_pred, digits=4)
    return report, gscv_clf.best_estimator_

In [3]:
# Label files for the train and test splits.
# NOTE(review): later cells call .values.ravel() on these frames, so each file
# presumably holds a single label column — confirm.
train_labels_csv = "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/train_labels.csv"
test_labels_csv = "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/test_labels.csv"

y_train, y_test = pd.read_csv(train_labels_csv), pd.read_csv(test_labels_csv)


In [4]:
# Pickled embeddings of the target utterance only (per the file names), one file per split.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
train_target_pkl = '/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/train_emoberta_embeddings_target_.pkl'
test_target_pkl = '/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/test_emoberta_embeddings_target_.pkl'

with open(train_target_pkl, 'rb') as train_fh:
    x_train = pickle.load(train_fh, encoding='latin1')

with open(test_target_pkl, 'rb') as test_fh:
    x_test = pickle.load(test_fh, encoding='latin1')

In [5]:
# Take element [0] of every stored embedding and turn it into a plain Python
# list, giving one list per sample.
x_train_vals = [sample[0].tolist() for sample in x_train["embeddings"]]
x_test_vals = [sample[0].tolist() for sample in x_test["embeddings"]]

# Single-column frames; each cell holds one sample's embedding list.
x_train_df = pd.DataFrame({'embeddings': x_train_vals})
x_test_df = pd.DataFrame({'embeddings': x_test_vals})

def process_dataframes(data):
    """Expand the list-valued ``embeddings`` column into ``feat_<i>`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``embeddings`` column whose cells are sequences of
        numbers (presumably all of the same length). The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` other than ``embeddings``, followed by one
        column per embedding position; every column name is prefixed with
        ``feat_``. The row index of ``data`` is preserved.
    """
    # Building the frame from the list of rows in a single call avoids creating
    # one pd.Series per row, which is what ``.apply(pd.Series)`` does.
    expanded = pd.DataFrame(data["embeddings"].tolist(), index=data.index)
    remaining = data.drop(columns=["embeddings"])
    return pd.concat([remaining, expanded], axis=1).add_prefix("feat_")

# Replace the single list-valued column with one feat_<i> column per position.
# Note: the names are rebound, so this cell cannot be re-run without first
# re-running the statements that build the 'embeddings' frames.
x_train_df, x_test_df = process_dataframes(x_train_df), process_dataframes(x_test_df)

In [6]:
# Preview the first rows of the expanded training feature frame.
x_train_df.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_758,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767
0,-0.346661,-0.076316,-0.176457,-0.430842,0.133689,0.069174,0.162834,-0.316667,0.685901,0.066745,...,-0.177601,0.071811,0.426316,0.393841,0.417978,0.104095,-0.450451,0.052605,-0.092888,-0.107823
1,-0.211392,-0.113207,-0.149064,-0.718321,0.149626,-0.030859,-0.398602,-0.755846,0.409559,0.227644,...,-0.151625,0.682756,0.130624,0.516527,-0.091031,0.005343,0.122272,0.255122,0.013893,-0.075256
2,-0.130957,-0.344759,-0.049595,-0.384732,0.192853,-0.511049,0.21824,0.072904,0.326533,0.097631,...,-0.060887,0.568969,-0.035156,0.448589,0.097846,0.080323,-0.422726,0.010065,0.076732,0.074437
3,-0.166356,0.177973,-0.197496,-0.417468,0.009837,-0.160716,-0.434448,-0.526498,0.64225,-0.146033,...,-0.108388,0.250758,0.211999,0.724478,0.361014,0.227494,-0.208437,0.449303,-0.062656,0.356658
4,-0.144698,-0.36029,-0.123821,-0.626001,-0.146996,-0.377608,-0.096727,-0.18514,0.437847,0.149392,...,0.001158,0.205276,0.113454,0.134896,0.1428,-0.065591,-0.144461,-0.065818,0.097677,-0.109731


### Speaker Independent and Context Independent

In [7]:
# Pass the labels as flat 1-D arrays, matching the context-dependent runs
# further down, instead of handing scikit-learn a one-column DataFrame.
report, best_est = svm_classifier(x_train_df, x_test_df, y_train.values.ravel(), y_test.values.ravel())

In [8]:
# Display the refit best estimator selected by the grid search over C.
best_est

In [9]:
# Print the classification report computed on the test split.
print(report)

              precision    recall  f1-score   support

         0.0     0.5840    0.6033    0.5935       121
         1.0     0.5862    0.5667    0.5763       120

    accuracy                         0.5851       241
   macro avg     0.5851    0.5850    0.5849       241
weighted avg     0.5851    0.5851    0.5849       241



### Speaker Dependent and Context Independent

In [10]:
# Per-utterance data for each split; the `speaker` column is used below.
train_data_csv = "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/train_data.csv"
test_data_csv = "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/test_data.csv"

x_train_speakers, x_test_speakers = pd.read_csv(train_data_csv), pd.read_csv(test_data_csv)
x_train_speakers

Unnamed: 0,target_,target_context,speaker
0,<s> i have been told it is a good way to move...,<s> i have been told it is a good way to move...,25
1,"<s> yeah, sure. you slept with your husband. ...","<s> yeah, sure. you slept with your husband. ...",1
2,<s> when are you coming home? </s>,<s> when are you coming home? okay. alright....,16
3,<s> riveting. </s>,<s> riveting. bingo. then i lifted the cushi...,0
4,"<s> no, this is just part of a daredevil game...","<s> no, this is just part of a daredevil game...",2
...,...,...,...
956,"<s> oh, that is sweet, but today is all about...","<s> oh, that is sweet, but today is all about...",7
957,<s> if you want to put a label on it. </s>,<s> if you want to put a label on it. you me...,24
958,<s> that you are an alcoholic? </s>,<s> that you are an alcoholic? i realized so...,3
959,<s> all i see is a yellow smudge. </s>,<s> all i see is a yellow smudge. now go bac...,15


In [11]:
# Append the speaker id as one extra feature column on both splits (in place).
# Assignment aligns on the row index of the two frames.
# NOTE(review): `speaker` looks like an integer id fed to the SVM as a numeric
# feature; if it is categorical, one-hot encoding may be more appropriate — confirm.
for features, metadata in ((x_train_df, x_train_speakers), (x_test_df, x_test_speakers)):
    features["speaker"] = metadata["speaker"]
x_train_df

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767,speaker
0,-0.346661,-0.076316,-0.176457,-0.430842,0.133689,0.069174,0.162834,-0.316667,0.685901,0.066745,...,0.071811,0.426316,0.393841,0.417978,0.104095,-0.450451,0.052605,-0.092888,-0.107823,25
1,-0.211392,-0.113207,-0.149064,-0.718321,0.149626,-0.030859,-0.398602,-0.755846,0.409559,0.227644,...,0.682756,0.130624,0.516527,-0.091031,0.005343,0.122272,0.255122,0.013893,-0.075256,1
2,-0.130957,-0.344759,-0.049595,-0.384732,0.192853,-0.511049,0.218240,0.072904,0.326533,0.097631,...,0.568969,-0.035156,0.448589,0.097846,0.080323,-0.422726,0.010065,0.076732,0.074437,16
3,-0.166356,0.177973,-0.197496,-0.417468,0.009837,-0.160716,-0.434448,-0.526498,0.642250,-0.146033,...,0.250758,0.211999,0.724478,0.361014,0.227494,-0.208437,0.449303,-0.062656,0.356658,0
4,-0.144698,-0.360290,-0.123821,-0.626001,-0.146996,-0.377608,-0.096727,-0.185140,0.437847,0.149392,...,0.205276,0.113454,0.134896,0.142800,-0.065591,-0.144461,-0.065818,0.097677,-0.109731,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,0.105888,0.100209,-0.053107,-0.519480,0.287733,0.153177,-0.062070,-0.301622,0.466676,0.199987,...,0.217009,0.255100,0.300264,0.248019,0.092152,-0.073072,-0.013160,-0.019867,0.131380,7
957,-0.081320,-0.063892,-0.166051,-0.409307,-0.285387,-0.208739,-0.105344,-0.510182,0.453736,-0.352903,...,0.202251,-0.066086,0.495700,0.105306,0.194420,-0.259736,0.530509,-0.057561,-0.011852,24
958,-0.123531,-0.132446,-0.101091,-0.686636,-0.035489,-0.223620,0.190708,-0.076541,0.147838,0.057696,...,0.761178,-0.089701,0.299700,-0.267968,-0.008222,-0.567253,0.195762,0.011469,-0.356796,3
959,0.003655,-0.216856,-0.378327,-0.563901,0.132323,-0.456657,-0.129780,-0.321604,0.082073,-0.140326,...,0.072743,0.036093,0.347542,-0.491985,0.044802,-0.406558,-0.036060,0.086675,-0.218487,15


In [12]:
# Pass the labels as flat 1-D arrays, matching the context-dependent runs
# further down, instead of handing scikit-learn a one-column DataFrame.
report, best_est = svm_classifier(x_train_df, x_test_df, y_train.values.ravel(), y_test.values.ravel())
best_est

In [13]:
# Print the classification report for the run with the speaker column added.
print(report)

              precision    recall  f1-score   support

         0.0     0.6852    0.6116    0.6463       121
         1.0     0.6466    0.7167    0.6798       120

    accuracy                         0.6639       241
   macro avg     0.6659    0.6641    0.6631       241
weighted avg     0.6660    0.6639    0.6630       241



### Speaker Independent and Context Dependent

In [14]:
# Pickled embeddings of the target utterance plus its context (per the file names).
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
train_context_pkl = '/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/train_emoberta_embeddings_target_context.pkl'
test_context_pkl = '/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Emotion Roberta/embeddings/test_emoberta_embeddings_target_context.pkl'

with open(train_context_pkl, 'rb') as train_fh:
    x_train_context = pickle.load(train_fh, encoding='latin1')

with open(test_context_pkl, 'rb') as test_fh:
    x_test_context = pickle.load(test_fh, encoding='latin1')

In [15]:
# Take element [0] of every stored context embedding and turn it into a plain
# Python list, giving one list per sample.
x_train_vals = [sample[0].tolist() for sample in x_train_context["embeddings"]]
x_test_vals = [sample[0].tolist() for sample in x_test_context["embeddings"]]

# Rebinds x_train_df / x_test_df to the context-based embeddings.
x_train_df = pd.DataFrame({'embeddings': x_train_vals})
x_test_df = pd.DataFrame({'embeddings': x_test_vals})

# NOTE(review): this repeats the `process_dataframes` definition from the
# earlier embedding-processing cell; one shared definition would be enough.
def process_dataframes(data):
    """Expand the list-valued ``embeddings`` column into ``feat_<i>`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``embeddings`` column whose cells are sequences of
        numbers (presumably all of the same length). The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` other than ``embeddings``, followed by one
        column per embedding position; every column name is prefixed with
        ``feat_``. The row index of ``data`` is preserved.
    """
    # Building the frame from the list of rows in a single call avoids creating
    # one pd.Series per row, which is what ``.apply(pd.Series)`` does.
    expanded = pd.DataFrame(data["embeddings"].tolist(), index=data.index)
    remaining = data.drop(columns=["embeddings"])
    return pd.concat([remaining, expanded], axis=1).add_prefix("feat_")

# Replace the single list-valued column with one feat_<i> column per position.
# Note: the names are rebound, so this cell cannot be re-run without first
# re-running the statements that build the 'embeddings' frames.
x_train_df, x_test_df = process_dataframes(x_train_df), process_dataframes(x_test_df)

In [16]:
# Flatten the label frames to 1-D arrays before fitting on the context embeddings.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()
report, best_est = svm_classifier(x_train_df, x_test_df, train_labels, test_labels)

In [17]:
# Display the refit best estimator selected by the grid search over C.
best_est

In [18]:
# Print the classification report for the context-embedding run.
print(report)

              precision    recall  f1-score   support

         0.0     0.6250    0.6198    0.6224       121
         1.0     0.6198    0.6250    0.6224       120

    accuracy                         0.6224       241
   macro avg     0.6224    0.6224    0.6224       241
weighted avg     0.6224    0.6224    0.6224       241



### Speaker Dependent and Context Dependent

In [19]:
# Append the speaker id as one extra feature column on both context-based
# frames (in place). Assignment aligns on the row index of the two frames.
# NOTE(review): `speaker` looks like an integer id fed to the SVM as a numeric
# feature; if it is categorical, one-hot encoding may be more appropriate — confirm.
for features, metadata in ((x_train_df, x_train_speakers), (x_test_df, x_test_speakers)):
    features["speaker"] = metadata["speaker"]
x_train_df

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767,speaker
0,-0.260339,-0.001590,-0.106586,-0.676144,-0.056974,-0.265580,0.284544,0.005275,0.410756,0.016236,...,0.180100,0.252734,0.529407,0.184306,-0.137758,-0.459916,-0.170638,0.016497,0.009023,25
1,-0.134949,-0.114905,-0.102827,-0.633639,-0.098503,-0.320460,0.012300,-0.390006,0.096913,0.000132,...,0.489561,0.117410,0.546330,-0.219387,-0.194568,-0.079066,-0.086963,0.178945,-0.227929,1
2,-0.217401,-0.259364,-0.071771,-0.751696,0.072838,-0.386390,-0.077681,-0.160109,0.353239,-0.230072,...,0.472851,0.014619,0.513739,-0.061896,-0.079626,-0.160311,-0.244905,-0.000189,0.054380,16
3,0.019844,-0.093364,-0.102180,-0.508913,-0.119105,-0.061720,0.341775,-0.056822,0.204588,-0.145091,...,0.267042,-0.150327,0.492823,0.250624,-0.015410,-0.195509,-0.097871,0.067725,-0.136071,0
4,-0.155321,-0.299884,-0.195379,-0.620971,-0.181744,-0.381488,-0.021245,-0.115612,0.400439,0.029108,...,0.163647,0.016251,0.246408,0.059884,-0.110360,-0.226482,-0.082159,0.085994,-0.098782,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,-0.147050,0.103499,0.013003,-0.674521,0.067728,-0.110955,0.039080,-0.274449,0.460904,0.075730,...,0.437590,0.116509,0.451178,0.059624,-0.058985,-0.167596,-0.156237,-0.120912,0.068402,7
957,0.030477,0.020006,-0.235825,-0.668431,-0.319798,-0.045076,0.159677,-0.008670,0.256615,-0.055519,...,0.477176,-0.029886,0.105006,-0.093603,0.252784,0.138410,0.120047,0.082827,-0.146047,24
958,0.098432,-0.297345,-0.181105,-0.639331,0.087709,-0.507514,0.399132,0.006550,0.151426,-0.078753,...,0.470522,-0.272250,0.491799,-0.074762,-0.179157,-0.329647,0.054555,0.098914,-0.212182,3
959,-0.213635,-0.292421,-0.203788,-0.575084,-0.075586,-0.717352,-0.067712,-0.233745,0.238681,-0.169930,...,0.314694,-0.160910,0.544496,-0.108063,-0.175749,-0.270244,-0.258349,0.058116,-0.152623,15


In [20]:
# Flatten the label frames to 1-D arrays, fit on context embeddings plus the
# speaker column, then display the selected estimator.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()
report, best_est = svm_classifier(x_train_df, x_test_df, train_labels, test_labels)
best_est

In [21]:
# Print the classification report for context embeddings plus the speaker column.
print(report)

              precision    recall  f1-score   support

         0.0     0.6549    0.6116    0.6325       121
         1.0     0.6328    0.6750    0.6532       120

    accuracy                         0.6432       241
   macro avg     0.6438    0.6433    0.6429       241
weighted avg     0.6439    0.6432    0.6428       241

