In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import torch

In [2]:
def svm_classifier(X_train, X_test, Y_train, Y_test):
    svm_clf = svm.SVC(random_state=0, kernel = "rbf", gamma = "scale", class_weight = "balanced")
    C = C = [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
    gscv_clf = GridSearchCV(
        estimator=svm_clf, 
        param_grid=dict(C=C),
        n_jobs=-1, 
        cv = 10, 
        scoring = 'f1_weighted', 
        refit = True)

    gscv_clf.fit(X_train, Y_train)
    Y_test_pred = gscv_clf.predict(X_test)
    report = classification_report(Y_test, Y_test_pred, digits=4)
    return report, gscv_clf.best_estimator_

In [3]:
y_train = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_labels.csv")
y_test = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_labels.csv")


In [5]:
with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_roberta_embeddings_target_.pkl', 'rb') as f:
    x_train = pickle.load(f, encoding='latin1')

with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_roberta_embeddings_target_.pkl', 'rb') as f:
    x_test = pickle.load(f, encoding='latin1')

In [6]:
x_train_vals = []
for sample in x_train["embeddings"]:
    x_train_vals.append(sample[0].tolist())
    
x_test_vals = []
for sample in x_test["embeddings"]:
    x_test_vals.append(sample[0].tolist())

x_train_df = pd.DataFrame({'embeddings':x_train_vals})
x_test_df = pd.DataFrame({'embeddings':x_test_vals})

def process_dataframes(data):
    temp_concat = pd.concat([data, data.embeddings.apply(pd.Series)], axis=1)
    temp_concat.drop(columns=['embeddings'], inplace = True)
    return temp_concat.add_prefix('feat_')

x_train_df = process_dataframes(x_train_df)
x_test_df = process_dataframes(x_test_df)

In [7]:
x_train_df.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_758,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767
0,-0.032825,0.024251,-0.053847,-0.06085,0.062177,0.021897,-0.024974,0.062814,0.004985,0.011356,...,0.039643,-0.015554,-0.007881,-0.065008,0.091693,0.092131,0.045042,0.016046,0.015124,-0.016107
1,-0.058609,0.024145,-0.057527,-0.058993,0.058775,0.021409,-0.02364,0.072696,0.017599,0.007511,...,0.045519,-0.010732,-0.027528,-0.061475,0.10336,0.09464,0.060435,0.023012,0.022691,-0.0089
2,-0.042612,0.032693,-0.063451,-0.042959,0.051935,0.019185,-0.045627,0.07038,0.022928,0.017556,...,0.040271,-0.024351,-0.03022,-0.064117,0.117595,0.097762,0.053058,0.051895,0.001435,-0.009274
3,-0.046511,0.027672,-0.039181,-0.048538,0.053328,0.004649,-0.05429,0.054852,0.015167,0.009534,...,0.050531,-0.030233,-0.015655,-0.070434,0.091203,0.100987,0.045941,0.067512,-0.013537,0.001858
4,-0.036844,0.012853,-0.062388,-0.056663,0.053127,-0.002635,-0.029347,0.095471,0.021105,0.015301,...,0.057115,-0.00923,-0.021183,-0.081909,0.091972,0.096955,0.059666,0.03797,-0.010764,0.000462


### Speaker Independent and Context Independent

In [8]:
report, best_est = svm_classifier(x_train_df, x_test_df, y_train, y_test)

In [9]:
best_est

In [10]:
print(report)

              precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000       121
         1.0     0.4979    1.0000    0.6648       120

    accuracy                         0.4979       241
   macro avg     0.2490    0.5000    0.3324       241
weighted avg     0.2479    0.4979    0.3310       241



### Speaker Dependent and Context Independent

In [11]:
x_train_speakers = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_data.csv")
x_test_speakers = pd.read_csv("/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_data.csv")
x_train_speakers

Unnamed: 0,target_,target_context,speaker
0,<s> i have been told it is a good way to move...,<s> i have been told it is a good way to move...,25
1,"<s> yeah, sure. you slept with your husband. ...","<s> yeah, sure. you slept with your husband. ...",1
2,<s> when are you coming home? </s>,<s> when are you coming home? okay. alright....,16
3,<s> riveting. </s>,<s> riveting. bingo. then i lifted the cushi...,0
4,"<s> no, this is just part of a daredevil game...","<s> no, this is just part of a daredevil game...",2
...,...,...,...
956,"<s> oh, that is sweet, but today is all about...","<s> oh, that is sweet, but today is all about...",7
957,<s> if you want to put a label on it. </s>,<s> if you want to put a label on it. you me...,24
958,<s> that you are an alcoholic? </s>,<s> that you are an alcoholic? i realized so...,3
959,<s> all i see is a yellow smudge. </s>,<s> all i see is a yellow smudge. now go bac...,15


In [12]:
x_train_df["speaker"] = x_train_speakers["speaker"]
x_test_df["speaker"] = x_test_speakers["speaker"]
x_train_df

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767,speaker
0,-0.032825,0.024251,-0.053847,-0.060850,0.062177,0.021897,-0.024974,0.062814,0.004985,0.011356,...,-0.015554,-0.007881,-0.065008,0.091693,0.092131,0.045042,0.016046,0.015124,-0.016107,25
1,-0.058609,0.024145,-0.057527,-0.058993,0.058775,0.021409,-0.023640,0.072696,0.017599,0.007511,...,-0.010732,-0.027528,-0.061475,0.103360,0.094640,0.060435,0.023012,0.022691,-0.008900,1
2,-0.042612,0.032693,-0.063451,-0.042959,0.051935,0.019185,-0.045627,0.070380,0.022928,0.017556,...,-0.024351,-0.030220,-0.064117,0.117595,0.097762,0.053058,0.051895,0.001435,-0.009274,16
3,-0.046511,0.027672,-0.039181,-0.048538,0.053328,0.004649,-0.054290,0.054852,0.015167,0.009534,...,-0.030233,-0.015655,-0.070434,0.091203,0.100987,0.045941,0.067512,-0.013537,0.001858,0
4,-0.036844,0.012853,-0.062388,-0.056663,0.053127,-0.002635,-0.029347,0.095471,0.021105,0.015301,...,-0.009230,-0.021183,-0.081909,0.091972,0.096955,0.059666,0.037970,-0.010764,0.000462,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,-0.040808,0.037226,-0.053446,-0.044131,0.069197,0.010806,-0.034296,0.071188,0.016049,0.011235,...,-0.007758,-0.007453,-0.061591,0.121767,0.099865,0.047019,0.035760,0.001817,-0.009310,7
957,-0.056173,0.027654,-0.066472,-0.046955,0.065946,0.019450,-0.030465,0.061375,0.008863,0.009817,...,-0.012963,-0.020338,-0.064424,0.108703,0.094731,0.036609,0.037237,0.010994,-0.015129,24
958,-0.042268,0.024428,-0.065457,-0.045282,0.058501,0.004446,-0.048650,0.073093,0.017978,0.010898,...,-0.023594,-0.035485,-0.076856,0.110092,0.097804,0.045815,0.050381,0.010365,-0.009330,3
959,-0.048291,0.024253,-0.058090,-0.049777,0.065684,0.012749,-0.029067,0.065140,0.017709,0.003354,...,-0.025283,-0.004741,-0.071955,0.094485,0.091972,0.044412,0.026763,0.005267,-0.023925,15


In [13]:
report, best_est = svm_classifier(x_train_df, x_test_df, y_train, y_test)
best_est

In [14]:
print(report)

              precision    recall  f1-score   support

         0.0     0.6273    0.5702    0.5974       121
         1.0     0.6031    0.6583    0.6295       120

    accuracy                         0.6141       241
   macro avg     0.6152    0.6143    0.6134       241
weighted avg     0.6152    0.6141    0.6134       241



### Speaker Independent and Context dependent

In [15]:
with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/train_roberta_embeddings_target_context.pkl', 'rb') as f:
    x_train_context = pickle.load(f, encoding='latin1')

with open('/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/lexical/text_feature_extraction/Roberta/embeddings/test_roberta_embeddings_target_context.pkl', 'rb') as f:
    x_test_context = pickle.load(f, encoding='latin1')

In [16]:
x_train_vals = []
for sample in x_train_context["embeddings"]:
    x_train_vals.append(sample[0].tolist())
    
x_test_vals = []
for sample in x_test_context["embeddings"]:
    x_test_vals.append(sample[0].tolist())

x_train_df = pd.DataFrame({'embeddings':x_train_vals})
x_test_df = pd.DataFrame({'embeddings':x_test_vals})

def process_dataframes(data):
    temp_concat = pd.concat([data, data.embeddings.apply(pd.Series)], axis=1)
    temp_concat.drop(columns=['embeddings'], inplace = True)
    return temp_concat.add_prefix('feat_')

x_train_df = process_dataframes(x_train_df)
x_test_df = process_dataframes(x_test_df)

In [17]:
report, best_est = svm_classifier(x_train_df, x_test_df, y_train.values.ravel(), y_test.values.ravel())

In [18]:
best_est

In [19]:
print(report)

              precision    recall  f1-score   support

         0.0     0.5556    0.2479    0.3429       121
         1.0     0.5134    0.8000    0.6254       120

    accuracy                         0.5228       241
   macro avg     0.5345    0.5240    0.4841       241
weighted avg     0.5345    0.5228    0.4835       241



### Speaker Dependent and Context Dependent

In [20]:
x_train_df["speaker"] = x_train_speakers["speaker"]
x_test_df["speaker"] = x_test_speakers["speaker"]
x_train_df

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767,speaker
0,-0.038043,0.020484,-0.037934,-0.047533,0.041384,-0.021605,-0.025485,0.056009,0.020886,-0.007872,...,-0.033682,0.017938,-0.046704,0.046065,0.076831,0.061433,0.048966,0.005408,-0.020410,25
1,-0.048003,0.009104,-0.058173,-0.055861,0.043636,-0.012225,-0.030814,0.078927,0.009098,0.000673,...,-0.024686,-0.018508,-0.053180,0.060910,0.086797,0.079986,0.025053,0.008293,-0.012534,1
2,-0.044102,0.035177,-0.058733,-0.057955,0.032190,-0.004664,-0.047953,0.070250,0.026520,0.009233,...,-0.022925,-0.034780,-0.049423,0.084519,0.086156,0.062470,0.059223,-0.004788,0.018165,16
3,-0.027051,0.022058,-0.036427,-0.048455,0.033961,-0.030353,-0.045193,0.036105,0.038044,-0.000136,...,-0.031423,-0.000003,-0.049954,0.040303,0.072793,0.067036,0.066101,-0.009949,-0.006590,0
4,-0.043638,0.004368,-0.065374,-0.061146,0.054605,-0.002144,-0.026990,0.094215,0.027731,0.010567,...,-0.000973,-0.027652,-0.085327,0.091328,0.095861,0.067118,0.046059,-0.016599,0.012356,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,-0.039310,0.037066,-0.046571,-0.065260,0.051698,-0.016767,-0.031291,0.076648,0.015499,0.004263,...,-0.015319,-0.000511,-0.049920,0.071715,0.074563,0.076162,0.042678,0.012004,-0.010996,7
957,-0.050667,0.033804,-0.058418,-0.056176,0.054804,-0.012460,-0.033632,0.075970,0.010548,0.015176,...,-0.020443,-0.012083,-0.055914,0.056376,0.088470,0.062911,0.035794,0.006294,-0.014431,24
958,-0.048101,0.000480,-0.064807,-0.035729,0.062656,-0.016133,-0.025848,0.072533,0.009934,-0.011529,...,-0.030411,-0.024485,-0.055541,0.046072,0.077137,0.052030,0.028238,0.019283,-0.009146,3
959,-0.048232,0.011562,-0.062602,-0.058484,0.043577,-0.018654,-0.039850,0.071719,0.015226,-0.000606,...,-0.019766,-0.022871,-0.050284,0.049105,0.075946,0.081527,0.021215,-0.000873,-0.000177,15


In [21]:
report, best_est = svm_classifier(x_train_df, x_test_df, y_train.values.ravel(), y_test.values.ravel())
best_est

In [22]:
print(report)

              precision    recall  f1-score   support

         0.0     0.6273    0.5702    0.5974       121
         1.0     0.6031    0.6583    0.6295       120

    accuracy                         0.6141       241
   macro avg     0.6152    0.6143    0.6134       241
weighted avg     0.6152    0.6141    0.6134       241



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu