In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [2]:
# Load the scene-level labels; get_model_data iterates this frame and reads its
# SCENE, Sarcasm, Sarcasm_Type and SPEAKER columns.
labels = pd.read_csv("../../data/scene_labels.csv")

In [3]:
# Preview the first five label rows to check the loaded columns.
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


#### Perform mean, median, max, min and sum pooling on audio feature data

In [4]:
def get_audio_mean_pool(audio) -> np.ndarray:
    """Mean-pool every feature matrix in ``audio`` along axis 1.

    ``audio`` is any iterable of 2-D array-likes; each one is reduced with
    ``np.mean(..., axis=1)`` and the per-item results are stacked with
    ``np.array`` in input order.
    """
    pooled_vectors = []
    for feature_matrix in audio:
        pooled_vectors.append(np.mean(feature_matrix, axis=1))
    return np.array(pooled_vectors)
    
def get_audio_median_pool(audio) -> np.ndarray:
    """Median-pool every feature matrix in ``audio`` along axis 1.

    Each item of ``audio`` is reduced with ``np.median(..., axis=1)`` and the
    per-item results are stacked with ``np.array`` in input order.
    """
    def _median_over_axis_one(feature_matrix):
        return np.median(feature_matrix, axis=1)

    return np.array(list(map(_median_over_axis_one, audio)))
    
def get_audio_max_pool(audio) -> np.ndarray:
    """Max-pool every feature matrix in ``audio`` along axis 1.

    Each item of ``audio`` is reduced with ``np.max(..., axis=1)`` and the
    per-item results are stacked with ``np.array`` in input order.
    """
    row_maxima = list(np.max(feature_matrix, axis=1) for feature_matrix in audio)
    return np.array(row_maxima)

def get_audio_min_pool(audio) -> np.ndarray:
    """Min-pool every feature matrix in ``audio`` along axis 1.

    Each item of ``audio`` is reduced with ``np.min(..., axis=1)`` and the
    per-item results are stacked with ``np.array`` in input order.
    """
    row_minima = []
    for feature_matrix in audio:
        row_minima += [np.min(feature_matrix, axis=1)]
    return np.array(row_minima)

def get_audio_sum_pool(audio) -> np.ndarray:
    """Sum-pool every feature matrix in ``audio`` along axis 1.

    Each item of ``audio`` is reduced with ``np.sum(..., axis=1)`` and the
    per-item results are stacked with ``np.array`` in input order.
    """
    row_totals = [
        np.sum(feature_matrix, axis=1)
        for feature_matrix in audio
    ]
    stacked = np.array(row_totals)
    return stacked

In [5]:
def get_model_data(context_audio_features, audio_features, labels_df=None):
    """Join the scene labels with their utterance and context audio features.

    Parameters
    ----------
    context_audio_features : mapping
        Looked up with the key ``<SCENE> + "_c.wav"`` for every label row.
    audio_features : mapping
        Looked up with the key ``<SCENE> + "_u.wav"`` for every label row.
    labels_df : pandas.DataFrame, optional
        Frame providing the ``SCENE``, ``Sarcasm``, ``Sarcasm_Type`` and
        ``SPEAKER`` columns. When omitted, the notebook-level ``labels`` frame
        is used, so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per label row (fresh 0..n-1 index) with the columns
        ``context_audio_feature``, ``audio_feature``, ``sarcasm``,
        ``sarcasm_type`` and ``speaker``.

    Raises
    ------
    KeyError
        If a scene has no entry in one of the feature mappings.
    """
    if labels_df is None:
        # Fall back to the notebook-global label table.
        labels_df = labels

    column_order = ['context_audio_feature', 'audio_feature', 'sarcasm', 'sarcasm_type', 'speaker']

    # Collect all rows first and build the frame once: DataFrame.append in a
    # loop copies the whole frame on every iteration and no longer exists in
    # pandas >= 2.0.
    records = [
        {
            'context_audio_feature': context_audio_features[scene + "_c.wav"],
            'audio_feature': audio_features[scene + "_u.wav"],
            'sarcasm': sarcasm,
            'sarcasm_type': sarcasm_type,
            'speaker': speaker,
        }
        for scene, sarcasm, sarcasm_type, speaker in zip(
            labels_df["SCENE"],
            labels_df["Sarcasm"],
            labels_df["Sarcasm_Type"],
            labels_df["SPEAKER"],
        )
    ]
    return pd.DataFrame(records, columns=column_order)

In [6]:
def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    """Split ``model_data`` 80/20 into train and test parts and print their shapes.

    The split is shuffled with a fixed ``random_state`` of 42 and stratified on
    ``model_data[stratify_column]``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where the X parts hold
        ``x_columns`` and the Y parts hold ``y_column``.
    """
    features = model_data[x_columns]
    target = model_data[y_column]
    strata = model_data[stratify_column]

    split_parts = train_test_split(
        features,
        target,
        train_size=0.8,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=strata,
    )
    X_train, X_test, Y_train, Y_test = split_parts

    # The test shapes are printed as one nested tuple, matching the recorded output.
    test_shapes = (X_test.shape, Y_test.shape)
    print("Train: ", X_train.shape, Y_train.shape, "Test: ", test_shapes)
    return X_train, X_test, Y_train, Y_test

def process_dataframes_pool(data):
    """Flatten the pooled feature lists of ``data`` into one wide feature table.

    ``context_audio_feature`` and ``audio_feature`` are each expanded into one
    column per list element. The two expansions are merged on the index, so
    overlapping column names get the merge suffixes ``_x`` (context) and ``_y``
    (utterance), and every resulting column is prefixed with ``feat_``.

    Any other column of ``data`` is present in both halves of the merge and
    therefore shows up twice, once with ``_x`` and once with ``_y``.
    """
    list_columns = ['audio_feature', 'context_audio_feature']

    def _expand(column_name):
        # One column per element of the pooled list, next to the remaining columns.
        expanded = data[column_name].apply(pd.Series)
        widened = pd.concat([data, expanded], axis=1)
        return widened.drop(columns=list_columns)

    utterance_wide = _expand('audio_feature')
    context_wide = _expand('context_audio_feature')
    merged = context_wide.merge(utterance_wide, left_index=True, right_index=True)
    return merged.add_prefix('feat_')

def get_pooled_data(X_train, X_test, pool_type):
    """Pool the audio feature matrices of both splits and flatten them to columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with an ``audio_feature`` and a ``context_audio_feature``
        column. The inputs are copied, not modified.
    pool_type : str
        One of ``"mean"``, ``"median"``, ``"max"``, ``"min"`` or ``"sum"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_features, test_features)`` as produced by
        ``process_dataframes_pool`` on the pooled copies.

    Raises
    ------
    ValueError
        If ``pool_type`` is not one of the supported names. Previously the
        function fell through and returned ``None``, which made the caller's
        tuple unpacking fail with an unrelated error.
    """
    poolers = {
        "mean": get_audio_mean_pool,
        "median": get_audio_median_pool,
        "max": get_audio_max_pool,
        "min": get_audio_min_pool,
        "sum": get_audio_sum_pool,
    }
    if pool_type not in poolers:
        raise ValueError(
            "Unknown pool_type {!r}; expected one of {}".format(pool_type, sorted(poolers))
        )
    pool = poolers[pool_type]

    def _pool_and_flatten(frame):
        # Replace each feature matrix by its pooled vector on a copy of the frame.
        pooled = frame.copy()
        for column in ("audio_feature", "context_audio_feature"):
            pooled[column] = pool(frame[column]).tolist()
        return process_dataframes_pool(pooled)

    return _pool_and_flatten(X_train), _pool_and_flatten(X_test)
    

In [7]:
def svm_classifier(X_train, X_test, Y_train, Y_test):
    """Grid-search an RBF SVM over ``C`` and evaluate the refit model on the test split.

    The search uses 10-fold cross-validation scored with ``f1_micro`` on the
    training data; the best configuration is refit and used to predict
    ``X_test``.

    Returns
    -------
    tuple
        ``(report, best_estimator)``: the text classification report for the
        test predictions (4 digits) and the refit best estimator.
    """
    c_candidates = [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
    base_svc = svm.SVC(
        kernel="rbf",
        gamma="scale",
        class_weight="balanced",
        random_state=0,
    )
    search = GridSearchCV(
        estimator=base_svc,
        param_grid={"C": c_candidates},
        scoring='f1_micro',
        cv=10,
        n_jobs=-1,
        refit=True,
    )
    search.fit(X_train, Y_train)

    test_predictions = search.predict(X_test)
    return classification_report(Y_test, test_predictions, digits=4), search.best_estimator_

### Librosa

In [8]:
# Load the pre-extracted librosa feature dictionaries for the utterances and
# for their contexts. get_model_data looks them up with "<SCENE>_u.wav" and
# "<SCENE>_c.wav" keys respectively.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only load feature files that come from a trusted source.
# NOTE(review): encoding='latin1' is presumably needed because the pickles were
# written under Python 2 -- confirm.
with open('../../audio_features/feat_dict_librosa_lld.pickle', 'rb') as f:
    librosa_audio_features = pickle.load(f, encoding='latin1')
with open('../../audio_features/feat_dict_context_librosa_lld.pickle', 'rb') as f:
    librosa_context_audio_features = pickle.load(f, encoding='latin1')

# Combine the labels with both feature dictionaries: one row per row of `labels`.
model_data = get_model_data(librosa_context_audio_features, librosa_audio_features)

In [9]:
# Encode each speaker name as an integer id; the speaker-dependent experiment
# passes `speaker_encode` to the classifier as an extra input column.
le = preprocessing.LabelEncoder()
model_data['speaker_encode'] = le.fit_transform(model_data['speaker'])
model_data.head(5)

Unnamed: 0,context_audio_feature,audio_feature,sarcasm,sarcasm_type,speaker,speaker_encode
0,"[[-637.1869506835938, -527.0845538153164, -543...","[[-386.6164855957031, -649.6673512776692, -633...",0.0,NONE,SHELDON,25
1,"[[-625.8624267578125, -403.14568843397984, -38...","[[-255.5221405029297, -484.69307309105284, -52...",0.0,NONE,PENNY,15
2,"[[-500.3988952636719, -396.36312383413315, -22...","[[-569.0548095703125, -381.4147456242488, -221...",0.0,NONE,RAJ,21
3,"[[-313.1777038574219, -228.85994769251624, -35...","[[-237.61074829101562, -211.002773845897, -382...",1.0,PRO,HOWARD,7
4,"[[-337.88116455078125, -420.55431980762665, -3...","[[-530.5701293945312, -374.83951552370763, -42...",0.0,NONE,SHELDON,25


### Speaker Independent

In [10]:
# Speaker-independent setup: only the two audio feature columns are used as
# inputs (no speaker id). The split is stratified on `sarcasm_type`.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['context_audio_feature', 'audio_feature'], 'sarcasm', 'sarcasm_type')
# Build one flattened feature table per pooling strategy from the same split.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (961, 2) (961,) Test:  ((241, 2), (241,))


In [11]:
# Display the mean-pooled training features of the speaker-independent split.
X_train_mean

Unnamed: 0,feat_0_x,feat_1_x,feat_2_x,feat_3_x,feat_4_x,feat_5_x,feat_6_x,feat_7_x,feat_8_x,feat_9_x,...,feat_680_y,feat_681_y,feat_682_y,feat_683_y,feat_684_y,feat_685_y,feat_686_y,feat_687_y,feat_688_y,feat_689_y
424,-491.387419,30.205155,-27.412366,5.604003,-13.323685,-6.865882,-20.542915,-4.678920,-7.905652,3.155878,...,1.125484e-06,-8.350791e-06,-2.448649e-05,-2.444950e-06,1.773051e-06,-5.933391e-06,5.577799e-06,2.768175e-06,-2.865543e-08,2457.913572
190,-432.786251,50.456066,-11.555711,16.138151,-9.621079,-3.223019,-3.271723,-0.723591,-15.082747,8.399972,...,6.155641e-08,-3.376856e-07,1.734986e-06,1.481730e-05,1.833685e-05,4.994186e-06,8.753401e-07,5.883414e-05,6.618425e-05,2204.617012
1080,-469.233189,57.092191,-18.906468,7.119017,-16.023651,-7.064552,-11.714972,-6.315686,-3.519953,1.053781,...,5.256801e-07,-4.159585e-08,-1.279666e-07,1.928197e-06,8.877141e-06,5.344956e-06,1.235294e-06,8.187448e-08,6.588997e-07,2035.251557
973,-453.350807,33.214758,-9.750560,4.209757,-8.547969,-1.476552,-3.163934,-5.989700,-2.412843,-0.146500,...,1.901608e-05,1.564018e-06,5.818885e-06,1.930007e-05,2.535673e-07,8.110663e-06,5.944234e-06,1.052672e-05,1.391345e-05,2209.364051
410,-521.862728,36.716217,-25.075117,-5.753345,-8.759412,-6.616770,-17.231237,-6.975125,-7.402556,-2.488970,...,9.596038e-06,1.483809e-05,8.396891e-07,7.671362e-07,7.251195e-07,-3.301392e-07,-3.308897e-07,-9.198965e-07,-1.308050e-07,2724.439690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,-722.788708,29.461742,-9.358579,2.220057,-7.642063,-4.480673,-3.979769,-3.328637,-3.726134,1.021782,...,7.754550e-07,7.467839e-07,1.149825e-07,6.097857e-07,7.645530e-07,1.985524e-06,9.483728e-08,4.420855e-07,-5.887439e-09,1881.235389
647,-524.698857,67.313351,-34.677084,5.744490,-14.416633,-17.488467,-15.380007,-11.692525,-8.962209,-5.476233,...,-3.690226e-08,-6.260465e-09,-3.353474e-08,-9.692826e-09,-9.518642e-09,-2.775267e-09,-6.361671e-09,-6.533792e-09,-7.563470e-09,1953.300301
711,-557.584545,41.227323,-17.090427,0.030986,-3.074235,-5.508230,-3.933226,-7.866933,-8.919098,-1.442423,...,8.801394e-06,4.258324e-06,-9.021307e-07,2.729060e-06,2.914306e-06,4.298751e-06,3.086096e-06,4.279348e-06,1.578884e-06,1826.415762
48,-466.616656,38.633176,-10.559482,4.132284,-12.130911,-8.026599,-9.757533,4.581836,-11.894246,-2.978326,...,8.225064e-08,5.200861e-07,7.529215e-08,1.695032e-08,9.834820e-09,1.669789e-07,4.496658e-07,9.583579e-07,1.025741e-05,2318.016493


In [12]:
# Grid-search and evaluate one SVM per pooling strategy. Each call returns
# (classification report text, best estimator) for the shared Y_train / Y_test.
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

In [13]:
# Print the classification report and the best estimator for every pooling
# strategy, with a "\n" spacer printed between consecutive sections.
report_sections = (
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
)
for section_index, (section_banner, section_report, section_estimator) in enumerate(report_sections):
    if section_index:
        print("\n")
    print(section_banner)
    print(section_report)
    print(section_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         0.0     0.6016    0.6167    0.6091       120
         1.0     0.6102    0.5950    0.6025       121

    accuracy                         0.6058       241
   macro avg     0.6059    0.6059    0.6058       241
weighted avg     0.6059    0.6058    0.6058       241

SVC(C=3, class_weight='balanced', random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         0.0     0.5893    0.5500    0.5690       120
         1.0     0.5814    0.6198    0.6000       121

    accuracy                         0.5851       241
   macro avg     0.5853    0.5849    0.5845       241
weighted avg     0.5853    0.5851    0.5845       241

SVC(C=1, class_weight='balanced', random_state=0)


********************************max report********************************
             

### Speaker Dependent

In [14]:
# Speaker-dependent setup: `speaker_encode` is added to the input columns and
# the split is stratified on `sarcasm`.
# Note: this reuses (and overwrites) the X_*/Y_* names of the
# speaker-independent experiment.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['context_audio_feature', 'audio_feature', 'speaker_encode'], 'sarcasm', 'sarcasm')
# Build one flattened feature table per pooling strategy from the same split.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (961, 3) (961,) Test:  ((241, 3), (241,))


In [15]:
# Display the mean-pooled training features of the speaker-dependent split.
X_train_mean

Unnamed: 0,feat_speaker_encode_x,feat_0_x,feat_1_x,feat_2_x,feat_3_x,feat_4_x,feat_5_x,feat_6_x,feat_7_x,feat_8_x,...,feat_680_y,feat_681_y,feat_682_y,feat_683_y,feat_684_y,feat_685_y,feat_686_y,feat_687_y,feat_688_y,feat_689_y
324,25,-506.672798,46.154263,-39.211877,13.523715,0.821942,-16.954055,-12.122244,-7.327957,-8.786229,...,5.007085e-09,1.532212e-10,-3.114050e-11,-1.249344e-09,-5.524266e-10,-4.393906e-11,-4.192109e-11,-4.314128e-11,-1.434404e-10,2514.293564
642,1,-565.886723,33.248333,-11.532116,2.689587,-4.614599,-7.403536,-10.708334,-8.096213,-11.006827,...,5.266067e-07,2.706749e-07,1.233596e-07,1.913343e-08,7.980824e-08,2.036104e-07,1.299349e-07,3.845283e-08,2.892322e-08,2720.597151
799,16,-618.150105,57.119137,-29.939492,6.748259,-9.540614,-15.023442,-3.246388,-9.056393,-6.913536,...,-1.628289e-10,6.016880e-10,-5.335005e-10,1.531447e-10,2.369769e-11,-1.373060e-10,-3.229824e-10,-8.908204e-10,1.700640e-10,1805.621151
581,0,-566.445730,61.147471,-23.738294,5.316752,7.590967,-2.952158,-12.334577,0.072482,-11.712783,...,1.181333e-07,1.427768e-07,1.822243e-07,1.777306e-07,2.573679e-07,2.792644e-07,2.595060e-07,1.272342e-07,2.191990e-07,1968.553605
718,2,-588.388707,46.635526,-14.493789,6.114791,-10.791523,-7.389698,-6.379806,-4.767111,-6.825133,...,1.784563e-05,1.005852e-05,2.323093e-05,1.582298e-05,1.579704e-05,2.709138e-05,7.804031e-05,2.842325e-05,1.140084e-05,2099.081642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,7,-526.558038,48.874100,-17.952136,10.690188,-8.937657,-11.189482,-12.513108,-11.275591,-11.511780,...,1.441415e-07,4.495046e-08,-1.342612e-07,-2.561603e-07,-2.188260e-07,-2.232353e-07,-4.022474e-08,-3.665159e-08,-2.721018e-08,2181.861249
939,24,-632.765117,59.722220,-35.491817,8.438597,-14.606148,-8.889788,-6.981102,-12.914771,-7.929532,...,2.717587e-07,9.716023e-08,2.525609e-07,3.294379e-07,4.831609e-08,3.236329e-08,4.769985e-07,6.273771e-07,1.918419e-08,1737.472835
1201,3,-535.175479,69.368147,-6.378856,2.857985,-12.734441,-4.675329,-4.091378,-5.301596,-10.900802,...,1.030835e-06,-1.142428e-06,-4.459561e-07,-5.881707e-07,-4.240471e-07,-2.867816e-06,-2.142010e-06,-5.976282e-07,-3.004520e-07,1249.896708
618,15,-522.744893,36.211814,-18.205056,6.475225,-15.690371,-8.670938,-5.495549,-7.829533,-11.639393,...,1.203774e-05,1.251057e-05,1.386756e-05,1.984167e-05,3.665304e-06,9.000139e-07,-6.361246e-07,4.504843e-06,9.575403e-07,2703.056200


In [16]:
# Grid-search and evaluate one SVM per pooling strategy on the speaker-dependent
# features. These assignments overwrite the reports and estimators of the
# speaker-independent run.
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

In [17]:
# Print the classification report and the best estimator for every pooling
# strategy, with a "\n" spacer printed between consecutive sections.
report_sections = (
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
)
for section_index, (section_banner, section_report, section_estimator) in enumerate(report_sections):
    if section_index:
        print("\n")
    print(section_banner)
    print(section_report)
    print(section_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         0.0     0.6522    0.6198    0.6356       121
         1.0     0.6349    0.6667    0.6504       120

    accuracy                         0.6432       241
   macro avg     0.6435    0.6433    0.6430       241
weighted avg     0.6436    0.6432    0.6430       241

SVC(C=3, class_weight='balanced', random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         0.0     0.6207    0.5950    0.6076       121
         1.0     0.6080    0.6333    0.6204       120

    accuracy                         0.6141       241
   macro avg     0.6143    0.6142    0.6140       241
weighted avg     0.6144    0.6141    0.6140       241

SVC(C=10, class_weight='balanced', random_state=0)


********************************max report********************************
            