In [1]:
import pandas as pd 
import pickle
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn import preprocessing

In [2]:
# Load the per-scene sarcasm annotations; get_model_data() reads this
# notebook-level `labels` frame when joining labels with audio features.
labels = pd.read_csv("../../data/scene_labels.csv")

In [3]:
# Preview the first five label rows.
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


#### Perform mean, median, max, min and sum pooling on audio feature data

In [4]:
def get_audio_mean_pool(audio) -> np.ndarray:
    """Mean-pool each clip's feature matrix along axis 1.

    Args:
        audio: Iterable of 2-D array-likes, one per audio clip
            (presumably shaped (n_features, n_frames) -- TODO confirm).

    Returns:
        Array with one entry per clip holding that clip's row-wise means.
    """
    pooled_clips = []
    for clip_features in audio:
        pooled_clips.append(np.mean(clip_features, axis=1))
    return np.array(pooled_clips)
    
def get_audio_median_pool(audio) -> np.ndarray:
    """Median-pool each clip's feature matrix along axis 1.

    Args:
        audio: Iterable of 2-D array-likes, one per audio clip
            (presumably shaped (n_features, n_frames) -- TODO confirm).

    Returns:
        Array with one entry per clip holding that clip's row-wise medians.
    """
    row_medians = map(lambda clip_features: np.median(clip_features, axis=1), audio)
    return np.array(list(row_medians))
    
def get_audio_max_pool(audio) -> np.ndarray:
    """Max-pool each clip's feature matrix along axis 1.

    Args:
        audio: Iterable of 2-D array-likes, one per audio clip
            (presumably shaped (n_features, n_frames) -- TODO confirm).

    Returns:
        Array with one entry per clip holding that clip's row-wise maxima.
    """
    def _row_max(clip_features):
        return np.max(clip_features, axis=1)

    pooled_clips = [_row_max(clip_features) for clip_features in audio]
    return np.array(pooled_clips)

def get_audio_min_pool(audio) -> np.ndarray:
    """Min-pool each clip's feature matrix along axis 1.

    Args:
        audio: Iterable of 2-D array-likes, one per audio clip
            (presumably shaped (n_features, n_frames) -- TODO confirm).

    Returns:
        Array with one entry per clip holding that clip's row-wise minima.
    """
    pooled_clips = list(np.min(clip_features, axis=1) for clip_features in audio)
    return np.array(pooled_clips)

def get_audio_sum_pool(audio) -> np.ndarray:
    """Sum-pool each clip's feature matrix along axis 1.

    Args:
        audio: Iterable of 2-D array-likes, one per audio clip
            (presumably shaped (n_features, n_frames) -- TODO confirm).

    Returns:
        Array with one entry per clip holding that clip's row-wise sums.
    """
    row_sums = []
    for clip_features in audio:
        clip_total = np.sum(clip_features, axis=1)
        row_sums.append(clip_total)
    return np.array(row_sums)

In [5]:
def get_model_data(audio_features, scene_labels=None):
    """Join the scene labels with their utterance-level audio features.

    The frame is built from a list of records in a single step. The previous
    implementation called ``DataFrame.append`` once per row, which copies the
    whole frame every iteration (quadratic) and no longer exists in pandas 2.x.

    Args:
        audio_features: Mapping from audio key (``"<SCENE>_u.wav"``) to the
            feature data extracted for that utterance.
        scene_labels: Optional DataFrame providing the "SCENE", "Sarcasm",
            "Sarcasm_Type" and "SPEAKER" columns. When omitted, the
            notebook-level ``labels`` frame is used, so existing calls such as
            ``get_model_data(features)`` behave as before.

    Returns:
        DataFrame with the columns ``audio_feature``, ``sarcasm``,
        ``sarcasm_type`` and ``speaker``; one row per label row, in label
        order, with a fresh 0..n-1 index.

    Raises:
        KeyError: If a labelled scene has no entry in ``audio_features``.
    """
    if scene_labels is None:
        scene_labels = labels  # fall back to the notebook-level label frame

    column_names = ['audio_feature', 'sarcasm', 'sarcasm_type', 'speaker']
    label_rows = zip(
        scene_labels["SCENE"],
        scene_labels["Sarcasm"],
        scene_labels["Sarcasm_Type"],
        scene_labels["SPEAKER"],
    )
    records = [
        {
            # Feature dict keys are the scene id plus the utterance suffix.
            'audio_feature': audio_features[scene + "_u.wav"],
            'sarcasm': sarcasm,
            'sarcasm_type': sarcasm_type,
            'speaker': speaker,
        }
        for scene, sarcasm, sarcasm_type, speaker in label_rows
    ]
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=column_names)

In [6]:
def get_train_test_split(model_data, x_column, y_column, stratify_column):
    """Drop the "NONE" and "LIK" rows, then make a stratified 80/20 split.

    Args:
        model_data: DataFrame holding feature and label columns.
        x_column: Column label(s) selected as model inputs.
        y_column: Name of the target column; rows whose value in this column
            is "NONE" or "LIK" are removed before splitting.
        stratify_column: Name of the column used to stratify the split.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.

    Side effects:
        Prints the shapes of the resulting train and test partitions.
    """
    target = model_data[y_column]
    keep_rows = (target != "NONE") & (target != "LIK")
    labelled = model_data[keep_rows]

    # Fixed seed so the split is reproducible between runs.
    split_options = dict(
        train_size=0.8,
        test_size=0.2,
        random_state=0,
        shuffle=True,
        stratify=labelled[stratify_column],
    )
    partitions = train_test_split(labelled[x_column], labelled[y_column], **split_options)
    X_train, X_test, Y_train, Y_test = partitions

    print("Train: ", X_train.shape, Y_train.shape, "Test: ", (X_test.shape, Y_test.shape))
    return X_train, X_test, Y_train, Y_test

def process_dataframes_pool(data):
    """Expand the pooled ``audio_feature`` column into one column per value.

    Args:
        data: DataFrame with an ``audio_feature`` column whose cells are
            sequences of pooled feature values; other columns are kept.

    Returns:
        New DataFrame without ``audio_feature``, with the expanded feature
        columns appended after the remaining columns and every column name
        prefixed with ``feat_``. The input frame is left untouched.
    """
    expanded_features = data["audio_feature"].apply(pd.Series)
    combined = pd.concat([data, expanded_features], axis=1)
    without_raw = combined.drop(columns=['audio_feature'])
    return without_raw.add_prefix('feat_')

def get_pooled_data(X_train, X_test, pool_type):
    """Pool the ``audio_feature`` column of both splits and expand it.

    Args:
        X_train: Training DataFrame containing an ``audio_feature`` column.
        X_test: Test DataFrame containing an ``audio_feature`` column.
        pool_type: One of "mean", "median", "max", "min" or "sum".

    Returns:
        Tuple ``(train_frame, test_frame)`` produced by
        ``process_dataframes_pool`` on copies of the inputs whose
        ``audio_feature`` column was replaced by the pooled values. The
        input frames are not modified.

    Raises:
        ValueError: If ``pool_type`` is not a supported pooling strategy.
            (Previously the function silently returned ``None``, which only
            surfaced later as an unpacking error in the caller.)
    """
    supported_pool_types = ("mean", "median", "max", "min", "sum")
    # Validate first so a typo fails fast with a clear message.
    if pool_type not in supported_pool_types:
        raise ValueError(
            f"Unsupported pool_type {pool_type!r}; expected one of {supported_pool_types}."
        )

    pool_function = {
        "mean": get_audio_mean_pool,
        "median": get_audio_median_pool,
        "max": get_audio_max_pool,
        "min": get_audio_min_pool,
        "sum": get_audio_sum_pool,
    }[pool_type]

    def _pool_split(split):
        # Work on a copy so the caller's frame keeps its raw features and can
        # be pooled again with a different strategy.
        pooled = split.copy()
        pooled["audio_feature"] = pool_function(split["audio_feature"]).tolist()
        return process_dataframes_pool(pooled)

    return _pool_split(X_train), _pool_split(X_test)

In [7]:
def svm_classifier(X_train, X_test, Y_train, Y_test):
    """Grid-search an RBF SVM over C and evaluate the refit model on the test set.

    Balanced class weights are computed once from ``Y_train`` and passed to
    the SVC as a fixed dict. The weights are derived from ``Y_train`` directly
    rather than from a hard-coded "sarcasm_type" column, so the function
    works for any target name or array-like.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        Y_train: Training labels.
        Y_test: Test labels.

    Returns:
        Tuple ``(report, best_estimator)``: the text classification report
        for the test-set predictions and the refit best SVC.

    Side effects:
        Prints the class-weight dict.
    """
    classes = np.unique(Y_train)
    weights = compute_class_weight("balanced", classes=classes, y=Y_train)
    class_weight = dict(zip(classes, weights))
    print(class_weight)

    svm_clf = svm.SVC(random_state=0, kernel="rbf", gamma="scale", class_weight=class_weight)
    # Candidate regularisation strengths explored by the grid search.
    C = [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
    gscv_clf = GridSearchCV(
        estimator=svm_clf,
        param_grid=dict(C=C),
        n_jobs=-1,
        cv=10,
        scoring='f1_weighted',
        refit=True)

    gscv_clf.fit(X_train, Y_train)
    # refit=True: predict() uses the best estimator retrained on all of X_train.
    Y_test_pred = gscv_clf.predict(X_test)
    report = classification_report(Y_test, Y_test_pred)
    return report, gscv_clf.best_estimator_

### Librosa

In [8]:
# Load the pre-computed librosa feature dictionary; get_model_data() looks
# entries up by "<SCENE>_u.wav" keys.
# NOTE(review): pickle.load can execute arbitrary code -- only load feature
# dumps that come from a trusted source.
with open('../../audio_features/feat_dict_librosa_lld.pickle', 'rb') as f:
    librosa_audio_features = pickle.load(f, encoding='latin1')
    
# Join the labels with their audio features into one modelling frame.
model_data = get_model_data(librosa_audio_features)

In [9]:
# Map each speaker name to an integer code stored in `speaker_encode`.
# NOTE(review): these codes are arbitrary integers, so a distance-based model
# such as the RBF SVM will treat them as ordinal -- consider one-hot encoding.
le = preprocessing.LabelEncoder()
model_data['speaker_encode'] = le.fit_transform(model_data['speaker'])
model_data.head(5)

Unnamed: 0,audio_feature,sarcasm,sarcasm_type,speaker,speaker_encode
0,"[[-386.6164855957031, -649.6673512776692, -633...",0.0,NONE,SHELDON,25
1,"[[-255.5221405029297, -484.69307309105284, -52...",0.0,NONE,PENNY,15
2,"[[-569.0548095703125, -381.4147456242488, -221...",0.0,NONE,RAJ,21
3,"[[-237.61074829101562, -211.002773845897, -382...",1.0,PRO,HOWARD,7
4,"[[-530.5701293945312, -374.83951552370763, -42...",0.0,NONE,SHELDON,25


### Speaker Independent

In [10]:
# Speaker-independent setup: only the audio features are model inputs.
# The split targets and stratifies on sarcasm_type; get_train_test_split drops
# the "NONE" and "LIK" rows before splitting.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['audio_feature'], 'sarcasm_type', 'sarcasm_type')
# Build one pooled train/test feature pair per pooling strategy.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (478, 1) (478,) Test:  ((120, 1), (120,))


In [11]:
# Inspect the mean-pooled training features (one feat_* column per value).
X_train_mean

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_680,feat_681,feat_682,feat_683,feat_684,feat_685,feat_686,feat_687,feat_688,feat_689
958,-369.351871,91.254959,-27.379227,18.191076,-4.789608,-7.801251,-6.451133,2.833261,-3.444577,7.773839,...,1.779793e-05,-5.644049e-06,-2.281017e-05,-7.371887e-06,-3.191250e-06,1.667960e-05,1.208145e-05,1.312270e-05,1.776264e-05,1782.744959
202,-495.675627,59.551065,-20.761832,2.463188,-7.607014,1.502851,-11.113201,3.539889,-14.341005,1.449337,...,1.202403e-06,1.885561e-07,2.332944e-06,7.759040e-07,2.582447e-06,1.680403e-06,2.156706e-09,2.042830e-06,8.282749e-06,1751.437062
682,-535.365411,63.200451,-12.870674,3.025093,-6.761493,-11.449728,-6.086660,-5.562491,-16.128709,-0.356241,...,1.996847e-08,-2.545700e-08,-1.552495e-08,2.520681e-08,-6.182463e-08,-7.405121e-10,1.424300e-08,7.301724e-08,4.308089e-08,2007.142911
194,-405.503566,64.153635,-25.050759,16.500235,-21.582463,-9.272239,-9.394320,-2.923790,-21.772381,4.787057,...,-2.098489e-05,-1.403024e-05,2.955682e-06,4.132535e-06,-6.073916e-06,-9.611974e-07,-3.149529e-06,-4.346540e-06,-6.391744e-06,2028.466167
900,-323.004289,78.224168,-14.782346,20.668334,-1.690239,0.336689,-11.012535,-10.321979,-15.713233,-6.917664,...,-1.381945e-06,-6.843335e-05,-4.792991e-05,-8.316555e-06,-3.735689e-05,-1.473786e-04,-2.383466e-03,-1.091554e-03,-3.314700e-05,1908.701238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,-481.760038,34.439554,-25.271098,0.690394,-13.593833,-14.584145,-8.424248,-13.193332,-14.872142,2.135144,...,-1.938303e-04,-7.420840e-05,-2.244490e-04,-4.480754e-04,-3.742732e-04,-3.474449e-04,-4.195189e-04,-2.095974e-04,-2.609573e-04,2722.748783
438,-549.143421,22.699838,-28.601302,-8.092967,-23.170595,1.609487,-1.672145,-3.816946,-10.176296,-5.658785,...,1.750270e-07,8.334630e-07,3.383938e-07,5.043198e-07,-6.990797e-08,-2.671723e-08,-5.918899e-08,-5.624387e-08,-7.403888e-08,2545.743937
360,-535.827895,57.926660,-26.572901,13.809408,-20.541574,-12.534308,-20.489499,0.673617,-16.524239,-11.804054,...,-1.498952e-06,-4.371936e-07,-3.325493e-07,3.662553e-07,7.750654e-08,-9.355444e-07,-4.605985e-06,-2.588148e-06,-3.972060e-07,1967.934957
364,-503.175404,72.958792,-14.206524,3.430940,-2.234581,0.002384,-5.310516,-11.065383,-12.215080,3.054173,...,-2.368018e-09,-2.143062e-09,-4.371254e-09,-7.882857e-08,-6.372154e-08,-8.034233e-10,-5.406834e-10,-2.595908e-08,-7.114213e-09,1924.303525


In [12]:
# Grid-search an SVM for each pooling strategy. Every call prints the class
# weights and returns the test-set report plus the best estimator.
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}


In [13]:
# Print the classification report and best estimator for every pooling
# strategy, separated by a blank-line marker between sections.
report_sections = (
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
)
for section_number, (banner, report_text, best_estimator) in enumerate(report_sections):
    if section_number > 0:
        print("\n")
    print(banner)
    print(report_text)
    print(best_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         EMB       0.17      0.71      0.28        17
         ILL       0.00      0.00      0.00        36
         PRO       0.56      0.42      0.48        67

    accuracy                           0.33       120
   macro avg       0.24      0.37      0.25       120
weighted avg       0.34      0.33      0.31       120

SVC(C=0.5,
    class_weight={'EMB': 2.276190476190476, 'ILL': 1.1220657276995305,
                  'PRO': 0.5989974937343359},
    random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         EMB       0.21      0.76      0.33        17
         ILL       0.50      0.28      0.36        36
         PRO       0.63      0.36      0.46        67

    accuracy                           0.39       120
   macro avg       0.45      0.47      0.38

### Speaker Dependent

In [14]:
# Speaker-dependent setup: the encoded speaker id is passed alongside the
# audio features. The split targets and stratifies on sarcasm_type;
# get_train_test_split drops the "NONE" and "LIK" rows before splitting.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['audio_feature', 'speaker_encode'], 'sarcasm_type', 'sarcasm_type')
# Build one pooled train/test feature pair per pooling strategy.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (478, 2) (478,) Test:  ((120, 2), (120,))


In [15]:
# Inspect the mean-pooled training features, now including the speaker code.
X_train_mean

Unnamed: 0,feat_speaker_encode,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_680,feat_681,feat_682,feat_683,feat_684,feat_685,feat_686,feat_687,feat_688,feat_689
958,2,-369.351871,91.254959,-27.379227,18.191076,-4.789608,-7.801251,-6.451133,2.833261,-3.444577,...,1.779793e-05,-5.644049e-06,-2.281017e-05,-7.371887e-06,-3.191250e-06,1.667960e-05,1.208145e-05,1.312270e-05,1.776264e-05,1782.744959
202,15,-495.675627,59.551065,-20.761832,2.463188,-7.607014,1.502851,-11.113201,3.539889,-14.341005,...,1.202403e-06,1.885561e-07,2.332944e-06,7.759040e-07,2.582447e-06,1.680403e-06,2.156706e-09,2.042830e-06,8.282749e-06,1751.437062
682,7,-535.365411,63.200451,-12.870674,3.025093,-6.761493,-11.449728,-6.086660,-5.562491,-16.128709,...,1.996847e-08,-2.545700e-08,-1.552495e-08,2.520681e-08,-6.182463e-08,-7.405121e-10,1.424300e-08,7.301724e-08,4.308089e-08,2007.142911
194,25,-405.503566,64.153635,-25.050759,16.500235,-21.582463,-9.272239,-9.394320,-2.923790,-21.772381,...,-2.098489e-05,-1.403024e-05,2.955682e-06,4.132535e-06,-6.073916e-06,-9.611974e-07,-3.149529e-06,-4.346540e-06,-6.391744e-06,2028.466167
900,4,-323.004289,78.224168,-14.782346,20.668334,-1.690239,0.336689,-11.012535,-10.321979,-15.713233,...,-1.381945e-06,-6.843335e-05,-4.792991e-05,-8.316555e-06,-3.735689e-05,-1.473786e-04,-2.383466e-03,-1.091554e-03,-3.314700e-05,1908.701238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,11,-481.760038,34.439554,-25.271098,0.690394,-13.593833,-14.584145,-8.424248,-13.193332,-14.872142,...,-1.938303e-04,-7.420840e-05,-2.244490e-04,-4.480754e-04,-3.742732e-04,-3.474449e-04,-4.195189e-04,-2.095974e-04,-2.609573e-04,2722.748783
438,15,-549.143421,22.699838,-28.601302,-8.092967,-23.170595,1.609487,-1.672145,-3.816946,-10.176296,...,1.750270e-07,8.334630e-07,3.383938e-07,5.043198e-07,-6.990797e-08,-2.671723e-08,-5.918899e-08,-5.624387e-08,-7.403888e-08,2545.743937
360,9,-535.827895,57.926660,-26.572901,13.809408,-20.541574,-12.534308,-20.489499,0.673617,-16.524239,...,-1.498952e-06,-4.371936e-07,-3.325493e-07,3.662553e-07,7.750654e-08,-9.355444e-07,-4.605985e-06,-2.588148e-06,-3.972060e-07,1967.934957
364,14,-503.175404,72.958792,-14.206524,3.430940,-2.234581,0.002384,-5.310516,-11.065383,-12.215080,...,-2.368018e-09,-2.143062e-09,-4.371254e-09,-7.882857e-08,-6.372154e-08,-8.034233e-10,-5.406834e-10,-2.595908e-08,-7.114213e-09,1924.303525


In [16]:
# Grid-search an SVM for each pooling strategy on the speaker-dependent
# features. Every call prints the class weights and returns the test-set
# report plus the best estimator.
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}
{'EMB': 2.276190476190476, 'ILL': 1.1220657276995305, 'PRO': 0.5989974937343359}


In [17]:
# Print the classification report and best estimator for every pooling
# strategy, separated by a blank-line marker between sections.
report_sections = (
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
)
for section_number, (banner, report_text, best_estimator) in enumerate(report_sections):
    if section_number > 0:
        print("\n")
    print(banner)
    print(report_text)
    print(best_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         EMB       0.17      0.71      0.28        17
         ILL       0.00      0.00      0.00        36
         PRO       0.56      0.42      0.48        67

    accuracy                           0.33       120
   macro avg       0.24      0.37      0.25       120
weighted avg       0.34      0.33      0.31       120

SVC(C=0.5,
    class_weight={'EMB': 2.276190476190476, 'ILL': 1.1220657276995305,
                  'PRO': 0.5989974937343359},
    random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         EMB       0.21      0.76      0.33        17
         ILL       0.48      0.28      0.35        36
         PRO       0.63      0.36      0.46        67

    accuracy                           0.39       120
   macro avg       0.44      0.47      0.38