In [29]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn import preprocessing

from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [30]:
# Root folder of the project data. It can be overridden with the
# SARCASM_DATA_DIR environment variable so the notebook runs on other machines;
# the default is the original author's local path.
DATA_DIR = os.environ.get(
    "SARCASM_DATA_DIR",
    "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/data",
)
# Label table with one row per scene (SCENE, KEY, SPEAKER, SHOW, Sarcasm, Sarcasm_Type).
labels = pd.read_csv(os.path.join(DATA_DIR, "scene_labels.csv"))

In [31]:
# Preview the first five rows of the label table.
labels.head(5)

Unnamed: 0,SCENE,KEY,SPEAKER,SHOW,Sarcasm,Sarcasm_Type
0,1_10004,1_10004_u,SHELDON,BBT,0.0,NONE
1,1_10009,1_10009_u,PENNY,BBT,0.0,NONE
2,1_1001,1_1001_u,RAJ,BBT,0.0,NONE
3,1_1003,1_1003_u,HOWARD,BBT,1.0,PRO
4,1_10190,1_10190_u,SHELDON,BBT,0.0,NONE


In [32]:
# Distinct scene identifiers, in order of first appearance in the label table.
scenes = list(labels["SCENE"].unique())

In [33]:
# Number of distinct scenes found in the label table.
len(scenes)

1202

In [34]:
# Folder holding the pre-extracted feature files. The data root can be
# overridden with the SARCASM_DATA_DIR environment variable; the default is the
# original author's local path.
parent_dir = os.path.join(
    os.environ.get(
        "SARCASM_DATA_DIR",
        "/Users/yoshithaakunuri/Documents/CSCI535/Project/Final/data",
    ),
    "features",
)


def _load_pooled_visual_features(feature_dir, scene, suffix=""):
    """Load one ``resnet_pool5_<scene><suffix>.npy`` file and average it over axes 2 and 3.

    Parameters
    ----------
    feature_dir : str
        Directory that contains the ``.npy`` file.
    scene : str
        Scene identifier used to build the file name.
    suffix : str
        Text inserted between the scene identifier and ``.npy``.

    Returns
    -------
    (features, error)
        ``features`` is the pooled array reshaped to ``(n, 2048)``. When
        anything goes wrong (missing file, unexpected shape, ...) it is a
        ``(15, 2048)`` zero array and ``error`` is the exception that was
        caught; otherwise ``error`` is ``None``.
    """
    try:
        npy_path = os.path.join(feature_dir, "resnet_pool5_" + scene + suffix + ".npy")
        raw = np.load(npy_path)
        # Global average pooling
        pooled = np.apply_over_axes(np.mean, raw, [2, 3])
        return np.reshape(pooled, (pooled.shape[0], 2048)), None
    except Exception as err:
        # Best-effort fallback kept from the original code, but no longer a
        # bare ``except`` so KeyboardInterrupt / SystemExit still propagate.
        # NOTE(review): 15 is presumably the expected number of frames - confirm.
        return np.zeros((15, 2048)), err


visual_features = {}
context_visual_features = {}
# (scene, kind) -> exception, for every file that was replaced by zeros.
feature_load_failures = {}
for scene in scenes:
    # Visual Features
    visual_features[scene], load_error = _load_pooled_visual_features(
        os.path.join(parent_dir, "utterances_final"), scene)
    if load_error is not None:
        feature_load_failures[(scene, "utterance")] = load_error

    # Visual Features - Context
    context_visual_features[scene], load_error = _load_pooled_visual_features(
        os.path.join(parent_dir, "context_final"), scene, "_c")
    if load_error is not None:
        feature_load_failures[(scene, "context")] = load_error

# Make the zero-filled fallbacks visible instead of silently training on them.
if feature_load_failures:
    print("Zero-filled {} feature file(s) that could not be loaded "
          "(see feature_load_failures)".format(len(feature_load_failures)))

In [35]:
def get_model_data(context_video_features, video_features):
    """Build the modelling table with one row per row of the global ``labels`` frame.

    Parameters
    ----------
    context_video_features : mapping
        Scene identifier -> context feature array.
    video_features : mapping
        Scene identifier -> utterance feature array.

    Returns
    -------
    pandas.DataFrame
        Columns ``context_video_feature``, ``video_feature``, ``sarcasm``,
        ``sarcasm_type`` and ``speaker``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a scene in ``labels`` is missing from either mapping.
    """
    columns = ['context_video_feature', 'video_feature', 'sarcasm', 'sarcasm_type', 'speaker']
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    records = [
        {
            'context_video_feature': context_video_features[row["SCENE"]],
            'video_feature': video_features[row["SCENE"]],
            'sarcasm': row["Sarcasm"],
            'sarcasm_type': row["Sarcasm_Type"],
            'speaker': row["SPEAKER"],
        }
        for _, row in labels.iterrows()
    ]
    # Passing ``columns`` keeps the expected schema even when ``labels`` is empty.
    return pd.DataFrame(records, columns=columns)

In [36]:
def get_video_mean_pool(video) -> np.ndarray:
    """Return an array holding the axis-0 mean of every item in ``video``."""
    pooled = []
    for frames in video:
        pooled.append(np.mean(frames, axis=0))
    return np.array(pooled)
    
def get_video_median_pool(video) -> np.ndarray:
    """Return an array holding the axis-0 median of every item in ``video``."""
    pooled = []
    for frames in video:
        pooled.append(np.median(frames, axis=0))
    return np.array(pooled)
    
def get_video_max_pool(video) -> np.ndarray:
    """Return an array holding the axis-0 maximum of every item in ``video``."""
    pooled = []
    for frames in video:
        pooled.append(np.max(frames, axis=0))
    return np.array(pooled)

def get_video_min_pool(video) -> np.ndarray:
    """Return an array holding the axis-0 minimum of every item in ``video``."""
    pooled = []
    for frames in video:
        pooled.append(np.min(frames, axis=0))
    return np.array(pooled)

def get_video_sum_pool(video) -> np.ndarray:
    """Return an array holding the axis-0 sum of every item in ``video``."""
    pooled = []
    for frames in video:
        pooled.append(np.sum(frames, axis=0))
    return np.array(pooled)

In [37]:
def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    """Split ``model_data`` 80/20 into train and test parts.

    The split is shuffled with ``random_state=42`` and stratified on
    ``model_data[stratify_column]``. The shapes of the four parts and the type
    of the training features are printed.

    Returns
    -------
    (X_train, X_test, Y_train, Y_test)
        Features come from ``x_columns``, targets from ``y_column``.
    """
    features = model_data[x_columns]
    target = model_data[y_column]
    strata = model_data[stratify_column]

    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=0.8,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=strata,
    )

    test_shapes = (test_x.shape, test_y.shape)
    print("Train: ", train_x.shape, train_y.shape, "Test: ", test_shapes)
    print(type(train_x))
    return train_x, test_x, train_y, test_y

def process_dataframes_pool(data):
    """Expand the two pooled feature columns of ``data`` into one column per value.

    Each cell of ``video_feature`` / ``context_video_feature`` is spread into
    its own columns, the two original columns are dropped, and the context and
    utterance tables are joined on the index. Overlapping column names get the
    merge suffixes ``_x`` (context) and ``_y`` (utterance), and every column is
    finally prefixed with ``feat_``.

    Note: any other column of ``data`` is present in both tables before the
    join, so it also comes out twice (``feat_<name>_x`` and ``feat_<name>_y``).
    """
    feature_columns = ['video_feature', 'context_video_feature']

    utterance_wide = pd.concat(
        [data, data['video_feature'].apply(pd.Series)], axis=1
    ).drop(columns=feature_columns)
    context_wide = pd.concat(
        [data, data['context_video_feature'].apply(pd.Series)], axis=1
    ).drop(columns=feature_columns)

    # Context is the left side of the join, so it receives the ``_x`` suffix.
    joined = context_wide.merge(utterance_wide, left_index=True, right_index=True)
    return joined.add_prefix('feat_')
    
def get_pooled_data(X_train, X_test, pool_type):
    """Pool the per-scene feature arrays of both splits and flatten them to columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with ``video_feature`` and ``context_video_feature`` columns
        whose cells hold one feature array per scene.
    pool_type : str
        One of ``"mean"``, ``"median"``, ``"max"``, ``"min"`` or ``"sum"``.

    Returns
    -------
    (train, test)
        The two frames after pooling and ``process_dataframes_pool``. The
        inputs are not modified.

    Raises
    ------
    ValueError
        If ``pool_type`` is not a supported name (previously this returned
        ``None``, which only failed later when the caller unpacked the result).
    """
    pool_functions = {
        "mean": get_video_mean_pool,
        "median": get_video_median_pool,
        "max": get_video_max_pool,
        "min": get_video_min_pool,
        "sum": get_video_sum_pool,
    }
    if pool_type not in pool_functions:
        raise ValueError(
            "Unknown pool_type {!r}; expected one of {}".format(pool_type, sorted(pool_functions))
        )
    pool = pool_functions[pool_type]

    def _pool_split(split):
        # Work on a copy so the caller's frame keeps its raw feature arrays.
        pooled = split.copy()
        for column in ("video_feature", "context_video_feature"):
            pooled[column] = pool(split[column]).tolist()
        return process_dataframes_pool(pooled)

    return _pool_split(X_train), _pool_split(X_test)
    

In [38]:
def svm_classifier(X_train, X_test, Y_train, Y_test):
    """Tune an RBF SVM over ``C`` with 10-fold CV and evaluate it on the test split.

    The grid search scores candidates with ``f1_micro`` on the training data and
    refits the best one; that refitted model predicts ``X_test``.

    Returns
    -------
    (report, best_estimator)
        The text ``classification_report`` (4 digits) for the test split and
        the refitted best estimator.
    """
    c_grid = [0.0001, 0.0003, 0.0005, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
    base_svc = svm.SVC(random_state=0, kernel="rbf", gamma="scale", class_weight="balanced")

    search = GridSearchCV(
        estimator=base_svc,
        param_grid={"C": c_grid},
        n_jobs=-1,
        cv=10,
        scoring='f1_micro',
        refit=True,
    )
    search.fit(X_train, Y_train)

    test_predictions = search.predict(X_test)
    test_report = classification_report(Y_test, test_predictions, digits=4)
    return test_report, search.best_estimator_

In [39]:
# Silence every warning from here on (this filter is process-wide and stays
# active for all later cells).
warnings.filterwarnings("ignore")

# One row per label row, carrying the context/utterance feature arrays.
model_data = get_model_data(context_visual_features, visual_features)
# Label-encode the speaker name into an integer code stored in 'speaker_encode'.
le = preprocessing.LabelEncoder()
model_data['speaker_encode'] = le.fit_transform(model_data['speaker'])
# Preview the first five rows.
model_data.head(5)

Unnamed: 0,context_video_feature,video_feature,sarcasm,sarcasm_type,speaker,speaker_encode
0,"[[0.30626756, 0.66872513, 0.28155187, 0.312841...","[[0.13835047, 0.2704592, 0.44648886, 0.1415337...",0.0,NONE,SHELDON,25
1,"[[0.17222668, 0.7350801, 0.30408216, 0.3297085...","[[0.46479157, 0.1813915, 0.22123067, 0.5245148...",0.0,NONE,PENNY,15
2,"[[0.048892517, 0.23698963, 0.4193368, 0.341263...","[[0.253619, 0.25664786, 0.6646118, 0.4821793, ...",0.0,NONE,RAJ,21
3,"[[0.5546928, 0.36914897, 0.8243424, 0.18547274...","[[0.55624646, 0.16990338, 0.62457716, 0.209021...",1.0,PRO,HOWARD,7
4,"[[0.73847526, 1.0405933, 0.11508263, 0.9313859...","[[0.6140023, 0.4846397, 0.79425097, 0.13518682...",0.0,NONE,SHELDON,25


### Speaker Independent

In [40]:
# Speaker-independent setup: only the two visual feature columns are used as
# inputs; the target is 'sarcasm' and the split is stratified on 'sarcasm_type'.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['context_video_feature', 'video_feature'], 'sarcasm', 'sarcasm_type')
# One flattened train/test pair per pooling strategy.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (961, 2) (961,) Test:  ((241, 2), (241,))
<class 'pandas.core.frame.DataFrame'>


In [41]:
# Preview the flattened mean-pooled training features (context columns end in
# _x, utterance columns in _y).
X_train_mean.head()

Unnamed: 0,feat_0_x,feat_1_x,feat_2_x,feat_3_x,feat_4_x,feat_5_x,feat_6_x,feat_7_x,feat_8_x,feat_9_x,...,feat_2038_y,feat_2039_y,feat_2040_y,feat_2041_y,feat_2042_y,feat_2043_y,feat_2044_y,feat_2045_y,feat_2046_y,feat_2047_y
424,0.359086,0.57387,0.522864,0.396681,0.533292,0.485872,0.522722,0.473802,0.448652,0.405195,...,0.407312,0.451004,0.436169,0.403903,0.458166,0.186549,0.437864,0.346711,0.424549,0.356524
190,0.355151,0.576513,0.495811,0.418642,0.559866,0.472464,0.505025,0.457916,0.420476,0.420158,...,0.393004,0.448692,0.426229,0.393386,0.479226,0.199675,0.470453,0.318255,0.464625,0.323267
1080,0.369109,0.561941,0.476888,0.40867,0.549181,0.463585,0.471377,0.493767,0.439472,0.387979,...,0.361899,0.44532,0.422796,0.388583,0.439197,0.181434,0.459565,0.320772,0.446543,0.327752
973,0.3628,0.527821,0.499812,0.405818,0.537153,0.497216,0.490736,0.493988,0.411164,0.4031,...,0.403726,0.408884,0.413097,0.378066,0.485424,0.18004,0.440938,0.335303,0.41697,0.322297
410,0.373813,0.586645,0.490355,0.393859,0.547158,0.473,0.499915,0.499821,0.440584,0.410531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Fit and evaluate one grid-searched SVM per pooling strategy on the
# speaker-independent features; each call returns (report, best estimator).
print("Processing Mean-pooled data")
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
print("Processing Median-pooled data")
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
print("Processing Max-pooled data")
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
print("Processing Min-pooled data")
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
print("Processing Sum-pooled data")
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

Processing Mean-pooled data
Processing Median-pooled data
Processing Max-pooled data
Processing Min-pooled data
Processing Sum-pooled data


In [43]:
# Print every pooling strategy's test report followed by its selected
# estimator, with a blank-line separator between consecutive sections.
report_sections = [
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
]
for section_index, (banner, report_text, best_estimator) in enumerate(report_sections):
    if section_index > 0:
        print("\n")
    print(banner)
    print(report_text)
    print(best_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         0.0     0.5727    0.5250    0.5478       120
         1.0     0.5649    0.6116    0.5873       121

    accuracy                         0.5685       241
   macro avg     0.5688    0.5683    0.5676       241
weighted avg     0.5688    0.5685    0.5676       241

SVC(C=10, class_weight='balanced', random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         0.0     0.6071    0.5667    0.5862       120
         1.0     0.5969    0.6364    0.6160       121

    accuracy                         0.6017       241
   macro avg     0.6020    0.6015    0.6011       241
weighted avg     0.6020    0.6017    0.6012       241

SVC(C=3, class_weight='balanced', random_state=0)


********************************max report********************************
            

### Speaker Dependent

In [44]:
# Speaker-dependent setup: the encoded speaker is added to the input columns.
# This rebinds X_train/X_test/Y_train/Y_test and the pooled frames from the
# speaker-independent section.
# NOTE(review): this split is stratified on 'sarcasm', whereas the
# speaker-independent split is stratified on 'sarcasm_type', so the two
# experiments are evaluated on different train/test partitions - confirm this
# is intended before comparing their scores.
X_train, X_test, Y_train, Y_test = get_train_test_split(model_data, ['context_video_feature', 'video_feature', 'speaker_encode'], 'sarcasm', 'sarcasm')
# One flattened train/test pair per pooling strategy.
X_train_mean, X_test_mean = get_pooled_data(X_train, X_test, "mean")
X_train_median, X_test_median = get_pooled_data(X_train, X_test, "median")
X_train_max, X_test_max = get_pooled_data(X_train, X_test, "max")
X_train_min, X_test_min = get_pooled_data(X_train, X_test, "min")
X_train_sum, X_test_sum = get_pooled_data(X_train, X_test, "sum")

Train:  (961, 3) (961,) Test:  ((241, 3), (241,))
<class 'pandas.core.frame.DataFrame'>


In [45]:
# Preview the flattened mean-pooled training features, now including the
# encoded speaker column.
X_train_mean.head()

Unnamed: 0,feat_speaker_encode_x,feat_0_x,feat_1_x,feat_2_x,feat_3_x,feat_4_x,feat_5_x,feat_6_x,feat_7_x,feat_8_x,...,feat_2038_y,feat_2039_y,feat_2040_y,feat_2041_y,feat_2042_y,feat_2043_y,feat_2044_y,feat_2045_y,feat_2046_y,feat_2047_y
324,25,0.354183,0.616095,0.510825,0.399152,0.551451,0.490694,0.526886,0.478711,0.445661,...,0.371151,0.478997,0.423825,0.396466,0.521244,0.187658,0.481839,0.318229,0.456867,0.352971
642,1,0.37111,0.563054,0.518036,0.404901,0.551501,0.482981,0.512165,0.482048,0.414397,...,0.424716,0.491309,0.461088,0.411208,0.405419,0.161563,0.461361,0.312351,0.452058,0.347572
799,16,0.340702,0.576864,0.507647,0.373208,0.538533,0.481508,0.478072,0.452048,0.420225,...,0.341406,0.444857,0.411741,0.386679,0.434025,0.19392,0.465375,0.327785,0.425144,0.335552
581,0,0.32531,0.599653,0.455508,0.412584,0.611186,0.49196,0.521322,0.473488,0.44412,...,0.437738,0.424267,0.460791,0.390121,0.468468,0.178169,0.43797,0.345549,0.428563,0.353435
718,2,0.369256,0.51452,0.491212,0.383352,0.567897,0.51993,0.515037,0.489647,0.44512,...,0.412736,0.445004,0.442901,0.358753,0.461017,0.177771,0.452176,0.322367,0.458215,0.31609


In [46]:
# Fit and evaluate one grid-searched SVM per pooling strategy on the
# speaker-dependent features; this overwrites the report/estimator variables
# produced by the speaker-independent run.
print("Processing Mean-pooled data")
mean_report, mean_best_est = svm_classifier(X_train_mean, X_test_mean, Y_train, Y_test)
print("Processing Median-pooled data")
median_report, median_best_est = svm_classifier(X_train_median, X_test_median, Y_train, Y_test)
print("Processing Max-pooled data")
max_report, max_best_est = svm_classifier(X_train_max, X_test_max, Y_train, Y_test)
print("Processing Min-pooled data")
min_report, min_best_est = svm_classifier(X_train_min, X_test_min, Y_train, Y_test)
print("Processing Sum-pooled data")
sum_report, sum_best_est = svm_classifier(X_train_sum, X_test_sum, Y_train, Y_test)

Processing Mean-pooled data
Processing Median-pooled data
Processing Max-pooled data
Processing Min-pooled data
Processing Sum-pooled data


In [47]:
# Print every pooling strategy's test report followed by its selected
# estimator, with a blank-line separator between consecutive sections.
report_sections = [
    ("********************************mean report********************************", mean_report, mean_best_est),
    ("********************************median report********************************", median_report, median_best_est),
    ("********************************max report********************************", max_report, max_best_est),
    ("********************************min report********************************", min_report, min_best_est),
    ("********************************sum report********************************", sum_report, sum_best_est),
]
for section_index, (banner, report_text, best_estimator) in enumerate(report_sections):
    if section_index > 0:
        print("\n")
    print(banner)
    print(report_text)
    print(best_estimator)

********************************mean report********************************
              precision    recall  f1-score   support

         0.0     0.6061    0.6612    0.6324       121
         1.0     0.6239    0.5667    0.5939       120

    accuracy                         0.6141       241
   macro avg     0.6150    0.6139    0.6131       241
weighted avg     0.6149    0.6141    0.6132       241

SVC(C=10, class_weight='balanced', random_state=0)


********************************median report********************************
              precision    recall  f1-score   support

         0.0     0.6048    0.6198    0.6122       121
         1.0     0.6068    0.5917    0.5992       120

    accuracy                         0.6058       241
   macro avg     0.6058    0.6058    0.6057       241
weighted avg     0.6058    0.6058    0.6057       241

SVC(C=10, class_weight='balanced', random_state=0)


********************************max report********************************
           