# Load Libraries and Dataframes

In [1]:
import pandas as pd

# Label/metadata table. Only rows that carry a Sarcasm label are kept; the
# cleaning cells below use its SCENE column as the set of valid scene IDs.
dataframe = pd.read_csv("mustard++/dataframe.csv").dropna(subset=['Sarcasm'])

# Feature tables, one per modality (each is cleaned in its own section below).
text_features = pd.read_csv('text_features_BERT.csv')
audio_features = pd.read_csv('audio_features.csv')
visual_features = pd.read_csv('visual_embedding.csv')

# Text cleaning

In [68]:
# Attach each scene's speaker to its text-embedding row. This is a left join,
# so scenes without a labelled row in `dataframe` are kept with a missing SPEAKER.
# Any SPEAKER column left over from a previous run is dropped first so the cell
# can be re-run safely: a second plain merge would otherwise produce
# SPEAKER_x / SPEAKER_y and the encoding cell below would not find 'SPEAKER'.
speaker_lookup = dataframe[['SCENE', 'SPEAKER']]
text_features = (
    text_features
    .drop(columns='SPEAKER', errors='ignore')
    .merge(speaker_lookup, on='SCENE', how='left')
)
text_features.head()

Unnamed: 0,SCENE,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,Embedding_7,Embedding_8,...,Embedding_759,Embedding_760,Embedding_761,Embedding_762,Embedding_763,Embedding_764,Embedding_765,Embedding_766,Embedding_767,SPEAKER
0,1_10004,0.400577,-0.735008,-0.09615,-0.285756,-0.357918,-0.051856,1.045135,-0.481453,-0.091696,...,0.676469,0.125527,-0.045216,-0.050516,-0.035607,-0.009513,-0.084522,0.603833,0.591911,SHELDON
1,1_10009,0.387123,-0.837304,-0.0779,-0.247002,-0.358525,-0.08953,1.096665,-0.495899,-0.07485,...,0.699883,0.094027,-0.067608,-0.036613,-0.093318,0.002613,-0.112674,0.624801,0.613933,PENNY
2,1_1001,0.376343,-0.645923,-0.120487,-0.267022,-0.319736,-0.05079,0.99666,-0.409754,-0.078868,...,0.632195,0.140556,-0.138602,-0.057374,-0.059396,-0.009998,-0.095549,0.570483,0.570835,RAJ
3,1_1003,0.366896,-0.670176,-0.104165,-0.3006,-0.300686,-0.042529,0.994088,-0.427367,-0.069317,...,0.61026,0.107198,-0.092323,-0.042515,-0.088271,-0.008831,-0.11185,0.573721,0.568279,HOWARD
4,1_10190,0.40507,-0.688916,-0.153265,-0.233106,-0.311845,-0.056022,0.998371,-0.423859,-0.054147,...,0.631909,0.132351,-0.105029,-0.070631,-0.131713,-0.016279,-0.082275,0.584081,0.618788,SHELDON


In [69]:
# One-hot encode the speaker as integer 0/1 indicator columns, one per speaker name.
# Requesting ints from get_dummies directly avoids a second pass with
# replace({True: 1, False: 0}), which depends on replace() silently downcasting
# its result (deprecated behaviour in recent pandas).
# Rows whose SPEAKER is missing get 0 in every indicator column.
dummy_encoded = pd.get_dummies(text_features['SPEAKER'], dtype=int)

# Swap the original 'SPEAKER' column for its indicator columns, appended on the right.
text_features = pd.concat(
    [text_features.drop(columns='SPEAKER'), dummy_encoded],
    axis=1,
)

In [70]:
# Preview the first rows of the text feature table.
text_features.head(n=5)

Unnamed: 0,SCENE,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,Embedding_7,Embedding_8,...,PERSON1,PERSON3,PHOEBE,RACHEL,RAJ,RICHARD,ROSE,ROSS,SHELDON,STUART
0,1_10004,0.400577,-0.735008,-0.09615,-0.285756,-0.357918,-0.051856,1.045135,-0.481453,-0.091696,...,0,0,0,0,0,0,0,0,1,0
1,1_10009,0.387123,-0.837304,-0.0779,-0.247002,-0.358525,-0.08953,1.096665,-0.495899,-0.07485,...,0,0,0,0,0,0,0,0,0,0
2,1_1001,0.376343,-0.645923,-0.120487,-0.267022,-0.319736,-0.05079,0.99666,-0.409754,-0.078868,...,0,0,0,0,1,0,0,0,0,0
3,1_1003,0.366896,-0.670176,-0.104165,-0.3006,-0.300686,-0.042529,0.994088,-0.427367,-0.069317,...,0,0,0,0,0,0,0,0,0,0
4,1_10190,0.40507,-0.688916,-0.153265,-0.233106,-0.311845,-0.056022,0.998371,-0.423859,-0.054147,...,0,0,0,0,0,0,0,0,1,0


### Save cleaned text dataframe

In [72]:
# Index by scene ID so SCENE becomes the leading column of the exported file.
text_features = text_features.set_index(keys='SCENE')
text_features.to_csv(path_or_buf='text_final.csv', index=True)

# Audio cleaning

In [2]:
# Reduce each file name to its scene ID by cutting the trailing 6 characters
# (presumably a suffix plus extension -- TODO confirm against the raw CSV).
# Values that already are scene IDs are left untouched, so re-running this cell
# does not truncate the IDs a second time and silently empty the table.
scene_ids = dataframe['SCENE']
audio_names = audio_features['audio_file']
audio_features = audio_features.assign(
    audio_file=audio_names.where(audio_names.isin(scene_ids), audio_names.str[:-6])
)

# Keep only the rows whose scene ID appears in the labelled metadata.
audio_features = audio_features[audio_features['audio_file'].isin(scene_ids)]
audio_features.head()

Unnamed: 0,audio_file,intensity,intensity_median,intensity_std,words_per_minute,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,...,deltamelspectrogram_std_16,deltamelspectrogram_std_17,deltamelspectrogram_std_18,deltamelspectrogram_std_19,deltaspectral_centroids_0,deltaspectral_centroids_median_0,deltaspectral_centroids_std_0,zerocrossingrate_0,zerocrossingrate_median_0,zerocrossingrate_std_0
0,1_10004,0.059431,0.052855,0.04393,120.0,-243.93507,70.85715,-37.16033,19.805502,-22.752916,...,0.019585,0.003203,0.000882,0.000643,-13.020707,-0.776635,197.258437,0.18516,0.134766,0.13143
1,1_10009,0.048186,0.04419,0.024762,132.0,-199.60484,106.24419,-67.8631,5.480549,-34.111588,...,0.003284,0.000476,0.000306,8.8e-05,-4.160503,-3.967439,91.745498,0.127666,0.120117,0.035872
2,1_1001,0.288145,0.320339,0.182242,135.0,-77.95845,85.00533,-20.893145,31.380796,-13.386493,...,0.196793,0.072305,0.033911,0.014702,-11.572872,-6.619797,234.766572,0.150029,0.114258,0.121724
3,1_1003,0.256163,0.257658,0.158541,165.0,-98.87288,110.46051,-17.9312,35.882313,-12.107592,...,0.166771,0.023415,0.008049,0.004693,1.352301,14.914905,174.175723,0.103435,0.074707,0.080655
4,1_10190,0.039448,0.037176,0.029028,190.909091,-298.76166,64.262024,-43.251045,22.255568,-17.129074,...,0.00429,0.001511,0.000361,0.000111,-4.523149,5.029679,233.931902,0.168757,0.134766,0.10812


### Save audio dataframe

In [4]:
# Index by the scene ID held in 'audio_file' so it is written as the first CSV column.
audio_features = audio_features.set_index(keys='audio_file')
audio_features.to_csv(path_or_buf='audio_final.csv', index=True)

  values = values.astype(str)


# Visual (video/image) cleaning

In [78]:
# Reduce each name to its scene ID by cutting the trailing 6 characters
# (presumably a suffix plus extension -- TODO confirm against the raw CSV).
# Values that already are scene IDs are left untouched, so re-running this cell
# does not truncate the IDs a second time and silently empty the table.
scene_ids = dataframe['SCENE']
visual_names = visual_features['name']
visual_features = visual_features.assign(
    name=visual_names.where(visual_names.isin(scene_ids), visual_names.str[:-6])
)

# Keep only the rows whose scene ID appears in the labelled metadata.
visual_features = visual_features[visual_features['name'].isin(scene_ids)]
visual_features.head()

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_2039,Feature_2040,Feature_2041,Feature_2042,Feature_2043,Feature_2044,Feature_2045,Feature_2046,Feature_2047,name
0,0.0,0.06134,0.249853,0.238058,0.008318,0.027183,0.287098,0.0,0.00116,0.009636,...,0.0,0.0,0.0,0.0,1.031434,0.0,0.0,0.0,0.0,1_6211
1,0.621487,1.806693,0.084369,0.0,0.034292,0.298424,0.0,0.009167,0.412455,0.0,...,0.244329,0.150186,5.013727,0.0,3.366949,1.250887,0.0,0.698703,0.018541,1_6221
2,0.0,0.741148,4.661753,0.003851,0.261149,6.765937,0.0,0.0,0.319741,5.591594,...,0.144673,0.004259,0.0,0.0,0.0,2.623596,0.0,0.0,0.559128,1_6355
3,0.899459,0.266036,0.041808,0.0,0.0,0.370648,1.612567,0.0,1.001736,0.0,...,0.0,0.0,0.683705,1.663404,0.006529,0.0,0.0,3.126461,1.439765,1_6370
4,0.169336,0.142878,0.0,0.084481,0.0,2.796083,0.016931,1.614749,5.185996,0.0,...,0.164436,0.0,1.688359,0.0,0.643267,0.0,0.0,0.507182,0.482349,1_6426


### Save visual dataframe

In [80]:
# Index by the scene ID held in 'name' so it is written as the first CSV column.
visual_features = visual_features.set_index(keys='name')
visual_features.to_csv(path_or_buf='visual_final.csv', index=True)

# Labels

In [85]:
import pandas as pd

# Rebuild the label table from the raw metadata: labelled rows only,
# reduced to the Sarcasm column and keyed by scene ID.
# NOTE: from here on SCENE is the index of `dataframe`, not a column, so the
# cleaning cells above (which read dataframe['SCENE']) need the first cell
# re-run before they can be executed again.
dataframe = (
    pd.read_csv("mustard++/dataframe.csv")
    .dropna(subset=['Sarcasm'])
    .loc[:, ['Sarcasm', 'SCENE']]
    .set_index('SCENE')
)
dataframe.head()

Unnamed: 0_level_0,Sarcasm
SCENE,Unnamed: 1_level_1
1_10004,0.0
1_10009,0.0
1_1001,0.0
1_1003,1.0
1_10190,0.0


### Save labels dataframe

In [86]:
# Export the label table, writing its index as the first CSV column.
dataframe.to_csv(path_or_buf='labels_final.csv', index=True)