In [49]:
import pandas as pd
import numpy as np
import random

In [50]:
# Locations of the audio and text feature tables.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine where these files exist.
AUDIO_FEATURE_CSV = "/Users/amily/Desktop/audio_feature.csv"
TEXT_FEATURE_CSV = '/Users/amily/Desktop/text_feature.csv'

# Read each table into its own DataFrame.
audio_feature = pd.read_csv(AUDIO_FEATURE_CSV)
text_feature = pd.read_csv(TEXT_FEATURE_CSV)


In [51]:
import ast
def convert_string_to_list(vector_string):
    """
    Convert a whitespace-separated string of numbers into a list of floats.

    The string may optionally be wrapped in square brackets, and may carry
    leading or trailing whitespace (spaces, tabs, newlines) around them.

    Args:
    - vector_string (str): String representation of a vector, e.g. "[0.1 0.2 0.3]".

    Returns:
    - list: List of floats extracted from the input string (empty if the
      string contains no numbers).

    Raises:
    - ValueError: If any whitespace-separated token is not a valid float.
    """
    # Trim outer whitespace first: str.strip("[]") stops at the first character
    # that is not a bracket, so a stray leading space or trailing newline would
    # otherwise leave a bracket attached to a number and break float().
    inner = vector_string.strip().strip("[]")

    # split() with no argument splits on any run of whitespace, so vectors that
    # wrap across several lines are handled as well.
    return [float(token) for token in inner.split()]
# The CSV stores vectors as strings; turn them back into Python lists.
# 'mfccs' is parsed as a Python literal, while the word2vec columns are
# parsed with convert_string_to_list.
audio_feature['mfccs'] = audio_feature['mfccs'].apply(ast.literal_eval)

for vector_column in ("word2vec_pretrained_with_stop", "word2vec_pretrained_without_stop"):
    text_feature[vector_column] = text_feature[vector_column].apply(convert_string_to_list)


In [52]:
import pandas as pd

def expand_list_column(df, list_column, prefix):
    """
    Spread a list-valued column over one new column per list position.

    The input DataFrame is not modified; a new DataFrame is returned.

    Args:
    - df (pd.DataFrame): Frame holding the list-valued column.
    - list_column (str): Name of the column whose lists are unpacked.
    - prefix (str): New columns are named "<prefix>_<position>".

    Returns:
    - pd.DataFrame: `df` without `list_column`, followed by the unpacked columns.
    """
    # One row per original row, one column per list position; reuse the
    # original index so the join below lines rows up correctly.
    unpacked = pd.DataFrame(df[list_column].tolist(), index=df.index)
    unpacked.columns = [f"{prefix}_{position}" for position in unpacked.columns]

    # Swap the list column for its unpacked counterpart.
    remainder = df.drop(columns=[list_column])
    return remainder.join(unpacked)

# Unpack every list-valued feature column into one scalar column per element.
for source_column, column_prefix in (
    ("word2vec_pretrained_with_stop", "wpwiths"),
    ("word2vec_pretrained_without_stop", "wpwithouts"),
):
    text_feature = expand_list_column(text_feature, list_column=source_column, prefix=column_prefix)

audio_feature = expand_list_column(audio_feature, list_column="mfccs", prefix="mfccs")


In [53]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle as pk

# Columns that must NOT go through PCA (identifiers, labels, raw text, and
# features that are kept as-is). Everything else is treated as PCA input.
columns_to_exclude = ["file_id","duration",'Unnamed: 0', 'sarcasm', 'text', 'label', 'emotion',
       'tokenized_without_stop', 'tokenized_with_stop', 'sentence_split','sarcasm_label', 'emotion_label', 'sentiment_label', 'id', 'spectral_centroid', 'spectral_bandwidth', 'pitch', 'energy',
       'loudness', 'sentence_level_similarity_emotion',
       'sentence_level_similarity_word', 'exclamation']
columns_to_pca_text = [i for i in text_feature.columns if i not in columns_to_exclude]
columns_to_pca_audio = [i for i in audio_feature.columns if i not in columns_to_exclude]

# Fit a PCA on the standardized text columns and persist the fitted model.
scaler = StandardScaler()
text_subset_data = scaler.fit_transform(text_feature[columns_to_pca_text])
text_pca = PCA()
text_result = text_pca.fit_transform(text_subset_data)
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file object.
with open("pca_text.pkl", "wb") as pca_file:
    pk.dump(text_pca, pca_file)

# Same for the audio columns. fit_transform re-fits `scaler` on the audio
# data, so after this cell `scaler` holds the audio statistics only.
# NOTE(review): only the PCA models are pickled, not the fitted scalers.
audio_subset_data = scaler.fit_transform(audio_feature[columns_to_pca_audio])
audio_pca = PCA()
audio_result = audio_pca.fit_transform(audio_subset_data)
with open("pca_audio.pkl", "wb") as pca_file:
    pk.dump(audio_pca, pca_file)



def apply_pca_to_subset(df, subset_columns, n_components, pca_name, prefix="PCA"):
    """
    Replace a subset of columns with their leading principal components.

    The selected columns are standardized with a StandardScaler fitted on `df`
    itself, projected with a previously fitted PCA model loaded from disk, and
    the first `n_components` projected columns are kept.

    Args:
    - df (pd.DataFrame): The input DataFrame (not modified).
    - subset_columns (list): List of column names to apply PCA on.
    - n_components (int): Number of principal components to keep.
    - pca_name (str): Path to a pickle file containing the fitted PCA model.
    - prefix (str): Prefix for the new PCA columns ("<prefix>_1", "<prefix>_2", ...).

    Returns:
    - pd.DataFrame: Copy of `df` with `subset_columns` dropped and the PCA
      component columns appended.

    Raises:
    - ValueError: If `n_components` exceeds the number of components the
      loaded PCA model produces.
    """
    # Step 1: Standardize the selected columns (statistics come from `df`).
    scaler = StandardScaler()
    subset_data = scaler.fit_transform(df[subset_columns])

    # Step 2: Load the fitted PCA model. The context manager closes the file
    # handle deterministically.
    # NOTE: pickle.load can execute arbitrary code; only load files you created.
    with open(pca_name, "rb") as pca_file:
        pca_reload = pk.load(pca_file)

    # Step 3: Project and keep the leading components.
    pca_components = pca_reload.transform(subset_data)
    available_components = pca_components.shape[1]
    if n_components > available_components:
        # Without this check the mismatch only shows up as an opaque
        # shape error when the DataFrame below is built.
        raise ValueError(
            f"n_components={n_components} exceeds the {available_components} "
            f"components available in '{pca_name}'"
        )
    pca_components = pca_components[:, :n_components]

    pca_columns = [f"{prefix}_{i+1}" for i in range(n_components)]
    pca_df = pd.DataFrame(pca_components, columns=pca_columns, index=df.index)

    # Step 4: Swap the original columns for the PCA components.
    df = df.drop(columns=subset_columns)  # Drop the original columns
    df = pd.concat([df, pca_df], axis=1)  # Add PCA components

    return df


# Replace the raw embedding / MFCC columns with their leading principal
# components: 50 for text, 8 for audio.
text_feature = apply_pca_to_subset(
    df=text_feature,
    subset_columns=columns_to_pca_text,
    n_components=50,
    pca_name="pca_text.pkl",
    prefix="TEXT",
)
audio_feature = apply_pca_to_subset(
    df=audio_feature,
    subset_columns=columns_to_pca_audio,
    n_components=8,
    pca_name="pca_audio.pkl",
    prefix="AUDIO",
)

In [65]:
# Join audio and text features on the shared "id" key; only ids present in
# both tables are kept. Columns that exist in both tables get the default
# "_x" (audio) / "_y" (text) suffixes.
merged_df = pd.merge(audio_feature, text_feature, on="id", how="inner")

# Drop the duplicated text-side label columns and the raw / tokenized text.
columns_to_exclude = ['file_id','text_x','sarcasm_y', 'text_y', 'label_y', 'emotion_y','tokenized_without_stop',
       'tokenized_with_stop', 'sentence_split']
merged_remain = [i for i in merged_df.columns if i not in columns_to_exclude]

# Build the cleaned frame in one chained expression. This avoids assigning
# into a column-selected frame (which can raise SettingWithCopyWarning) and
# avoids inplace mutation, so the cell gives the same result when re-run.
merged_df = (
    merged_df[merged_remain]
    .astype({'exclamation': int})
    .rename(columns={'sarcasm_x': 'sarcasm_label', 'emotion_x': 'emotion_label',"label_x": "sentiment_label"})
)

In [69]:
# Ids of the rows belonging to each emotion label.
emotion_column = merged_df['emotion_label']
sarcasm_only_df = merged_df.loc[emotion_column == 'sarcasm', "id"]
anger_only_df = merged_df.loc[emotion_column == 'anger', "id"]
disgust_only_df = merged_df.loc[emotion_column == 'disgust', "id"]
fear_only_df = merged_df.loc[emotion_column == 'fear', "id"]
joy_only_df = merged_df.loc[emotion_column == 'joy', "id"]
sadness_only_df = merged_df.loc[emotion_column == 'sadness', "id"]
surprise_only_df = merged_df.loc[emotion_column == 'surprise', "id"]

# Ids grouped by sentiment: non-sarcastic negatives, and all positives.
is_negative = merged_df['sentiment_label'] == 0
non_sarcasm_negative_only_df = merged_df.loc[(emotion_column != 'sarcasm') & is_negative, "id"]
positive_only_df = merged_df.loc[(merged_df['sentiment_label'] == 1), "id"]


# Sentiment split = every positive row + 70 sampled non-sarcastic negatives
# + 30 sampled sarcastic rows. Sampling is made repeatable via random_state.
random.seed(42)
sarcasm_gerneral_negative = list(sarcasm_only_df.sample(n=30, random_state=42))
rest_negative = list(non_sarcasm_negative_only_df.sample(n=70, random_state=42))
sentiment_index = list(positive_only_df) + rest_negative + sarcasm_gerneral_negative

# Everything not selected for the sentiment split goes to the sarcasm split.
in_sentiment_split = merged_df['id'].isin(sentiment_index)
rest = list(merged_df.loc[~in_sentiment_split, "id"])

sentiment = merged_df[in_sentiment_split]
sarcasm = merged_df[~in_sentiment_split]

In [71]:
# Persist both splits; the DataFrame index is written as the first CSV column.
for split_frame, destination in (
    (sarcasm, "/Users/amily/Desktop/sarcasm_dataset.csv"),
    (sentiment, "/Users/amily/Desktop/sentiment_dataset.csv"),
):
    split_frame.to_csv(destination)

In [121]:
# Label/id columns plus the features kept as-is for each modality.
audio_features = ["sarcasm_label","emotion_label", "sentiment_label","id","spectral_centroid","spectral_bandwidth","pitch","energy","loudness"]
text_features = ["sarcasm_label","emotion_label","sentiment_label","id","sentence_level_similarity_emotion","sentence_level_similarity_word","exclamation"]

#text feature filter: text feature columns + every column whose name contains "text"
text = text_features + [col for col in sarcasm.columns if ('text' in col.lower())]
#audio feature filter: audio feature columns + every column whose name contains "audio"
# (fix: this previously started from `text_features`, so the audio datasets
# carried the text features and `audio_features` was never used)
audio = audio_features + [col for col in sarcasm.columns if ('audio' in col.lower())]

# Modality-specific views of both splits.
sentiment_text = sentiment[text]
sentiment_audio = sentiment[audio]
sarcasm_text = sarcasm[text]
sarcasm_audio = sarcasm[audio]

sentiment_text.to_csv("/Users/amily/Desktop/text_sentiment_dataset.csv")
sentiment_audio.to_csv("/Users/amily/Desktop/audio_sentiment_dataset.csv")
sarcasm_text.to_csv("/Users/amily/Desktop/text_sarcasm_dataset.csv")
sarcasm_audio.to_csv("/Users/amily/Desktop/audio_sarcasm_dataset.csv")


In [122]:
# Display the text view of the sarcasm split for a visual check.
sarcasm_text

Unnamed: 0,sarcasm_label,emotion_label,sentiment_label,id,sentence_level_similarity_emotion,sentence_level_similarity_word,exclamation,TEXT_1,TEXT_2,TEXT_3,...,TEXT_41,TEXT_42,TEXT_43,TEXT_44,TEXT_45,TEXT_46,TEXT_47,TEXT_48,TEXT_49,TEXT_50
0,0,anger,0,1,0.000000,0.000000,0,4.560276,9.361766,-1.944473,...,-3.528326,-5.195582,-2.071413,2.389241,0.679799,1.464318,2.688781,1.950308,-1.281537,-0.505422
1,0,anger,0,2,0.000000,0.000000,0,-1.882026,-10.278302,1.993432,...,4.879186,1.806766,-7.082715,-2.375127,-2.932272,1.903256,7.856745,4.622939,1.243021,2.524237
2,0,anger,0,3,0.439865,0.913640,0,-3.933006,-1.102636,-2.704832,...,8.493502,3.512994,2.752871,-1.689377,-3.118805,0.926467,-0.505093,0.921205,-2.015440,0.257554
3,0,anger,0,4,0.009295,0.920026,1,-0.994384,1.081702,2.105956,...,0.143942,-1.922510,1.761534,-0.180522,-0.999845,-4.713001,0.863674,-0.488971,-1.116376,-2.301616
4,0,anger,0,5,0.000000,0.000000,1,-3.942157,-0.354175,-1.530681,...,-1.793798,-1.769770,0.242620,0.510232,-1.445818,-2.013260,0.900879,1.465607,0.475343,-2.228691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,0,surprise,0,420,0.000000,0.000000,0,-6.692437,-2.622186,-2.577338,...,-1.737109,-4.056155,0.947543,1.190202,-1.446018,-2.297449,-1.035243,-0.385022,2.778561,1.791775
421,0,surprise,0,422,-0.688249,0.938817,0,-4.782721,-6.644730,2.585404,...,-0.281552,0.693386,-0.137404,-0.627561,-0.753154,1.652641,0.852713,3.201987,1.966591,3.279186
422,0,surprise,0,423,0.000000,0.000000,1,-1.147711,12.323190,9.173530,...,0.133216,0.857063,-0.324311,1.159971,1.459785,-0.298021,0.313660,4.078432,1.701545,-2.340982
425,0,surprise,0,426,0.779693,0.706350,1,-1.075184,-1.541483,1.071947,...,0.924546,-1.730823,0.306840,1.716419,-0.743047,2.220859,-2.712793,-0.492945,2.053580,-0.552733
