In [176]:
import pandas as pd
import numpy as np
import random

In [177]:
# Load the audio-feature and text-feature tables from their CSV files.
audio_feature, text_feature = map(
    pd.read_csv,
    ("/content/audio_feature.csv", "/content/text_feature.csv"),
)


In [178]:
# Inner-join the two feature tables on "id"; ids present in only one table are dropped.
merged_df = pd.merge(audio_feature, text_feature, on="id", how="inner")

# Keep the label columns, the key and the features used below. The "_x" suffix is
# what pandas' default merge suffixes give to the left (audio_feature) copy of a
# column name that exists in both tables.
# .copy() turns the subset into an independent frame, so the in-place edits in the
# following cells (rename, column assignment) do not operate on a flagged slice of
# the merge result and cannot raise SettingWithCopyWarning.
merged_df = merged_df[
    [
        "sarcasm_x", "emotion_x", "label_x", 'id',
        "mfccs", "spectral_centroid", "spectral_bandwidth", "pitch", "energy", "loudness",
        'sentence_level_similarity_emotion', "sentence_level_similarity_word", "exclamation",
        'word2vec_pretrained_with_stop', 'word2vec_pretrained_without_stop',
    ]
].copy()

In [152]:
# Give the three label columns descriptive names. rename() returns a new frame that
# is re-bound to merged_df; this replaces the inplace=True mutation, which offers no
# benefit and mutates whatever object merged_df happened to reference.
merged_df = merged_df.rename(
    columns={
        'sarcasm_x': 'sarcasm_label',
        'emotion_x': 'emotion_label',
        "label_x": "sentiment_label",
    }
)

In [153]:
# Cast the "exclamation" column to integers; every other column keeps its dtype.
merged_df = merged_df.astype({"exclamation": int})

In [154]:
import ast
def convert_string_to_list(vector_string):
    """
    Convert the string form of a numeric vector into a list of floats.

    Values may be separated by any whitespace (spaces, tabs, newlines) and/or
    by commas, so both "[0.1 0.2 0.3]" and "[0.1, 0.2, 0.3]" are accepted.
    Square brackets at either end, and any whitespace outside them, are ignored.

    Args:
    - vector_string (str): String representation of a vector.

    Returns:
    - list: List of floats extracted from the input string
      (an empty list when the string contains no values, e.g. "[]").

    Raises:
    - ValueError: If a token cannot be parsed as a float.
    """
    # Trim outer whitespace first: str.strip("[]") only removes brackets that sit at
    # the very ends, so " [1 2]\n" would otherwise keep its brackets and fail in float().
    inner = vector_string.strip().strip("[]")

    # Treat commas as separators too, then split on any run of whitespace.
    tokens = inner.replace(",", " ").split()

    return [float(token) for token in tokens]
# Turn the serialized vector columns back into Python lists.
# The two word2vec columns go through convert_string_to_list ...
for w2v_column in ("word2vec_pretrained_with_stop", "word2vec_pretrained_without_stop"):
    merged_df[w2v_column] = merged_df[w2v_column].apply(convert_string_to_list)
# ... while "mfccs" is evaluated as a Python literal.
merged_df['mfccs'] = merged_df['mfccs'].apply(ast.literal_eval)

In [155]:
import pandas as pd

def expand_list_column(df, list_column, prefix):
    """
    Expands a column of lists into multiple columns.

    Element ``i`` of each row's list is placed in a new column named
    ``f"{prefix}_{i}"``. Rows are paired by position, so the result always has
    exactly as many rows as ``df``, even when the index of ``df`` contains
    repeated labels.

    Args:
    - df (pd.DataFrame): The DataFrame containing the list column.
    - list_column (str): The name of the column with lists to expand.
    - prefix (str): Prefix for the new columns.

    Returns:
    - pd.DataFrame: A new DataFrame with ``list_column`` removed and the
      expanded columns appended on the right. ``df`` itself is not modified.

    Raises:
    - ValueError: If a generated column name already exists in ``df``.
    """
    # Everything except the list column stays as it is.
    remaining = df.drop(columns=[list_column])

    # One row per list, one column per list position, sharing the index of `remaining`.
    expanded_cols = pd.DataFrame(df[list_column].tolist(), index=remaining.index)
    expanded_cols = expanded_cols.add_prefix(f"{prefix}_")

    # Refuse to create duplicate column names (DataFrame.join raised ValueError here too).
    clashes = [name for name in expanded_cols.columns if name in remaining.columns]
    if clashes:
        raise ValueError(f"columns overlap but no suffix specified: {clashes}")

    # Both frames carry the same index, so concat glues them side by side row for row.
    # DataFrame.join would instead match on index labels and multiply rows whenever
    # a label occurs more than once.
    return pd.concat([remaining, expanded_cols], axis=1)

merged_df = expand_list_column(merged_df, list_column="word2vec_pretrained_with_stop", prefix="wpwiths")
merged_df = expand_list_column(merged_df, list_column="word2vec_pretrained_without_stop", prefix="wpwithouts")
merged_df = expand_list_column(merged_df, list_column="mfccs", prefix="mfccs")


In [156]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def apply_pca_to_subset(df, subset_columns, n_components, prefix="PCA", random_state=42):
    """
    Apply PCA to a subset of columns in a DataFrame.

    The selected columns are standardized with ``StandardScaler``, projected
    with ``PCA`` and replaced by the resulting component columns, named
    ``f"{prefix}_1"``, ``f"{prefix}_2"``, ... The explained-variance ratios are
    printed as a side effect.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - subset_columns (list): List of column names to apply PCA on.
    - n_components: Number of principal components to keep. Anything accepted
      by ``sklearn.decomposition.PCA`` works (an int, a float variance
      fraction, ``None`` or ``'mle'``); the output columns are named after the
      number of components PCA actually produced.
    - prefix (str): Prefix for the new PCA columns.
    - random_state: Seed forwarded to ``PCA``. It only matters when PCA picks a
      randomized/ARPACK solver; a fixed default keeps the components identical
      from run to run. Pass ``None`` to leave PCA unseeded.

    Returns:
    - pd.DataFrame: Updated DataFrame with PCA-transformed columns. The input
      frame is not modified.
    """
    # NOTE(review): scaler and PCA are fitted on every row of `df`. If a train/test
    # split is made downstream, fit them on the training rows only - confirm.

    # Step 1: Standardize the selected columns
    scaler = StandardScaler()
    subset_data = scaler.fit_transform(df[subset_columns])

    # Step 2: Apply PCA (seeded, so a randomized solver yields reproducible components)
    pca = PCA(n_components=n_components, random_state=random_state)
    pca_components = pca.fit_transform(subset_data)

    # Step 3: Create a DataFrame for PCA components.
    # Name the columns from the width of the transformed data rather than from
    # `n_components`, which need not be an int.
    n_kept = pca_components.shape[1]
    pca_columns = [f"{prefix}_{i+1}" for i in range(n_kept)]
    pca_df = pd.DataFrame(pca_components, columns=pca_columns, index=df.index)

    # Step 4: Integrate PCA results back into the original DataFrame
    df = df.drop(columns=subset_columns)  # Drop the original columns
    df = pd.concat([df, pca_df], axis=1)  # Add PCA components

    # Report how much variance the kept components explain
    explained_variance = pca.explained_variance_ratio_
    print("Explained variance ratio of each component:", explained_variance)
    print("Total explained variance:", explained_variance.sum())

    return df

# Columns left untouched by PCA: the labels, the row key and the scalar features.
columns_to_exclude = [
    'sarcasm_label', 'emotion_label', 'sentiment_label', 'id',
    'spectral_centroid', 'spectral_bandwidth', 'pitch', 'energy', 'loudness',
    'sentence_level_similarity_emotion', 'sentence_level_similarity_word',
    'exclamation',
]

# Every remaining column (the expanded word2vec / MFCC columns) is reduced to
# 50 principal components.
columns_to_pca = merged_df.columns[~merged_df.columns.isin(columns_to_exclude)].tolist()
merged_df = apply_pca_to_subset(merged_df, columns_to_pca, 50, prefix="PCA")

Explained variance ratio of each component: [0.08396164 0.07266012 0.04122102 0.03649229 0.03062402 0.02862101
 0.02601573 0.02372405 0.02173488 0.02049162 0.01924283 0.01896717
 0.01804777 0.0153255  0.01446341 0.01415297 0.01325934 0.01263183
 0.0120798  0.01159052 0.01124044 0.01084427 0.01011255 0.00941507
 0.00932782 0.00906426 0.00893638 0.00851068 0.00824834 0.0079896
 0.00792479 0.00754995 0.00736057 0.0071931  0.00700132 0.00673574
 0.00641341 0.00628771 0.00595    0.005773   0.00562909 0.00557653
 0.00542661 0.00535362 0.00519214 0.005002   0.00474166 0.00466856
 0.00457231 0.00447798]
Total explained variance: 0.7478270227362694


In [157]:
# Display the column labels of merged_df as they stand after the PCA step.
merged_df.columns

Index(['sarcasm_label', 'emotion_label', 'sentiment_label', 'id',
       'spectral_centroid', 'spectral_bandwidth', 'pitch', 'energy',
       'loudness', 'sentence_level_similarity_emotion',
       'sentence_level_similarity_word', 'exclamation', 'PCA_1', 'PCA_2',
       'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8', 'PCA_9', 'PCA_10',
       'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15', 'PCA_16', 'PCA_17',
       'PCA_18', 'PCA_19', 'PCA_20', 'PCA_21', 'PCA_22', 'PCA_23', 'PCA_24',
       'PCA_25', 'PCA_26', 'PCA_27', 'PCA_28', 'PCA_29', 'PCA_30', 'PCA_31',
       'PCA_32', 'PCA_33', 'PCA_34', 'PCA_35', 'PCA_36', 'PCA_37', 'PCA_38',
       'PCA_39', 'PCA_40', 'PCA_41', 'PCA_42', 'PCA_43', 'PCA_44', 'PCA_45',
       'PCA_46', 'PCA_47', 'PCA_48', 'PCA_49', 'PCA_50'],
      dtype='object')

In [166]:
def _ids_where_emotion(emotion):
    """Return the ``id`` Series of the merged_df rows whose emotion_label equals ``emotion``."""
    return merged_df.loc[merged_df['emotion_label'] == emotion, "id"]


# One Series of ids per emotion label (despite the *_df names these are Series).
sarcasm_only_df = _ids_where_emotion('sarcasm')
anger_only_df = _ids_where_emotion('anger')
disgust_only_df = _ids_where_emotion('disgust')
fear_only_df = _ids_where_emotion('fear')
joy_only_df = _ids_where_emotion('joy')
sadness_only_df = _ids_where_emotion('sadness')
surprise_only_df = _ids_where_emotion('surprise')

# Ids grouped by sentiment_label (presumably 0 = negative and 1 = positive, going by
# the variable names - confirm against the labelling scheme).
is_negative = merged_df['sentiment_label'] == 0
not_sarcasm = merged_df['emotion_label'] != 'sarcasm'
non_sarcasm_negative_only_df = merged_df.loc[not_sarcasm & is_negative, "id"]
positive_only_df = merged_df.loc[merged_df['sentiment_label'] == 1, "id"]

# Seeds Python's `random` module. The draws below are pinned by their own
# random_state argument.
random.seed(42)

# Sentiment subset = every positive id + 70 sampled non-sarcasm negative ids
# + 30 sampled sarcasm ids.
sarcasm_gerneral_negative = list(sarcasm_only_df.sample(n=30, random_state=42))
rest_negative = list(non_sarcasm_negative_only_df.sample(n=70, random_state=42))
sentiment_index = list(positive_only_df) + rest_negative + sarcasm_gerneral_negative

# Membership mask, computed once and reused for the id list and both frames.
in_sentiment = merged_df['id'].isin(sentiment_index)

# Ids that did not make it into the sentiment subset.
rest = list(merged_df.loc[~in_sentiment, "id"])

sentiment = merged_df[in_sentiment]
sarcasm = merged_df[~in_sentiment]

In [172]:
# Write both subsets to CSV with the default to_csv settings.
for csv_name, subset_frame in (
    ("sarcasm_dataset.csv", sarcasm),
    ("sentiment_dataset.csv", sentiment),
):
    subset_frame.to_csv(csv_name)