In [1]:
import pandas as pd
import regex as re

In [2]:
# Step 1: Load the training data from 'train_emoticon.csv'.
# read_csv is called with its default header handling, so the first row of the
# file supplies the column names; later cells read the 'input_emoticon' and
# 'label' columns from this frame.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the notebook elsewhere.
train_csv_path = '/home/belief/Desktop/MLProj1/mini-project-1/datasets/train/train_emoticon.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Step 2: Break each emoji sequence into its individual emojis.
# Matching is done per grapheme cluster rather than per code point.
def split_emojis(emoji_sequence):
    """Return the grapheme clusters found in ``emoji_sequence`` as a list of strings.

    The grapheme-cluster pattern used here is a feature of the third-party
    ``regex`` package, which this notebook imports under the name ``re``.
    """
    clusters = re.finditer(r'\X', emoji_sequence)
    return [cluster.group() for cluster in clusters]

In [4]:
# Add an 'emoji_list' column holding the split form of every input sequence.
df['emoji_list'] = df['input_emoticon'].map(split_emojis)

In [5]:
# Sanity check: every sequence is expected to split into exactly 13 emojis.
sequence_lengths = df['emoji_list'].map(len)
if (sequence_lengths != 13).any():
    # Gather the offending rows and report them.
    incorrect_lengths = df.loc[sequence_lengths != 13]
    print("Warning: The following rows do not have 13 emojis:")
    print(incorrect_lengths)
    # Nothing is dropped, padded or truncated here: processing continues as-is,
    # so such rows may need to be handled before the later steps.

In [6]:
# Step 3: Build the emoji vocabulary and give every emoji a numeric ID.
# First flatten the per-row lists into one long list of emojis.
all_emojis = [symbol for row_emojis in df['emoji_list'] for symbol in row_emojis]
# De-duplicate, then sort so that ID assignment is deterministic.
unique_emojis = list(set(all_emojis))
unique_emojis.sort()
# IDs are the positions in the sorted vocabulary, starting at 0.
emoji_to_id = dict(zip(unique_emojis, range(len(unique_emojis))))

In [7]:
# Optionally persist the emoji -> ID mapping, one "emoji: id" entry per line.
with open('emoji_mapping.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{emoji}: {idx}\n' for emoji, idx in emoji_to_id.items())

In [8]:
# Step 4: Swap each emoji in the data for its assigned number.
def emojis_to_ids(emoji_list):
    """Translate a list of emojis into the list of their numeric IDs.

    Looks every emoji up in the module-level ``emoji_to_id`` mapping; an emoji
    missing from that mapping raises ``KeyError``.
    """
    ids = []
    for emoji in emoji_list:
        ids.append(emoji_to_id[emoji])
    return ids

# Store the ID-encoded form of every row's emoji list in a new column.
df['emoji_ids'] = df['emoji_list'].map(emojis_to_ids)

In [9]:
# Expand the per-row lists of emoji IDs into 13 separate columns,
# named emoji_1 ... emoji_13.
emoji_columns = [f'emoji_{i+1}' for i in range(13)]
# Reuse df's index so the new frame stays row-aligned with df. Without it the
# frame would get a fresh 0..n-1 index, and any index-based combination with
# columns of df (e.g. the label) would misalign as soon as rows have been
# dropped or filtered from df.
emoji_ids_df = pd.DataFrame(
    df['emoji_ids'].tolist(),
    columns=emoji_columns,
    index=df.index,
)

In [10]:
# Put the label column alongside the emoji ID columns (rows are matched by index).
processed_df = pd.concat(
    [emoji_ids_df, df['label']],
    axis='columns',
)

In [11]:
# Step 5: Write the processed table to a new CSV file, leaving out the index column.
processed_df.to_csv(
    path_or_buf='processed_train_emoticon.csv',
    index=False,
)

# Confirm completion and tell the reader where the output was written.
print("Processing complete. The processed data has been saved to 'processed_train_emoticon.csv'.")

Processing complete. The processed data has been saved to 'processed_train_emoticon.csv'.
