In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import regex as re

In [19]:
# Step 1: Read the data from 'train_emoticon.csv'
# Assuming the CSV file has no header and columns are 'emoji_sequence' and 'label'
df = pd.read_csv('/home/belief/Desktop/MLProj1/mini-project-1/datasets/train/train_emoticon.csv')
# Step 2: Split each emoji sequence into individual emojis
# Use regex to handle emojis correctly
def split_emojis(emoji_sequence):
    # Use the regex pattern \X to match grapheme clusters (i.e., emojis)
    return re.findall(r'\X', emoji_sequence)
# Apply the function to create a new column with the list of emojis
df['emoji_list'] = df['input_emoticon'].apply(split_emojis)
# Verify that all sequences have 13 emojis
sequence_lengths = df['emoji_list'].apply(len)
if not all(sequence_lengths == 13):
    # Find rows with incorrect sequence lengths
    incorrect_lengths = df[sequence_lengths != 13]
    print("Warning: The following rows do not have 13 emojis:")
    print(incorrect_lengths)
    # Optionally, handle these rows (e.g., drop them or pad/truncate the sequences)
    # For now, we'll proceed but you may need to address this
# Step 3: Collect all unique emojis and assign a unique numerical ID to each
# Flatten the list of emoji lists to get all emojis
all_emojis = [emoji for emoji_list in df['emoji_list'] for emoji in emoji_list]
unique_emojis = sorted(set(all_emojis))
emoji_to_id = {emoji: idx for idx, emoji in enumerate(unique_emojis)}
# Save the mapping to a file (optional)
with open('emoji_mapping.txt', 'w', encoding='utf-8') as f:
    for emoji, idx in emoji_to_id.items():
        f.write(f'{emoji}: {idx}\n')
# Step 4: Replace the emojis in the data with the assigned numbers
# Convert each emoji in the list to its corresponding ID
def emojis_to_ids(emoji_list):
    return [emoji_to_id[emoji] for emoji in emoji_list]

df['emoji_ids'] = df['emoji_list'].apply(emojis_to_ids)
# Expand the emoji IDs into separate columns
emoji_columns = [f'emoji_{i+1}' for i in range(13)]
emoji_ids_df = pd.DataFrame(df['emoji_ids'].tolist(), columns=emoji_columns)
# Combine the emoji ID columns with the label
df = pd.concat([emoji_ids_df, df['label']], axis=1)

In [20]:
print(df)

      emoji_1  emoji_2  emoji_3  emoji_4  emoji_5  emoji_6  emoji_7  emoji_8  \
0          26      198       58       16       34       93      106      179   
1         198       16       41       26      179      106       34      126   
2          26      106       16      154       34      179      198      109   
3          26      179      198       78       34      106       16      102   
4         198      152      179       26       73       16       34      106   
...       ...      ...      ...      ...      ...      ...      ...      ...   
7075      106       57       58       16       34      198       26      179   
7076      198      179        5      106       34      115       26       16   
7077      209       16       80       26       34      106      198      179   
7078       34       16       26      193      198      106      179      163   
7079       26       34      179       16      198       56      106      165   

      emoji_9  emoji_10  emoji_11  emoj

In [21]:
# Separate features and labels
X = df.drop('label', axis=1)  # Features: all columns except 'label'
y = df['label']               # Target variable: 'label' column

In [22]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# # Feature scaling
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

X_train_scaled = X_train
X_test_scaled = X_test

In [24]:
# Initial SVM training
clf = svm.SVC(kernel='rbf')  # You can experiment with 'rbf', 'poly', etc.
clf.fit(X_train_scaled, y_train)

In [25]:
# Prediction
y_pred = clf.predict(X_test_scaled)

In [26]:
# Initial evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.5374293785310734

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.51      0.53       719
           1       0.53      0.57      0.55       697

    accuracy                           0.54      1416
   macro avg       0.54      0.54      0.54      1416
weighted avg       0.54      0.54      0.54      1416


Confusion Matrix:
 [[365 354]
 [301 396]]


In [None]:
# Hyperparameter tuning with Grid Search
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train_scaled, y_train)

print("Best Parameters:", grid.best_params_)

In [None]:
# Evaluation with the best estimator
best_clf = grid.best_estimator_
y_pred = best_clf.predict(X_test_scaled)

print("Accuracy after tuning:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))