In [1]:
import os

import numpy as nm
import pandas as pd

# For Data Encoding
from sklearn.preprocessing import MultiLabelBinarizer

# For data splitting and model evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import the SVM classifier
from sklearn.svm import SVC

# For scaling (optional)
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training set. The CSV location can be overridden through the
# EMOTICON_TRAIN_CSV environment variable so the notebook is not tied to a
# single machine; the original absolute path is kept as the fallback.
train_csv_path = os.environ.get(
    "EMOTICON_TRAIN_CSV",
    "/home/belief/Desktop/MLAss1/mini-project-1/datasets/train/train_emoticon.csv",
)
emoticon_data = pd.read_csv(train_csv_path)

# Split each sequence into a list of individual characters.
# NOTE(review): list(str) splits on Unicode code points, so an emoticon made of
# several code points (ZWJ sequences, variation selectors) would be broken into
# pieces -- confirm the dataset only contains single-code-point emoticons.
emoticon_data['split_emoticons'] = emoticon_data['input_emoticon'].apply(list)

# Remove the 'input_emoticon' column
emoticon_data = emoticon_data.drop('input_emoticon', axis=1)

# Show a preview and the dimensions rather than dumping the whole frame
print(emoticon_data.head())
print(emoticon_data.shape)

# Later cells refer to the loaded frame as `df`
df = emoticon_data

In [None]:
# Multi-hot encode each row: one 0/1 column per distinct emoticon, set to 1
# when that emoticon occurs anywhere in the row's sequence (position and
# repetition count are not represented by this encoding).
mlb = MultiLabelBinarizer()

# Fit the binarizer and transform the split_emoticons column
emoji_encoded = mlb.fit_transform(df['split_emoticons'])

# Create a DataFrame with the encoded emojis. Reusing df's index guarantees the
# column-wise concat below pairs every encoded row with its own label even if
# df does not carry a default 0..n-1 index.
emoji_df = pd.DataFrame(emoji_encoded, columns=mlb.classes_, index=df.index)

# Concatenate the label column with the encoded emojis
final_df = pd.concat([emoji_df, df['label']], axis=1)

# head must be *called*; printing the bare attribute shows the bound method
print(final_df.head())
print(final_df.shape)

In [4]:
# Target is the 'label' column; every remaining column is a feature
y = final_df['label']
X = final_df.drop(columns=['label'])

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline model: support vector classifier with an RBF kernel
svm_classifier = SVC(random_state=42, kernel='rbf')

# Fit the baseline on the scaled training split
svm_classifier.fit(X=X_train_scaled, y=y_train)

In [6]:
# Label the scaled held-out split with the fitted baseline classifier
y_pred = svm_classifier.predict(X=X_test_scaled)

In [None]:
# Compare the baseline predictions against the held-out labels
baseline_report = classification_report(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)

# Per-class precision / recall / F1 summary
print("Classification Report:")
print(baseline_report)

# Counts of true vs. predicted classes
print("Confusion Matrix:")
print(baseline_confusion)

In [None]:
# Search space for the RBF-kernel SVC: 5 values of C x 4 values of gamma
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# Initialize GridSearchCV.
# - n_jobs=-1 runs the 20 candidates x 5 folds = 100 fits in parallel on all
#   available cores instead of serially; the selected parameters are unchanged.
# - verbose=1 reports overall progress without one log line per fit.
# - refit=True retrains the best candidate on the full training split so that
#   grid.predict() below uses it.
# NOTE(review): the scaler was fitted on the whole training split before this
# search, so each validation fold has influenced the scaling statistics.
# Wrapping scaler + SVC in a Pipeline would remove that leakage -- confirm
# whether it matters for these features.
grid = GridSearchCV(
    SVC(random_state=42),
    param_grid,
    refit=True,
    verbose=1,
    cv=5,
    n_jobs=-1,
)

# Fit the grid search to the data
grid.fit(X_train_scaled, y_train)

print("Best Parameters found by Grid Search:")
print(grid.best_params_)

# Mean cross-validated score of the winning candidate, for comparison with the
# held-out results printed below
print("Best mean cross-validation score:")
print(grid.best_score_)

# Predict on the test data using the best estimator
grid_predictions = grid.predict(X_test_scaled)

# Classification report
print("Classification Report after Hyperparameter Tuning:")
print(classification_report(y_test, grid_predictions))

# Confusion matrix
print("Confusion Matrix after Hyperparameter Tuning:")
print(confusion_matrix(y_test, grid_predictions))

