In [2]:
import pandas as pd
import numpy as np

# Load embeddings
train_embeddings = pd.read_csv("../artifacts/embeddings/train_embeddings.csv")
test_embeddings = pd.read_csv("../artifacts/embeddings/test_embeddings.csv")

X_train = train_embeddings.drop(columns=["score"])
y_train = train_embeddings["score"]

X_test = test_embeddings.drop(columns=["score"])
y_test = test_embeddings["score"]



In [3]:
train_embeddings.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,score
0,0.043641,-0.025014,0.049839,-0.03329,0.036302,-0.083492,0.021285,-0.006104,0.0307,0.10801,...,-0.068096,0.040192,-0.039524,-0.045027,0.060908,0.017846,-0.064278,-0.004803,-0.009702,0
1,-0.03875,-0.023617,0.07752,0.036366,-0.041527,-0.018027,0.052052,-0.021477,0.011939,0.014804,...,-0.015257,0.023544,0.065262,0.142146,-0.025768,0.02206,0.002112,-0.029069,0.006965,1
2,-0.106064,0.019397,0.004978,-0.057788,-0.090887,0.014373,0.078101,0.033931,-0.025732,0.025954,...,-0.007251,0.019602,0.065283,-0.00368,0.014151,0.035492,0.011684,-0.053421,-0.044654,0
3,-0.033526,-0.101945,0.022593,-0.00224,0.092876,-0.009948,-0.104065,-0.095733,-0.045092,0.043954,...,-0.047092,-0.037452,-0.059408,-0.003875,-0.046511,0.097345,-0.005512,0.008243,0.082393,0
4,-0.122104,-0.010321,-0.008574,-0.042038,-0.02966,-0.018314,0.063233,-0.063453,-0.012131,0.052477,...,-0.015115,0.003031,-0.007256,-0.012781,-0.011065,0.019015,0.027191,0.055023,-0.041305,0


In [4]:
train_embeddings['score'].value_counts(), test_embeddings['score'].value_counts()

(score
 0    28061
 1    24078
 Name: count, dtype: int64,
 score
 0    7015
 1    6020
 Name: count, dtype: int64)

In [5]:
train_embeddings.isna().sum().sum(), test_embeddings.isna().sum().sum()

(np.int64(0), np.int64(0))

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


scaler = StandardScaler()

train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(train_scaled)
X_test_pca = pca.transform(test_scaled)

print("Original shape:", train_embeddings.shape)
print("After PCA shape:", X_train_pca.shape)

Original shape: (52139, 385)
After PCA shape: (52139, 236)


In [36]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score


svm = LinearSVC(C=1.0, max_iter=2000)
svm.fit(X_train_pca, y_train)
train_predictions = svm.predict(X_train_pca)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Train Accuracy: {train_accuracy:.4f}")
test_predictions_binary = svm.predict(X_test_pca)
accuracy = accuracy_score(y_test, test_predictions_binary)
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report (Binary):")
print(classification_report(y_test, test_predictions_binary))


Train Accuracy: 0.8716
Test Accuracy: 0.8753

Classification Report (Binary):
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      7015
           1       0.90      0.82      0.86      6020

    accuracy                           0.88     13035
   macro avg       0.88      0.87      0.87     13035
weighted avg       0.88      0.88      0.87     13035



In [6]:
import optuna
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def objective(trial):
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    loss = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])
    dual = trial.suggest_categorical('dual', [True, False])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    if loss == 'hinge' and dual is False:
        raise optuna.TrialPruned()
    
    model = LinearSVC(
        C=C,
        loss=loss,
        dual=dual,
        class_weight=class_weight,
        max_iter=3000,
        random_state=42
    )
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    acc = accuracy_score(y_test, y_pred)

    return acc   

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)
best_model = LinearSVC(**study.best_params, max_iter=3000, random_state=42)
best_model.fit(X_train_pca, y_train)
y_pred=best_model.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_pred))
y_pred = best_model.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[I 2025-08-17 00:39:07,546] A new study created in memory with name: no-name-b7da64b9-115f-43fa-a352-e4f26ab88144
[I 2025-08-17 00:42:38,284] Trial 0 finished with value: 0.8589950134253932 and parameters: {'C': 4.981364780562834, 'loss': 'squared_hinge', 'dual': True, 'class_weight': None}. Best is trial 0 with value: 0.8589950134253932.
[I 2025-08-17 00:42:59,742] Trial 1 finished with value: 0.8751822017644802 and parameters: {'C': 0.30431985797164757, 'loss': 'hinge', 'dual': True, 'class_weight': None}. Best is trial 1 with value: 0.8751822017644802.
[I 2025-08-17 00:43:01,913] Trial 2 finished with value: 0.8751054852320675 and parameters: {'C': 0.011056304969730842, 'loss': 'squared_hinge', 'dual': False, 'class_weight': None}. Best is trial 1 with value: 0.8751822017644802.
[I 2025-08-17 00:43:52,129] Trial 3 finished with value: 0.8746451860375911 and parameters: {'C': 0.07376449973131216, 'loss': 'squared_hinge', 'dual': True, 'class_weight': 'balanced'}. Best is trial 1 with

Best hyperparameters: {'C': 1.9929996462312742, 'loss': 'hinge', 'dual': True, 'class_weight': 'balanced'}
Best validation accuracy: 0.8767932489451477

Train Accuracy: 0.872571395692284

Test Accuracy: 0.8767932489451477
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      7015
           1       0.89      0.83      0.86      6020

    accuracy                           0.88     13035
   macro avg       0.88      0.87      0.88     13035
weighted avg       0.88      0.88      0.88     13035





In [7]:
import optuna
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

def objective(trial):
    var_smoothing = trial.suggest_float("var_smoothing", 1e-12, 1e-6, log=True)
    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    acc = accuracy_score(y_test, y_pred)
    return acc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)
best_model = GaussianNB(**study.best_params)
best_model.fit(X_train_pca, y_train)

y_pred = best_model.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_pred))


y_pred = best_model.predict(X_test_pca)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[I 2025-08-17 01:24:47,182] A new study created in memory with name: no-name-a4bdf1e8-8bf9-4953-9c5c-cef22e7100ac


[I 2025-08-17 01:24:49,394] Trial 0 finished with value: 0.7895665515918681 and parameters: {'var_smoothing': 7.351314520089414e-11}. Best is trial 0 with value: 0.7895665515918681.
[I 2025-08-17 01:24:51,698] Trial 1 finished with value: 0.7895665515918681 and parameters: {'var_smoothing': 1.180523576612653e-11}. Best is trial 0 with value: 0.7895665515918681.
[I 2025-08-17 01:24:53,481] Trial 2 finished with value: 0.7895665515918681 and parameters: {'var_smoothing': 2.342788683503452e-08}. Best is trial 0 with value: 0.7895665515918681.
[I 2025-08-17 01:24:54,733] Trial 3 finished with value: 0.7895665515918681 and parameters: {'var_smoothing': 2.903857077507099e-12}. Best is trial 0 with value: 0.7895665515918681.
[I 2025-08-17 01:24:55,967] Trial 4 finished with value: 0.7895665515918681 and parameters: {'var_smoothing': 1.4348607013406116e-09}. Best is trial 0 with value: 0.7895665515918681.
[I 2025-08-17 01:24:57,133] Trial 5 finished with value: 0.7895665515918681 and parameter

Best hyperparameters: {'var_smoothing': 7.351314520089414e-11}
Best validation accuracy: 0.7895665515918681

Train Accuracy: 0.790828362646004

Test Accuracy: 0.7895665515918681
              precision    recall  f1-score   support

           0       0.80      0.81      0.80      7015
           1       0.77      0.77      0.77      6020

    accuracy                           0.79     13035
   macro avg       0.79      0.79      0.79     13035
weighted avg       0.79      0.79      0.79     13035



In [8]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def objective(trial):
    # Suggest hyperparameters for Logistic Regression
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)  
    penalty = trial.suggest_categorical("penalty", ["l2", "none"])  
    solver = trial.suggest_categorical("solver", ["lbfgs", "saga"])  
    if penalty == "none" and solver not in ["lbfgs", "saga"]:
        raise optuna.TrialPruned()
    
    model = LogisticRegression(
        C=C,
        penalty=penalty if penalty != "none" else None,
        solver=solver,
        max_iter=3000,
        random_state=42
    )
    
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    acc = accuracy_score(y_test, y_pred)
    return acc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

best_model = LogisticRegression(**study.best_params,max_iter=3000,random_state=42)
best_model.fit(X_train_pca, y_train)

y_pred = best_model.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_pred))

y_pred = best_model.predict(X_test_pca)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[I 2025-08-17 01:26:40,877] A new study created in memory with name: no-name-ea01c4f3-2a6b-407b-9733-922528eb40f9
[I 2025-08-17 01:26:46,610] Trial 0 finished with value: 0.8742616033755274 and parameters: {'C': 127.96564931935983, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 0 with value: 0.8742616033755274.
[I 2025-08-17 01:26:49,460] Trial 1 finished with value: 0.8743383199079402 and parameters: {'C': 3.1766581565269973, 'penalty': 'none', 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8743383199079402.
[I 2025-08-17 01:26:55,951] Trial 2 finished with value: 0.8742616033755274 and parameters: {'C': 283.3080439036442, 'penalty': 'none', 'solver': 'saga'}. Best is trial 1 with value: 0.8743383199079402.
[I 2025-08-17 01:27:00,987] Trial 3 finished with value: 0.8741848868431147 and parameters: {'C': 0.013951726814825562, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 1 with value: 0.8743383199079402.
[I 2025-08-17 01:27:06,248] Trial 4 finished with value: 0.8742616033755

Best hyperparameters: {'C': 0.005479833045425943, 'penalty': 'l2', 'solver': 'lbfgs'}
Best validation accuracy: 0.8744917529727656

Train Accuracy: 0.8716124206448148

Test Accuracy: 0.8744917529727656
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      7015
           1       0.89      0.83      0.86      6020

    accuracy                           0.87     13035
   macro avg       0.88      0.87      0.87     13035
weighted avg       0.88      0.87      0.87     13035



In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=1, gamma='scale' , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.9230326626901169

Test Accuracy: 0.883927886459532

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      7015
           1       0.91      0.83      0.87      6020

    accuracy                           0.88     13035
   macro avg       0.89      0.88      0.88     13035
weighted avg       0.89      0.88      0.88     13035



In [4]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=1.2, gamma='scale' , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.9289974874853756

Test Accuracy: 0.8838511699271193

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      7015
           1       0.91      0.83      0.87      6020

    accuracy                           0.88     13035
   macro avg       0.89      0.88      0.88     13035
weighted avg       0.89      0.88      0.88     13035



In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma='scale' , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.9028941866932623

Test Accuracy: 0.8819332566168009

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      7015
           1       0.91      0.82      0.87      6020

    accuracy                           0.88     13035
   macro avg       0.89      0.88      0.88     13035
weighted avg       0.88      0.88      0.88     13035



In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma=1 , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.5381959761407008

Test Accuracy: 0.5381664748753356

Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      7015
           1       0.00      0.00      0.00      6020

    accuracy                           0.54     13035
   macro avg       0.27      0.50      0.35     13035
weighted avg       0.29      0.54      0.38     13035



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma=2 , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.5381959761407008

Test Accuracy: 0.5381664748753356

Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      7015
           1       0.00      0.00      0.00      6020

    accuracy                           0.54     13035
   macro avg       0.27      0.50      0.35     13035
weighted avg       0.29      0.54      0.38     13035



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma=0.1 , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.5425497228562113

Test Accuracy: 0.5383966244725739

Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      7015
           1       1.00      0.00      0.00      6020

    accuracy                           0.54     13035
   macro avg       0.77      0.50      0.35     13035
weighted avg       0.75      0.54      0.38     13035



In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma=0.01 , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.9381461094382324

Test Accuracy: 0.8733410049865746

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      7015
           1       0.91      0.80      0.85      6020

    accuracy                           0.87     13035
   macro avg       0.88      0.87      0.87     13035
weighted avg       0.88      0.87      0.87     13035



In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.5, gamma=0.05 , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.5678666641093999

Test Accuracy: 0.5419255849635597

Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      7015
           1       0.92      0.01      0.02      6020

    accuracy                           0.54     13035
   macro avg       0.73      0.50      0.36     13035
weighted avg       0.71      0.54      0.39     13035



In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.3, gamma='scale' , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))



Train Accuracy: 0.893822282744203

Test Accuracy: 0.8805523590333717

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      7015
           1       0.91      0.82      0.86      6020

    accuracy                           0.88     13035
   macro avg       0.89      0.88      0.88     13035
weighted avg       0.88      0.88      0.88     13035



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

svm = SVC(kernel="rbf", C=0.4, gamma='scale' , random_state=42)
svm.fit(X_train_pca, y_train)
y_train_pred = svm.predict(X_train_pca)
print("\nTrain Accuracy:", accuracy_score(y_train, y_train_pred))
y_test_pred = svm.predict(X_test_pca)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))
