In [1]:
# Imports

import plotly
import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Initialise DagsHub for this repository, with its MLflow integration switched on

dagshub.init(
    repo_owner='SushrutGaikwad',
    repo_name='youtube-comments-analyzer',
    mlflow=True,
)

# Data

In [3]:
# Location of the preprocessed dataset, relative to the notebook's working directory
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_DATA_PATH)
# `clean_comment` is the text column vectorized later on, so rows missing it are dropped here
df.dropna(subset=["clean_comment"], inplace=True)
# Displayed as the cell output: (rows, columns) after the drop
df.shape

(36662, 2)

# Running the experiment

In [4]:
# Setting experiment name
# (the returned Experiment object is the last expression, so it is shown as the cell output)

mlflow.set_experiment("Improving LightGBM")

<Experiment: artifact_location='mlflow-artifacts:/eb66f0b362cf4a6e9e8119850de3216b', creation_time=1749135817604, experiment_id='7', last_update_time=1749135817604, lifecycle_stage='active', name='Improving LightGBM', tags={}>

## Preprocessing

In [5]:
# Remapping class labels from {-1, 0, 1} to {2, 0, 1}
mapping = {
    -1: 2,
    0: 0,
    1: 1
}

# The remap is written to a derived frame instead of back into `df`.
# Mapping `df["category"]` in place is not idempotent: on a second run of this
# cell the label 2 is no longer a key of `mapping`, turns into NaN, and the
# whole negative class is then silently removed by the `dropna` below.
# Labels absent from `mapping` (including missing sentiments) become NaN and
# are dropped, exactly as before.
df_labeled = (
    df
    .assign(category=df["category"].map(mapping))
    .dropna(subset=["category"])
)

# Feature engineering: token counts over unigrams and bigrams, capped vocabulary
ngram_range = (1, 2)
max_features = 1000
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)

# Train-test split, stratified on the label so both parts keep the class ratios
X_train, X_test, y_train, y_test = train_test_split(
    df_labeled["clean_comment"],
    df_labeled["category"],
    test_size=0.2,
    random_state=42,
    stratify=df_labeled["category"]
)
# The vocabulary is learned from the training texts only; the test texts are
# merely transformed with it
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Undersampling is applied to the training data only; the test set is left as split
rus = RandomUnderSampler(random_state=42)
X_train_vectorized, y_train = rus.fit_resample(
    X_train_vectorized,
    y_train
)

# Cast the count matrices to float32
# NOTE(review): presumably done for the gradient-boosting libraries - confirm
X_train_vectorized = X_train_vectorized.astype(np.float32)
X_test_vectorized = X_test_vectorized.astype(np.float32)

In [6]:
# Optuna objective function
def objective(trial):
    """Score one stacking configuration sampled from ``trial``.

    Hyperparameters are drawn for four base learners (LightGBM, XGBoost, SVM,
    logistic regression) and for the k-NN final estimator. The learners are
    combined in a ``StackingClassifier`` and evaluated with 3-fold
    cross-validation on the module-level ``X_train_vectorized`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Mean cross-validated accuracy across the folds.
    """
    # Sample every tunable value first. The order of the suggest_* calls is
    # kept as lgbm -> xgb -> logreg -> svm -> knn.
    lgbm_learning_rate = trial.suggest_float("lgbm_learning_rate", 1e-3, 0.15)
    lgbm_max_depth = trial.suggest_int("lgbm_max_depth", 3, 20)
    xgb_learning_rate = trial.suggest_float("xgb_learning_rate", 1e-3, 0.15)
    logreg_c = trial.suggest_float("logreg_C", 1e-2, 10.0, log=True)
    svm_kernel = trial.suggest_categorical("svm_kernel", ["linear", "rbf"])
    knn_n_neighbors = trial.suggest_int("knn_n_neighbors", 3, 15)

    # Base learners: sampled values combined with the fixed settings
    lgbm_clf = LGBMClassifier(
        objective="multiclass",
        num_class=3,
        learning_rate=lgbm_learning_rate,
        min_child_samples=12,
        max_depth=lgbm_max_depth,
        n_estimators=300,
        metric="multi_logloss",
        random_state=42,
        n_jobs=-1,
    )
    xgb_clf = XGBClassifier(
        objective="multi:softprob",
        num_class=3,
        learning_rate=xgb_learning_rate,
        max_depth=10,
        n_estimators=500,
        reg_alpha=0.1,
        use_label_encoder=False,
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1,
    )
    svm_clf = SVC(C=0.1, kernel=svm_kernel, gamma="scale")
    logreg_clf = LogisticRegression(
        C=logreg_c,
        penalty="l1",
        solver="liblinear",
        multi_class="ovr",
        max_iter=500,
    )

    # Meta-learner that combines the base learners' outputs; passthrough=True
    # additionally feeds it the original features
    meta_clf = KNeighborsClassifier(n_neighbors=knn_n_neighbors)

    stacked_model = StackingClassifier(
        estimators=[
            ("lgbm", lgbm_clf),
            ("xgb", xgb_clf),
            ("svm", svm_clf),
            ("logreg", logreg_clf),
        ],
        final_estimator=meta_clf,
        cv=3,
        n_jobs=-1,
        passthrough=True,
    )

    # 3-fold cross-validated accuracy, averaged over the folds
    fold_accuracies = cross_val_score(
        stacked_model,
        X_train_vectorized,
        y_train,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )
    return fold_accuracies.mean()

In [7]:
# Every other stochastic step in this notebook is pinned with random_state=42,
# but an unseeded sampler makes the search (and therefore `best_params`) differ
# on each Restart & Run All. TPESampler is what Optuna uses by default for a
# single-objective study, so seeding it keeps the algorithm and fixes the draws.
# NOTE(review): bit-for-bit repeatability also depends on the multi-threaded
# learners being deterministic - confirm if exact reproduction is required.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2025-06-07 13:42:10,487] A new study created in memory with name: no-name-1565df49-3704-4488-b461-8ac568b5a367
[I 2025-06-07 13:43:00,905] Trial 0 finished with value: 0.7628574315449126 and parameters: {'lgbm_learning_rate': 0.025299633796856753, 'lgbm_max_depth': 17, 'xgb_learning_rate': 0.11711134445854646, 'logreg_C': 0.013779128594097089, 'svm_kernel': 'linear', 'knn_n_neighbors': 15}. Best is trial 0 with value: 0.7628574315449126.
[I 2025-06-07 13:43:53,649] Trial 1 finished with value: 0.7141558047893302 and parameters: {'lgbm_learning_rate': 0.12563526915751616, 'lgbm_max_depth': 5, 'xgb_learning_rate': 0.05953613730476061, 'logreg_C': 0.07031050279908246, 'svm_kernel': 'rbf', 'knn_n_neighbors': 7}. Best is trial 0 with value: 0.7628574315449126.
[I 2025-06-07 13:44:43,052] Trial 2 finished with value: 0.7655350106092755 and parameters: {'lgbm_learning_rate': 0.06426398598151097, 'lgbm_max_depth': 15, 'xgb_learning_rate': 0.09624102484395186, 'logreg_C': 0.620602645424917, 

In [8]:
# Hyperparameters of the best trial, keyed by the names passed to trial.suggest_* in `objective`
best_params = study.best_params
best_params

{'lgbm_learning_rate': 0.10498949281491349,
 'lgbm_max_depth': 14,
 'xgb_learning_rate': 0.11222085618519292,
 'logreg_C': 1.135999125292974,
 'svm_kernel': 'linear',
 'knn_n_neighbors': 15}

In [9]:
# Rebuild every learner with the tuned values from `best_params`; all other
# settings repeat the fixed values used inside `objective`.

# Base learners
best_lightgbm_classifier = LGBMClassifier(
    objective="multiclass",
    metric="multi_logloss",
    num_class=3,
    n_estimators=300,
    min_child_samples=12,
    max_depth=best_params["lgbm_max_depth"],
    learning_rate=best_params["lgbm_learning_rate"],
    random_state=42,
    n_jobs=-1,
)
best_xgb_classifier = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=3,
    n_estimators=500,
    max_depth=10,
    reg_alpha=0.1,
    learning_rate=best_params["xgb_learning_rate"],
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
)
best_logreg_classifier = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    multi_class="ovr",
    max_iter=500,
    C=best_params["logreg_C"],
)
best_svc_classifier = SVC(kernel=best_params["svm_kernel"], C=0.1, gamma="scale")

# Final (meta) estimator
best_knn_classifier = KNeighborsClassifier(n_neighbors=best_params["knn_n_neighbors"])

# (name, estimator) pairs in the same order as in `objective`
best_estimators = list(
    zip(
        ("lgbm", "xgb", "svm", "logreg"),
        (
            best_lightgbm_classifier,
            best_xgb_classifier,
            best_svc_classifier,
            best_logreg_classifier,
        ),
    )
)

best_model = StackingClassifier(
    estimators=best_estimators,
    final_estimator=best_knn_classifier,
    passthrough=True,
    cv=3,
    n_jobs=-1,
)

In [10]:
# Fit the tuned stacking model on the undersampled, vectorized training data
best_model.fit(X_train_vectorized, y_train)

In [11]:
# Accuracy on the data the model was fitted on
y_train_pred = best_model.predict(X_train_vectorized)
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_train



0.8146408002424977

In [12]:
# Per-class precision / recall / F1 on the training data
classification_report_train = classification_report(y_train, y_train_pred)
print(classification_report_train)

              precision    recall  f1-score   support

           0       0.74      0.97      0.84      6598
           1       0.87      0.78      0.82      6598
           2       0.88      0.69      0.77      6598

    accuracy                           0.81     19794
   macro avg       0.83      0.81      0.81     19794
weighted avg       0.83      0.81      0.81     19794



In [13]:
# Accuracy on the held-out test split
y_test_pred = best_model.predict(X_test_vectorized)
accuracy_test = accuracy_score(y_test, y_test_pred)
accuracy_test



0.7815355243420156

In [14]:
# Per-class precision / recall / F1 on the held-out test split
classification_report_test = classification_report(y_test, y_test_pred)
print(classification_report_test)

              precision    recall  f1-score   support

           0       0.74      0.97      0.84      2529
           1       0.89      0.72      0.79      3154
           2       0.69      0.61      0.65      1650

    accuracy                           0.78      7333
   macro avg       0.77      0.77      0.76      7333
weighted avg       0.79      0.78      0.78      7333



In [15]:
# Function to log results to MLflow
def log_to_mlflow(
    model_name,
    improvement_technique,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    best_params
):
    """Fit ``model`` and record the run in MLflow.

    Inside a single MLflow run this sets the run name and experiment-type
    tags, fits ``model`` on the training data, logs the 3-fold
    cross-validated accuracy, the test accuracy and every per-label entry of
    the test classification report, then logs the fitted model and
    ``best_params``.

    Args:
        model_name: Used in the run name and in the logged model's path.
        improvement_technique: Used in the run name and logged as a parameter.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: Data used for fitting and for cross-validation.
        X_test, y_test: Data used for the test metrics.
        best_params: Mapping of parameter names to values, logged as run parameters.

    Returns:
        None.
    """
    with mlflow.start_run():
        # Run identification
        run_label = f"{model_name}_{improvement_technique}"
        mlflow.set_tag("mlflow.runName", run_label)
        mlflow.set_tag("experiment_type", "Improving LightGBM")
        mlflow.log_param("improvement_technique", improvement_technique)

        # Fit on the training data and predict the test set
        model.fit(X_train, y_train)
        test_predictions = model.predict(X_test)

        # Cross-validated accuracy on the training data
        fold_accuracies = cross_val_score(
            model, X_train, y_train, cv=3, scoring="accuracy", n_jobs=-1
        )
        mlflow.log_metric("cross_val_accuracy", fold_accuracies.mean())

        # Test accuracy
        test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
        mlflow.log_metric("test_accuracy", test_accuracy)

        # Classification report: only dict-valued entries are logged, so
        # scalar entries of the report are skipped
        report = classification_report(
            y_true=y_test, y_pred=test_predictions, output_dict=True
        )
        per_label_stats = (
            (label, stats)
            for label, stats in report.items()
            if isinstance(stats, dict)
        )
        for label, stats in per_label_stats:
            for stat_name, stat_value in stats.items():
                mlflow.log_metric(f"{label}: {stat_name} - test", stat_value)

        # Fitted model first, tuned hyperparameters last
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        mlflow.log_params(best_params)

In [16]:
# Record the tuned stacking model as one MLflow run.
# Note: `log_to_mlflow` calls `model.fit` itself, so `best_model` is fitted again here.
log_to_mlflow(
    model_name="LightGBM",
    improvement_technique="stacking",
    model=best_model,
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_train,
    y_test=y_test,
    best_params=best_params
)



🏃 View run LightGBM_stacking at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7/runs/e3dc7d69e6384f4c9f86afaa7cbc5c99
🧪 View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7
