In [1]:
# Imports

import plotly
import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Setting up DagsHub

dagshub.init(repo_owner='SushrutGaikwad', repo_name='youtube-comments-analyzer', mlflow=True)

# Data

In [3]:
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_DATA_PATH)
df.dropna(subset=["clean_comment"], inplace=True)
df.shape

(36662, 2)

# Running the experiment

In [4]:
# Setting experiment name

mlflow.set_experiment("Improving LightGBM")

<Experiment: artifact_location='mlflow-artifacts:/eb66f0b362cf4a6e9e8119850de3216b', creation_time=1749135817604, experiment_id='7', last_update_time=1749135817604, lifecycle_stage='active', name='Improving LightGBM', tags={}>

## Preprocessing

In [5]:
# Remapping class labels from {-1, 0, 1} to {2, 0, 1}
mapping = {
    -1: 2,
    0: 0,
    1: 1
}
df["category"] = df["category"].map(mapping)

# Removing missing sentiments
df.dropna(subset=["category"], inplace=True)

# Feature engineering
ngram_range = (1, 2)
max_features = 1000
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_comment"],
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"]
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# # Undersampling
# rus = RandomUnderSampler(random_state=42)
# X_train_vectorized, y_train = rus.fit_resample(
#     X_train_vectorized,
#     y_train
# )

X_train_vectorized = X_train_vectorized.astype(np.float32)
X_test_vectorized = X_test_vectorized.astype(np.float32)

In [6]:
# Function to log results to MLFlow
def log_to_mlflow(
    model_name,
    improvement_technique,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    best_params
):
    with mlflow.start_run():
        # Tags
        mlflow.set_tag(
            "mlflow.runName", f"{model_name}_{improvement_technique}"
        )
        mlflow.set_tag("experiment_type", "Improving LightGBM")
        
        # Logging improvement technique as a parameter
        mlflow.log_param("improvement_technique", improvement_technique)
        
        # Initializing and training the model
        model.fit(X_train, y_train)
        
        # Making predictions on the test set and logging metrics
        y_pred = model.predict(X_test)
        
        # Logging cross-val accuracy
        scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=3,
            scoring="accuracy",
            n_jobs=-1
        )
        mlflow.log_metric("cross_val_accuracy", scores.mean())
        
        # Logging accuracy
        accuracy = accuracy_score(
            y_true=y_test,
            y_pred=y_pred
        )
        mlflow.log_metric("test_accuracy", accuracy)
        
        # Logging classification report metrics
        classification_rep = classification_report(
            y_true=y_test,
            y_pred=y_pred,
            output_dict=True
        )
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}: {metric} - test", value)
        
        # Logging the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        
        # Logging the best parameters
        mlflow.log_params(best_params)

In [None]:
# Optuna objective function
def objective(trial):
    # Hyperparameter space
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    # Define the LightGBM model with the trial parameters
    model = LGBMClassifier(**params, random_state=42, n_jobs=-1)

    # Performing cross-validation
    scores = cross_val_score(
        model,
        X_train_vectorized,
        y_train,
        cv=3,
        scoring="accuracy",
        n_jobs=-1
    )

    # Returning the average accuracy across folds
    return scores.mean()

In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-06-05 23:30:28,438] A new study created in memory with name: no-name-731a2364-35b9-4540-81d6-ecd062264617
[I 2025-06-05 23:30:30,919] Trial 0 finished with value: 0.6555968208340036 and parameters: {'learning_rate': 0.023564721504423045, 'min_child_samples': 180, 'max_depth': 9, 'n_estimators': 90}. Best is trial 0 with value: 0.6555968208340036.
[I 2025-06-05 23:30:34,447] Trial 1 finished with value: 0.6726449426688698 and parameters: {'learning_rate': 0.02104596108908152, 'min_child_samples': 192, 'max_depth': 14, 'n_estimators': 184}. Best is trial 1 with value: 0.6726449426688698.
[I 2025-06-05 23:30:47,300] Trial 2 finished with value: 0.7507245870975119 and parameters: {'learning_rate': 0.04381973759052136, 'min_child_samples': 101, 'max_depth': 22, 'n_estimators': 490}. Best is trial 2 with value: 0.7507245870975119.
[I 2025-06-05 23:30:52,594] Trial 3 finished with value: 0.6832147603505806 and parameters: {'learning_rate': 0.058129634987198836, 'min_child_samples': 19

In [9]:
best_params = study.best_params
best_params

{'learning_rate': 0.06747064968662127,
 'min_child_samples': 10,
 'max_depth': 30,
 'n_estimators': 240}

In [10]:
best_model = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",
    is_unbalance=True,
    class_weight="balanced",
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    min_child_samples=best_params["min_child_samples"],
    random_state=42,
    n_jobs=-1
)

In [11]:
best_model.fit(X_train_vectorized, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5012
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 995
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [12]:
y_train_pred = best_model.predict(X_train_vectorized)
accuracy_train = accuracy_score(
    y_true=y_train,
    y_pred=y_train_pred
)
accuracy_train



0.8439428551945174

In [13]:
classification_report_train = classification_report(
    y_true=y_train,
    y_pred=y_train_pred
)
print(classification_report_train)

              precision    recall  f1-score   support

           0       0.78      0.97      0.86     10115
           1       0.94      0.80      0.86     12616
           2       0.81      0.74      0.77      6598

    accuracy                           0.84     29329
   macro avg       0.84      0.84      0.83     29329
weighted avg       0.86      0.84      0.84     29329



In [14]:
y_test_pred = best_model.predict(X_test_vectorized)
accuracy_test = accuracy_score(
    y_true=y_test,
    y_pred=y_test_pred
)
accuracy_test



0.7940815491613256

In [15]:
classification_report_test = classification_report(
    y_true=y_test,
    y_pred=y_test_pred
)
print(classification_report_test)

              precision    recall  f1-score   support

           0       0.76      0.96      0.85      2529
           1       0.89      0.75      0.81      3154
           2       0.71      0.62      0.66      1650

    accuracy                           0.79      7333
   macro avg       0.78      0.78      0.77      7333
weighted avg       0.80      0.79      0.79      7333



In [16]:
log_to_mlflow(
    model_name="LightGBM",
    improvement_technique="class_weights",
    model=best_model,
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_train,
    y_test=y_test,
    best_params=best_params
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5012
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 995
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




üèÉ View run LightGBM_class_weights at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7/runs/ed6df19c7c974906b08fcb0f1d6b8fd6
üß™ View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7
