In [None]:
# Imports

import plotly
import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Setting up DagsHub

dagshub.init(repo_owner='SushrutGaikwad', repo_name='youtube-comments-analyzer', mlflow=True)

# Data

In [3]:
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_DATA_PATH)
df.dropna(subset=["clean_comment"], inplace=True)
df.shape

(36662, 2)

# Running the experiment

In [4]:
# Setting experiment name

mlflow.set_experiment("Improving LightGBM")

<Experiment: artifact_location='mlflow-artifacts:/eb66f0b362cf4a6e9e8119850de3216b', creation_time=1749135817604, experiment_id='7', last_update_time=1749135817604, lifecycle_stage='active', name='Improving LightGBM', tags={}>

## Preprocessing

In [5]:
# Remapping class labels from {-1, 0, 1} to {2, 0, 1}
mapping = {
    -1: 2,
    0: 0,
    1: 1
}
df["category"] = df["category"].map(mapping)

# Removing missing sentiments
df.dropna(subset=["category"], inplace=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_comment"],
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"]
)

In [6]:
X_train_tokenized = [sentence.split() for sentence in X_train]
X_test_tokenized = [sentence.split() for sentence in X_test]

In [7]:
# Word2Vec
word2vec_model = Word2Vec(
    sentences=X_train_tokenized,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
    sg=1  # Skip-Gram model
)

# Generating vector representation for each comment
def vectorize_comments(tokenized_comments, word2vec_model):
    vectorized_comments = []
    for tokens in tokenized_comments:
        vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
        if len(vectors) > 0:
            vectorized_comments.append(np.mean(vectors, axis=0))
        else:
            vectorized_comments.append(np.zeros(word2vec_model.vector_size))
    return np.array(vectorized_comments)

# Vectorizing train and test comments
X_train_word2vec = vectorize_comments(X_train_tokenized, word2vec_model)
X_test_word2vec = vectorize_comments(X_test_tokenized, word2vec_model)

In [8]:
# Encoding the target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [9]:
# Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(
    X_train_word2vec,
    y_train_encoded
)

In [10]:
def objective(trial):
    # Suggest hyperparameters to be tuned
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "metric": "multi_logloss",
        # "is_unbalance": True,
        # "class_weight": "balanced",
    }

    # Initialize the LightGBM model with suggested parameters
    model = LGBMClassifier(**params, random_state=42, n_jobs=-1)

    # Perform cross-validation to evaluate the model performance
    scores = cross_val_score(
        model,
        X_train_resampled,
        y_train_resampled,
        cv=3,
        scoring="accuracy",
        n_jobs=-1
    )

    # Return the mean accuracy score across folds
    return scores.mean()

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

[I 2025-06-07 11:51:56,684] A new study created in memory with name: no-name-11854aba-c741-479d-a0e4-5fdc5b9384ce
[I 2025-06-07 11:52:10,052] Trial 0 finished with value: 0.6217035465292513 and parameters: {'learning_rate': 0.019664387089061135, 'min_child_samples': 127, 'max_depth': 8, 'n_estimators': 323}. Best is trial 0 with value: 0.6217035465292513.
[I 2025-06-07 11:52:21,249] Trial 1 finished with value: 0.6015459230069718 and parameters: {'learning_rate': 0.0175228709469025, 'min_child_samples': 60, 'max_depth': 25, 'n_estimators': 111}. Best is trial 0 with value: 0.6217035465292513.
[I 2025-06-07 11:52:36,158] Trial 2 finished with value: 0.6131656057391129 and parameters: {'learning_rate': 0.011033343303859432, 'min_child_samples': 52, 'max_depth': 14, 'n_estimators': 304}. Best is trial 0 with value: 0.6217035465292513.
[I 2025-06-07 11:52:53,369] Trial 3 finished with value: 0.6270081842982722 and parameters: {'learning_rate': 0.04685471095208119, 'min_child_samples': 42, 

In [12]:
best_params = study.best_trial.params
best_params

{'learning_rate': 0.04856071825064453,
 'min_child_samples': 126,
 'max_depth': 13,
 'n_estimators': 411}

In [13]:
best_model = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",
    # is_unbalance=True,
    # class_weight="balanced",
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    min_child_samples=best_params["min_child_samples"],
    random_state=42,
    n_jobs=-1
)

In [14]:
best_model.fit(X_train_resampled, y_train_resampled)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 300
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [15]:
y_train_pred = best_model.predict(X_train_resampled)
accuracy_train = accuracy_score(
    y_true=y_train_resampled,
    y_pred=y_train_pred
)
accuracy_train



0.9546327169849449

In [16]:
classification_report_train = classification_report(
    y_true=y_train_resampled,
    y_pred=y_train_pred
)
print(classification_report_train)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6598
           1       0.95      0.95      0.95      6598
           2       0.95      0.95      0.95      6598

    accuracy                           0.95     19794
   macro avg       0.95      0.95      0.95     19794
weighted avg       0.95      0.95      0.95     19794



In [17]:
# Make predictions on the test data
y_test_pred = best_model.predict(X_test_word2vec)
accuracy_test = accuracy_score(
    y_true=y_test_encoded,
    y_pred=y_test_pred
)
accuracy_test



0.6463930178644484

In [18]:
classification_report_test = classification_report(
    y_true=y_test_encoded,
    y_pred=y_test_pred
)
print(classification_report_test)

              precision    recall  f1-score   support

           0       0.74      0.73      0.74      2529
           1       0.72      0.63      0.67      3154
           2       0.44      0.56      0.49      1650

    accuracy                           0.65      7333
   macro avg       0.63      0.64      0.63      7333
weighted avg       0.67      0.65      0.65      7333



In [19]:
# Function to log results to MLFlow
def log_to_mlflow(
    model_name,
    improvement_technique,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    best_params
):
    with mlflow.start_run():
        # Tags
        mlflow.set_tag(
            "mlflow.runName", f"{model_name}_{improvement_technique}"
        )
        mlflow.set_tag("experiment_type", "Improving LightGBM")
        
        # Logging improvement technique as a parameter
        mlflow.log_param("improvement_technique", improvement_technique)
        
        # Initializing and training the model
        model.fit(X_train, y_train)
        
        # Making predictions on the test set and logging metrics
        y_pred = model.predict(X_test)
        
        # Logging cross-val accuracy
        scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=3,
            scoring="accuracy",
            n_jobs=-1
        )
        mlflow.log_metric("cross_val_accuracy", scores.mean())
        
        # Logging accuracy
        accuracy = accuracy_score(
            y_true=y_test,
            y_pred=y_pred
        )
        mlflow.log_metric("test_accuracy", accuracy)
        
        # Logging classification report metrics
        classification_rep = classification_report(
            y_true=y_test,
            y_pred=y_pred,
            output_dict=True
        )
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}: {metric} - test", value)
        
        # Logging the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        
        # Logging the best parameters
        mlflow.log_params(best_params)

In [20]:
log_to_mlflow(
    model_name="LightGBM",
    improvement_technique="word2vec",
    model=best_model,
    X_train=X_train_resampled,
    X_test=X_test_word2vec,
    y_train=y_train_resampled,
    y_test=y_test_encoded,
    best_params=best_params
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 300
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_word2vec at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7/runs/df9e8ef9b1754c55962f70448bec9d82
🧪 View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/7
