In [1]:
# Imports

import optuna
import mlflow
import dagshub
import mlflow.sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Setting up DagsHub

dagshub.init(repo_owner='SushrutGaikwad', repo_name='youtube-comments-analyzer', mlflow=True)

# Data

In [3]:
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_DATA_PATH)
df.dropna(subset=["clean_comment"], inplace=True)
df.shape

(36662, 2)

# Running the experiment

In [4]:
# Setting experiment name

mlflow.set_experiment("Exp 4: ML algorithms with hyperparameter tuning")

<Experiment: artifact_location='mlflow-artifacts:/3f66177ebab24d6392ce8f143c28100b', creation_time=1749046747756, experiment_id='5', last_update_time=1749046747756, lifecycle_stage='active', name='Exp 4: ML algorithms with hyperparameter tuning', tags={}>

## Preprocessing

In [5]:
# Remapping class labels from {-1, 0, 1} to {2, 0, 1}
mapping = {
    -1: 2,
    0: 0,
    1: 1
}
df["category"] = df["category"].map(mapping)

# Removing missing sentiments
df.dropna(subset=["category"], inplace=True)

# Feature engineering
ngram_range = (1, 2)
max_features = 1000
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_comment"],
    df["category"],
    test_size=0.2,
    random_state=42,
    stratify=df["category"]
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_vectorized, y_train = rus.fit_resample(
    X_train_vectorized,
    y_train
)

X_train_vectorized = X_train_vectorized.astype(np.float32)
X_test_vectorized = X_test_vectorized.astype(np.float32)

## Running the experiment

In [6]:
# Function to log results to MLFlow
def log_to_mlflow(
    model_name,
    model,
    X_train,
    X_test,
    y_train,
    y_test
):
    with mlflow.start_run():
        # Tags
        mlflow.set_tag(
            "mlflow.runName", f"{model_name}_BoW_bigrams_1000_undersampling"
        )
        mlflow.set_tag("experiment_type", "ML_models_comparison")
        
        # Logging model name as a parameter
        mlflow.log_param("model_name", model_name)
        
        # Initializing and training the model
        model.fit(X_train, y_train)
        
        # Making predictions on the test set and logging metrics
        y_pred = model.predict(X_test)
        
        # Logging accuracy
        accuracy = accuracy_score(
            y_true=y_test,
            y_pred=y_pred
        )
        mlflow.log_metric("accuracy", accuracy)
        
        # Logging classification report metrics
        classification_rep = classification_report(
            y_true=y_test,
            y_pred=y_pred,
            output_dict=True
        )
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}: {metric}", value)
        
        # Logging the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [7]:
# Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    
    model = LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    return accuracy_score(
        y_true=y_test,
        y_pred=y_pred
    )

In [8]:
def run_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)
    
    # Getting the best parameters and logging the best model
    best_params = study.best_params
    best_model = LGBMClassifier(
        n_estimators=best_params["n_estimators"],
        learning_rate=best_params["learning_rate"],
        max_depth=best_params["max_depth"],
        random_state=42,
        n_jobs=-1
    )
    log_to_mlflow(
        model_name="LightGBM",
        model=best_model,
        X_train=X_train_vectorized,
        X_test=X_test_vectorized,
        y_train=y_train,
        y_test=y_test
    )

In [9]:
run_experiment()

[I 2025-06-04 20:34:45,586] A new study created in memory with name: no-name-7fc02670-cc70-45af-898a-2f87512896d4


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:46,261] Trial 0 finished with value: 0.5509341333696987 and parameters: {'n_estimators': 126, 'learning_rate': 0.0010008366520759123, 'max_depth': 5}. Best is trial 0 with value: 0.5509341333696987.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:47,912] Trial 1 finished with value: 0.5659348152188736 and parameters: {'n_estimators': 271, 'learning_rate': 0.0007942334189630676, 'max_depth': 6}. Best is trial 1 with value: 0.5659348152188736.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:48,706] Trial 2 finished with value: 0.6380744579299059 and parameters: {'n_estimators': 114, 'learning_rate': 0.005202737389326532, 'max_depth': 10}. Best is trial 2 with value: 0.6380744579299059.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015749 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:49,250] Trial 3 finished with value: 0.5678439929087686 and parameters: {'n_estimators': 221, 'learning_rate': 0.003927028200307429, 'max_depth': 3}. Best is trial 2 with value: 0.6380744579299059.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:50,489] Trial 4 finished with value: 0.5608891313241511 and parameters: {'n_estimators': 246, 'learning_rate': 0.0005065325210486288, 'max_depth': 6}. Best is trial 2 with value: 0.6380744579299059.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010834 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:51,510] Trial 5 finished with value: 0.6343924723851084 and parameters: {'n_estimators': 186, 'learning_rate': 0.0013234476197729072, 'max_depth': 10}. Best is trial 2 with value: 0.6380744579299059.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:53,245] Trial 6 finished with value: 0.614618846311196 and parameters: {'n_estimators': 248, 'learning_rate': 0.00015123131191287695, 'max_depth': 10}. Best is trial 2 with value: 0.6380744579299059.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:53,583] Trial 7 finished with value: 0.6618028092186008 and parameters: {'n_estimators': 89, 'learning_rate': 0.03642653817748584, 'max_depth': 4}. Best is trial 7 with value: 0.6618028092186008.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:55,282] Trial 8 finished with value: 0.6163916541660984 and parameters: {'n_estimators': 236, 'learning_rate': 0.00023602609489193775, 'max_depth': 10}. Best is trial 7 with value: 0.6618028092186008.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:56,005] Trial 9 finished with value: 0.7515341606436656 and parameters: {'n_estimators': 99, 'learning_rate': 0.070518076824792, 'max_depth': 9}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:56,421] Trial 10 finished with value: 0.7215327969453157 and parameters: {'n_estimators': 50, 'learning_rate': 0.09151401603033445, 'max_depth': 8}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:56,827] Trial 11 finished with value: 0.7222146461202782 and parameters: {'n_estimators': 55, 'learning_rate': 0.08308881655658955, 'max_depth': 8}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:57,175] Trial 12 finished with value: 0.6541660984590209 and parameters: {'n_estimators': 56, 'learning_rate': 0.021681942882617215, 'max_depth': 8}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:58,055] Trial 13 finished with value: 0.6852584208373108 and parameters: {'n_estimators': 159, 'learning_rate': 0.015371755791590534, 'max_depth': 8}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:58,570] Trial 14 finished with value: 0.7511250511386881 and parameters: {'n_estimators': 90, 'learning_rate': 0.08888668997332368, 'max_depth': 8}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:34:59,258] Trial 15 finished with value: 0.6498022637392609 and parameters: {'n_estimators': 101, 'learning_rate': 0.012317890305145365, 'max_depth': 7}. Best is trial 9 with value: 0.7515341606436656.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:00,324] Trial 16 finished with value: 0.7603981999181781 and parameters: {'n_estimators': 149, 'learning_rate': 0.06171815419687528, 'max_depth': 9}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:01,257] Trial 17 finished with value: 0.7519432701486432 and parameters: {'n_estimators': 154, 'learning_rate': 0.045283470315880606, 'max_depth': 9}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:02,095] Trial 18 finished with value: 0.6603027410336834 and parameters: {'n_estimators': 158, 'learning_rate': 0.008037725013617525, 'max_depth': 9}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:03,438] Trial 19 finished with value: 0.7503068321287332 and parameters: {'n_estimators': 195, 'learning_rate': 0.036400109750821134, 'max_depth': 9}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:04,442] Trial 20 finished with value: 0.5816173462430111 and parameters: {'n_estimators': 148, 'learning_rate': 0.0020537769171964757, 'max_depth': 7}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:05,407] Trial 21 finished with value: 0.7353061502795581 and parameters: {'n_estimators': 135, 'learning_rate': 0.039037649166026936, 'max_depth': 9}. Best is trial 16 with value: 0.7603981999181781.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:06,434] Trial 22 finished with value: 0.7653075139779081 and parameters: {'n_estimators': 200, 'learning_rate': 0.05356851305863035, 'max_depth': 9}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:07,551] Trial 23 finished with value: 0.7314877948997681 and parameters: {'n_estimators': 205, 'learning_rate': 0.02312185808392673, 'max_depth': 9}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:08,509] Trial 24 finished with value: 0.7456702577389881 and parameters: {'n_estimators': 169, 'learning_rate': 0.05059209948758482, 'max_depth': 7}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:10,244] Trial 25 finished with value: 0.6825310241374608 and parameters: {'n_estimators': 295, 'learning_rate': 0.0070264021287233585, 'max_depth': 9}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:11,716] Trial 26 finished with value: 0.7335333424246556 and parameters: {'n_estimators': 214, 'learning_rate': 0.021750840454535895, 'max_depth': 10}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:12,803] Trial 27 finished with value: 0.6694395199781809 and parameters: {'n_estimators': 180, 'learning_rate': 0.012311973445021841, 'max_depth': 7}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:13,779] Trial 28 finished with value: 0.7396699849993181 and parameters: {'n_estimators': 140, 'learning_rate': 0.04609692924723584, 'max_depth': 8}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:14,428] Trial 29 finished with value: 0.6804854766125733 and parameters: {'n_estimators': 121, 'learning_rate': 0.02652523889310575, 'max_depth': 6}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:14,974] Trial 30 finished with value: 0.7378971771444156 and parameters: {'n_estimators': 164, 'learning_rate': 0.0566461522854466, 'max_depth': 5}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:15,459] Trial 31 finished with value: 0.7357152597845357 and parameters: {'n_estimators': 76, 'learning_rate': 0.06521403055800093, 'max_depth': 9}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:16,241] Trial 32 finished with value: 0.7500340924587481 and parameters: {'n_estimators': 114, 'learning_rate': 0.06261947304446794, 'max_depth': 9}. Best is trial 22 with value: 0.7653075139779081.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:17,258] Trial 33 finished with value: 0.7785353879721806 and parameters: {'n_estimators': 132, 'learning_rate': 0.09878961044526967, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:18,149] Trial 34 finished with value: 0.7357152597845357 and parameters: {'n_estimators': 145, 'learning_rate': 0.03154222225781541, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:19,173] Trial 35 finished with value: 0.7043501977362607 and parameters: {'n_estimators': 191, 'learning_rate': 0.015132137358899797, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:19,703] Trial 36 finished with value: 0.6394381562798309 and parameters: {'n_estimators': 125, 'learning_rate': 0.003573454234211947, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:20,433] Trial 37 finished with value: 0.601254602481931 and parameters: {'n_estimators': 175, 'learning_rate': 0.0005190310786248228, 'max_depth': 9}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:20,785] Trial 38 finished with value: 0.7500340924587481 and parameters: {'n_estimators': 222, 'learning_rate': 0.09680202884576013, 'max_depth': 3}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008856 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:21,587] Trial 39 finished with value: 0.6356198009000409 and parameters: {'n_estimators': 152, 'learning_rate': 0.0022950600355634287, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:22,401] Trial 40 finished with value: 0.7483976544388381 and parameters: {'n_estimators': 266, 'learning_rate': 0.0485861506502968, 'max_depth': 5}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:23,046] Trial 41 finished with value: 0.7518069003136506 and parameters: {'n_estimators': 106, 'learning_rate': 0.06853846871281775, 'max_depth': 9}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:23,677] Trial 42 finished with value: 0.7426701213691531 and parameters: {'n_estimators': 111, 'learning_rate': 0.06389450174628947, 'max_depth': 8}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:24,166] Trial 43 finished with value: 0.7760807309423156 and parameters: {'n_estimators': 133, 'learning_rate': 0.09753814535258458, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:24,712] Trial 44 finished with value: 0.7339424519296331 and parameters: {'n_estimators': 138, 'learning_rate': 0.0333310127679142, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:25,255] Trial 45 finished with value: 0.7732169644074731 and parameters: {'n_estimators': 129, 'learning_rate': 0.09048135287935384, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:25,738] Trial 46 finished with value: 0.7764898404472931 and parameters: {'n_estimators': 130, 'learning_rate': 0.09870702299838989, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008812 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:26,030] Trial 47 finished with value: 0.7423973816991681 and parameters: {'n_estimators': 67, 'learning_rate': 0.08073907744723295, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:26,571] Trial 48 finished with value: 0.774580662757398 and parameters: {'n_estimators': 131, 'learning_rate': 0.09207523727463884, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-06-04 20:35:27,014] Trial 49 finished with value: 0.7638074457929905 and parameters: {'n_estimators': 87, 'learning_rate': 0.0988645912433744, 'max_depth': 10}. Best is trial 33 with value: 0.7785353879721806.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 19794, number of used features: 984
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_BoW_bigrams_1000_undersampling at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/5/runs/95416144780e411ca7623412707b9f0f
🧪 View experiment at: https://dagshub.com/SushrutGaikwad/youtube-comments-analyzer.mlflow/#/experiments/5
