### Using Hyperparameter Tuning with XGBoost

In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import os

import warnings
warnings.filterwarnings('ignore')

In [28]:
df = pd.read_csv("./preprocessed_data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [29]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

### Model Training

In [30]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from imblearn.over_sampling import ADASYN

In [None]:
import mlflow
import mlflow.sklearn        # for general sklearn models
import mlflow.xgboost        # for XGBoost models
import mlflow.lightgbm       # for LightGBM models
import pickle
import dagshub

import logging
import time

In [None]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Set up DagsHub credentials for MLflow tracking
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")

if not username or not token:
    raise ValueError("Missing DagsHub credentials in environment variables")

# Construct the authenticated MLflow tracking URI
mlflow_uri = f"https://{username}:{token}@dagshub.com/{username}/YouTube-Sentiment-Insights-Plugin.mlflow"

dagshub.init(repo_owner=username, repo_name="YouTube-Sentiment-Insights-Plugin", mlflow=True)
mlflow.set_tracking_uri(mlflow_uri)

In [34]:
# Set or create an experiment
experiment = 'Exp 5 - ML Algo (XGBoost) & (LGBM) with HP Tuning'
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/5e1e10cdd36d45d89572e8e07b05bed9', creation_time=1760737518154, experiment_id='12', last_update_time=1760737518154, lifecycle_stage='active', name='Exp 5 - ML Algo (XGBoost) & (LGBM) with HP Tuning', tags={}>

In [35]:
# TF-IDF Params
ngram_range = (1, 3)  
vec_max_features = 1000

# Vectorizer
vec = TfidfVectorizer(ngram_range=ngram_range, max_features=vec_max_features)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], 
    test_size=0.2, random_state=42, stratify=df['category']
)

# Vectorization
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Handle imbalance
adasyn = ADASYN(random_state=42)
X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)

In [40]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLFlow run ....")

# --- MLflow Logging Function ---
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, best_params, vectorizer=None):
    start_time = time.time()
    try:
        with mlflow.start_run() as run:
            # ------------------ Metadata ------------------
            logging.info("Logging Metadata ....")
            
            mlflow.set_tag("mlflow.runName", f"{model_name}_ADASYN_TFIDF_TriGram")
            mlflow.set_tag('experiment_type', 'hypertuning')
            mlflow.set_tag("model_type", model_name)
            mlflow.set_tag("description", f"{model_name} with TF-IDF TriGram & ADASYN")
    
            # ------------------ Log Algorithm and Best params ------------------
            logging.info("Logging Algorithm and its Best Params ....")
            
            mlflow.log_param("algo_name", model_name)
            mlflow.log_params(best_params)
    
            # ------------------ Train Model & Predict ------------------
            logging.info("Model Training & Predicion Started ....")
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            logging.info("Model Training & Predicion Ended ....")
    
            # ------------------ Metrics ------------------
            logging.info("Logging Metrics ....")
            
            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)
    
            # Log classification report
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in classification_rep.items():
                if isinstance(metrics, dict):
                    for metric, value in metrics.items():
                        mlflow.log_metric(f"{label}_{metric}", value)
    
            # ------------------ Confusion Matrix ------------------
            conf_matrix = confusion_matrix(y_test, y_pred)
            plt.figure(figsize=(8, 6))
            sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
            plt.xlabel("Predicted")
            plt.ylabel("Actual")
            plt.title("Confusion Matrix")
            plt.tight_layout()
        
            mlflow.log_figure(plt.gcf(), "Confusion_Matrix.png")  # log the plot
            plt.close()
    
            # ------------------ Log the Model Properly ------------------
            logging.info(f"Logging the model {model_name} ....")
            
            if model_name.lower() == "xgboost":
                mlflow.xgboost.log_model(
                    xgb_model=model,
                    artifact_path="model",  # will create artifacts/model folder
                    # registered_model_name=f"{model_name}"
                )
            else:
                mlflow.lightgbm.log_model(
                    lgb_model=model,
                    artifact_path="model",  # will create artifacts/model folder
                    # registered_model_name=f"{model_name}"
                )
    
            end_time = time.time()
        
            logging.info(f"Completed the Experiment in {end_time-start_time} seconds")
        
            logging.info(f"Accuracy -> {accuracy:.2f}")

    except Exception as e:
        logging.error(f"Error occured while Vectorizing: {e}")
        raise

2025-10-18 03:33:10,047 - INFO - Starting MLFlow run ....


In [None]:
# --- Optuna Objective Functions ---
def objective_xgboost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'random_state': 42,
    }
    model = XGBClassifier(**params, tree_method='gpu_hist', predictor='gpu_predictor', n_jobs=-1)
    score = cross_val_score(model, X_train_vec, y_train, cv=3, scoring='f1_macro').mean()
    return score


def objective_lightgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True)
    }
    model = LGBMClassifier(**params, device='gpu', random_state=42, n_jobs=-1)
    score = cross_val_score(model, X_train_vec, y_train, cv=3, scoring='f1_macro').mean()
    return score

In [42]:
# --- Run Optuna & Log Best Model ---
def run_optuna(model_name, n_trials=15):
    if model_name.lower() == "xgboost":
        study = optuna.create_study(direction="maximize")
        study.optimize(objective_xgboost, n_trials=n_trials)
        best_model = XGBClassifier(**study.best_params, tree_method='gpu_hist', predictor='gpu_predictor', n_jobs=-1, random_state=42)
    else:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective_lightgbm, n_trials=n_trials)
        best_model = LGBMClassifier(**study.best_params, device='gpu', n_jobs=-1, random_state=42)

    # Log final model
    log_mlflow(model_name, best_model, X_train_vec, X_test_vec, y_train, y_test, study.best_params, vectorizer=vec)

In [44]:
# run_optuna('XGBoost')
run_optuna('LGBMClassifier')

[I 2025-10-18 03:42:29,185] A new study created in memory with name: no-name-4db8c64f-043b-4ad8-8779-dc03fae7bfe7


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 69672
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 858
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 69536
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 864
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:42:45,816] Trial 0 finished with value: 0.7249133802133169 and parameters: {'n_estimators': 256, 'learning_rate': 0.016238602390232792, 'max_depth': 12, 'num_leaves': 129, 'min_child_samples': 89, 'colsample_bytree': 0.8803147977838606, 'subsample': 0.8073543653424531, 'reg_alpha': 0.09443490177489915, 'reg_lambda': 0.4154908954871287}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72695
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 974
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72381
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 974
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:43:49,543] Trial 1 finished with value: 0.6936378552706789 and parameters: {'n_estimators': 923, 'learning_rate': 0.002806588930148777, 'max_depth': 9, 'num_leaves': 70, 'min_child_samples': 23, 'colsample_bytree': 0.5213971050759543, 'subsample': 0.8743210846599482, 'reg_alpha': 1.2052033459980622, 'reg_lambda': 0.6396042519621732}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72568
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 965
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72254
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 965
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:44:12,308] Trial 2 finished with value: 0.35451400783972825 and parameters: {'n_estimators': 513, 'learning_rate': 0.00015481546420043497, 'max_depth': 8, 'num_leaves': 104, 'min_child_samples': 45, 'colsample_bytree': 0.6175710995620023, 'subsample': 0.5275818231998889, 'reg_alpha': 3.877844470113717, 'reg_lambda': 0.06031259826911767}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72695
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 974
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72381
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 974
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:44:21,558] Trial 3 finished with value: 0.2993051231387083 and parameters: {'n_estimators': 251, 'learning_rate': 0.00012720206765490843, 'max_depth': 5, 'num_leaves': 126, 'min_child_samples': 27, 'colsample_bytree': 0.8783234461223843, 'subsample': 0.9950355032524947, 'reg_alpha': 0.0008441243894014495, 'reg_lambda': 7.659521966269802}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 69672
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 858
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 69536
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 864
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:44:38,308] Trial 4 finished with value: 0.6134018427961203 and parameters: {'n_estimators': 614, 'learning_rate': 0.0038779446189963645, 'max_depth': 3, 'num_leaves': 62, 'min_child_samples': 89, 'colsample_bytree': 0.5977529902694971, 'subsample': 0.6166430158433551, 'reg_alpha': 0.001123462407574419, 'reg_lambda': 4.526247472998621}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72695
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 974
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72397
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 976
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:44:42,745] Trial 5 finished with value: 0.6786274912312993 and parameters: {'n_estimators': 185, 'learning_rate': 0.03481699696896135, 'max_depth': 3, 'num_leaves': 112, 'min_child_samples': 19, 'colsample_bytree': 0.846867489499721, 'subsample': 0.8975463571859674, 'reg_alpha': 0.33250396993296977, 'reg_lambda': 0.01672225482903505}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72702
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 975
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72404
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 977
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:45:04,441] Trial 6 finished with value: 0.34149291485412564 and parameters: {'n_estimators': 442, 'learning_rate': 0.00028275238340595514, 'max_depth': 7, 'num_leaves': 23, 'min_child_samples': 18, 'colsample_bytree': 0.777015269809053, 'subsample': 0.8009243125024594, 'reg_alpha': 0.007678295570962166, 'reg_lambda': 0.36392573337165374}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72732
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 980
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72440
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 983
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:46:07,448] Trial 7 finished with value: 0.6523494013204565 and parameters: {'n_estimators': 906, 'learning_rate': 0.0008654505127802686, 'max_depth': 8, 'num_leaves': 29, 'min_child_samples': 13, 'colsample_bytree': 0.5759653429431792, 'subsample': 0.5385248790924746, 'reg_alpha': 0.00039957981564728604, 'reg_lambda': 0.19823349633046636}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72533
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 963
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72237
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 964
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:46:38,197] Trial 8 finished with value: 0.6787284900402003 and parameters: {'n_estimators': 563, 'learning_rate': 0.005526127262077241, 'max_depth': 6, 'num_leaves': 58, 'min_child_samples': 54, 'colsample_bytree': 0.5603667204605443, 'subsample': 0.7577185159006001, 'reg_alpha': 0.0009143085583527407, 'reg_lambda': 0.00020353934778786776}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72533
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 963
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72237
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 964
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:47:32,184] Trial 9 finished with value: 0.3863545334100973 and parameters: {'n_estimators': 939, 'learning_rate': 0.00016044343970801929, 'max_depth': 9, 'num_leaves': 135, 'min_child_samples': 53, 'colsample_bytree': 0.5283434454272331, 'subsample': 0.7604794494219653, 'reg_alpha': 1.9888004155606942, 'reg_lambda': 9.089840812605111}. Best is trial 0 with value: 0.7249133802133169.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 68309
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 815
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 67661
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 805
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:47:51,167] Trial 10 finished with value: 0.761405547991349 and parameters: {'n_estimators': 329, 'learning_rate': 0.09811986393677827, 'max_depth': 14, 'num_leaves': 146, 'min_child_samples': 95, 'colsample_bytree': 0.9977070859239754, 'subsample': 0.6555106327116779, 'reg_alpha': 0.048446971464759764, 'reg_lambda': 0.0019930945144749344}. Best is trial 10 with value: 0.761405547991349.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 67207
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 782
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 66399
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 767
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:48:09,212] Trial 11 finished with value: 0.7596484905152131 and parameters: {'n_estimators': 330, 'learning_rate': 0.09490470234995005, 'max_depth': 14, 'num_leaves': 147, 'min_child_samples': 99, 'colsample_bytree': 0.9838624189664118, 'subsample': 0.6542208603338151, 'reg_alpha': 0.06787870643391501, 'reg_lambda': 0.001986209502813038}. Best is trial 10 with value: 0.761405547991349.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 67916
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 803
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 67036
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 786
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:48:30,868] Trial 12 finished with value: 0.7611789269593375 and parameters: {'n_estimators': 376, 'learning_rate': 0.09695953048572485, 'max_depth': 15, 'num_leaves': 148, 'min_child_samples': 97, 'colsample_bytree': 0.9904768111195659, 'subsample': 0.6467490706159692, 'reg_alpha': 0.017610773192231043, 'reg_lambda': 0.0014526620120105646}. Best is trial 10 with value: 0.761405547991349.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72155
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 946
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 71790
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 944
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:49:12,963] Trial 13 finished with value: 0.7762337660716864 and parameters: {'n_estimators': 679, 'learning_rate': 0.07741711934422332, 'max_depth': 15, 'num_leaves': 150, 'min_child_samples': 72, 'colsample_bytree': 0.9932215408606573, 'subsample': 0.6210587842092341, 'reg_alpha': 0.012753240867635789, 'reg_lambda': 0.002637790872668392}. Best is trial 13 with value: 0.7762337660716864.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 72155
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 946
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1.042643
[LightGBM] [Info] Start training from score -1.063572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 71790
[LightGBM] [Info] Number of data points in the train set: 23695, number of used features: 944
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196475
[LightGBM] [Info] Start training from score -1

[I 2025-10-18 03:49:52,625] Trial 14 finished with value: 0.770258725522293 and parameters: {'n_estimators': 692, 'learning_rate': 0.02321304236779415, 'max_depth': 12, 'num_leaves': 92, 'min_child_samples': 72, 'colsample_bytree': 0.9345465595641185, 'subsample': 0.6786300230680921, 'reg_alpha': 0.005720436095117656, 'reg_lambda': 0.004559060333753601}. Best is trial 13 with value: 0.7762337660716864.
2025-10-18 03:49:54,009 - INFO - Logging Metadata ....
2025-10-18 03:49:55,462 - INFO - Logging Algorithm and its Best Params ....
2025-10-18 03:49:56,181 - INFO - Model Training & Predicion Started ....


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 98977
[LightGBM] [Info] Number of data points in the train set: 35543, number of used features: 964
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score -1.196489
[LightGBM] [Info] Start training from score -1.042697
[LightGBM] [Info] Start training from score -1.063504


2025-10-18 03:50:16,680 - INFO - Model Training & Predicion Ended ....
2025-10-18 03:50:16,680 - INFO - Logging Metrics ....
2025-10-18 03:50:29,192 - INFO - Logging the model LGBMClassifier ....
2025-10-18 03:51:11,259 - INFO - Completed the Experiment in 78.63404321670532 seconds
2025-10-18 03:51:11,274 - INFO - Accuracy -> 0.80


### Conclusion

The **LightGBM model**, optimized using **Optuna** and balanced with **ADASYN**, achieved strong overall performance with an **accuracy of 79.6%** and a **macro F1-score of 0.778.** These results indicate that the model effectively captures sentiment patterns across all classes while maintaining balanced precision and recall. Although minor variation exists in the neutral class, LightGBM demonstrated the best combination of accuracy, speed, and robustness among all tested models, making it the optimal choice for this sentiment analysis task.

In [45]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


12.1
True
NVIDIA GeForce GTX 1660 Ti
