In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
dataset = pd.read_csv('/content/reddit_preprocessing (1).csv')

# Drop rows that are missing the comment text or its label. These are the only
# two columns used downstream, so a NaN in any other column should not cost us
# a training example.
cleaned_dataset = dataset.dropna(subset=['clean_comment', 'category'])

In [None]:
# The label to predict is `category`; the model input is the comment text
y_cleaned = cleaned_dataset['category']
X_cleaned = cleaned_dataset['clean_comment']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
(
    X_train_cleaned,
    X_test_cleaned,
    y_train_cleaned,
    y_test_cleaned,
) = train_test_split(
    X_cleaned,
    y_cleaned,
    random_state=42,
    test_size=0.2,
)

In [None]:
# TF-IDF over unigrams, bigrams and trigrams, with the vocabulary capped at
# 10,000 terms (max_features=10000)
tfidf_cleaned = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
)

In [None]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split, so the test text never influences
# what the vectorizer learns. Order matters: `transform` needs the fit above it.
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of a LightGBM model.

    Samples `learning_rate`, `n_estimators` and `max_depth` from `trial`, builds
    an `LGBMClassifier` with them and scores it with 3-fold cross-validation on
    the TF-IDF training matrix.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        The mean accuracy across the cross-validation folds.

    Note:
        Reads `X_train_tfidf_cleaned` and `y_train_cleaned` from the notebook's
        global scope, so the vectorization cells must have run first.
    """
    param = {
        "objective": "multiclass",
        "num_class": 3,  # Assuming 3 categories (-1, 0, 1)
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        # Class imbalance is handled by `class_weight` alone. `is_unbalance` was
        # removed: LightGBM only uses it for binary / multiclassova objectives.
        "class_weight": "balanced",
        # Silence the per-fit [Info] messages that otherwise flood the cell
        # output on every fold of every trial.
        "verbosity": -1,
    }

    # Build the LightGBM model from this trial's parameters
    model = lgb.LGBMClassifier(**param)

    # Score the candidate with 3-fold cross-validation on the training split
    scores = cross_val_score(
        model,
        X_train_tfidf_cleaned,
        y_train_cleaned,
        cv=3,
        scoring='accuracy',
    )

    # Optuna maximizes the average score across folds
    return scores.mean()

In [None]:
# Create an Optuna study that maximizes the value returned by `objective`.
# The sampler is TPE (Optuna's default for a single objective) with a fixed
# seed, so that re-running this cell can reproduce the same search.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-10-22 11:33:00,861] A new study created in memory with name: no-name-32a7c64e-8479-42f6-91f6-369d614912e4


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.206283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:34:03,032] Trial 0 finished with value: 0.7852637304804952 and parameters: {'learning_rate': 0.039875105863358025, 'n_estimators': 436, 'max_depth': 5}. Best is trial 0 with value: 0.7852637304804952.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.356522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:34:13,101] Trial 1 finished with value: 0.6678716020559068 and parameters: {'learning_rate': 0.053397357053433396, 'n_estimators': 51, 'max_depth': 5}. Best is trial 0 with value: 0.7852637304804952.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.624242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359525 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:35:18,084] Trial 2 finished with value: 0.8227692525590165 and parameters: {'learning_rate': 0.05623450569420743, 'n_estimators': 202, 'max_depth': 17}. Best is trial 2 with value: 0.8227692525590165.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.372661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:35:57,605] Trial 3 finished with value: 0.6773502181015255 and parameters: {'learning_rate': 0.01282116008942211, 'n_estimators': 179, 'max_depth': 7}. Best is trial 2 with value: 0.8227692525590165.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.346193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.364784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:36:17,899] Trial 4 finished with value: 0.67697515339479 and parameters: {'learning_rate': 0.013889665657657601, 'n_estimators': 56, 'max_depth': 14}. Best is trial 2 with value: 0.8227692525590165.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.373354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.385052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:36:35,452] Trial 5 finished with value: 0.750860902294657 and parameters: {'learning_rate': 0.08244473027088565, 'n_estimators': 115, 'max_depth': 5}. Best is trial 2 with value: 0.8227692525590165.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.990482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:38:02,856] Trial 6 finished with value: 0.8399878041370014 and parameters: {'learning_rate': 0.08669227367165437, 'n_estimators': 392, 'max_depth': 10}. Best is trial 6 with value: 0.8399878041370014.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:38:22,314] Trial 7 finished with value: 0.7234136854696613 and parameters: {'learning_rate': 0.04649406001481758, 'n_estimators': 58, 'max_depth': 12}. Best is trial 6 with value: 0.8399878041370014.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.388327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:39:12,425] Trial 8 finished with value: 0.7304714207570782 and parameters: {'learning_rate': 0.018700150551460813, 'n_estimators': 482, 'max_depth': 4}. Best is trial 6 with value: 0.8399878041370014.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.396308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.632865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 11:39:56,427] Trial 9 finished with value: 0.81509763330564 and parameters: {'learning_rate': 0.08575421778565691, 'n_estimators': 227, 'max_depth': 8}. Best is trial 6 with value: 0.8399878041370014.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.453760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.660413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:41:56,479] Trial 10 finished with value: 0.8449316686550893 and parameters: {'learning_rate': 0.09868569306695879, 'n_estimators': 362, 'max_depth': 19}. Best is trial 10 with value: 0.8449316686550893.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.349229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.363866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:43:54,949] Trial 11 finished with value: 0.8449997966100673 and parameters: {'learning_rate': 0.09971814025345323, 'n_estimators': 354, 'max_depth': 20}. Best is trial 11 with value: 0.8449997966100673.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:45:50,645] Trial 12 finished with value: 0.8449998279974027 and parameters: {'learning_rate': 0.0980761820126992, 'n_estimators': 345, 'max_depth': 20}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.359209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 11:47:40,469] Trial 13 finished with value: 0.8435678540621154 and parameters: {'learning_rate': 0.06876779182491402, 'n_estimators': 312, 'max_depth': 20}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.362799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:49:09,133] Trial 14 finished with value: 0.843636034329319 and parameters: {'learning_rate': 0.0963825594349074, 'n_estimators': 300, 'max_depth': 16}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.371765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:51:00,423] Trial 15 finished with value: 0.8425790902259503 and parameters: {'learning_rate': 0.07162441934763862, 'n_estimators': 362, 'max_depth': 17}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.428394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.384513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 11:52:28,342] Trial 16 finished with value: 0.8419994184554519 and parameters: {'learning_rate': 0.0745551833658321, 'n_estimators': 254, 'max_depth': 20}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.355013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.349901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:54:31,429] Trial 17 finished with value: 0.8272017546106323 and parameters: {'learning_rate': 0.03351512216102506, 'n_estimators': 425, 'max_depth': 15}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.349041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:56:01,029] Trial 18 finished with value: 0.8354870974058102 and parameters: {'learning_rate': 0.0628133016234898, 'n_estimators': 335, 'max_depth': 13}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 11:58:36,037] Trial 19 finished with value: 0.844113355486933 and parameters: {'learning_rate': 0.09302083445461892, 'n_estimators': 475, 'max_depth': 19}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:00:05,400] Trial 20 finished with value: 0.8420676580098444 and parameters: {'learning_rate': 0.07939977404845197, 'n_estimators': 273, 'max_depth': 18}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.364386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:02:14,086] Trial 21 finished with value: 0.8448975296967437 and parameters: {'learning_rate': 0.09833197737233645, 'n_estimators': 375, 'max_depth': 20}. Best is trial 12 with value: 0.8449998279974027.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.376848 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.372751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:04:03,912] Trial 22 finished with value: 0.8459545749370815 and parameters: {'learning_rate': 0.09977358671932686, 'n_estimators': 339, 'max_depth': 18}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.623146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:06:18,536] Trial 23 finished with value: 0.8446588952740495 and parameters: {'learning_rate': 0.0879164485127597, 'n_estimators': 419, 'max_depth': 18}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.648331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:08:01,593] Trial 24 finished with value: 0.8437042494713397 and parameters: {'learning_rate': 0.09288691483021062, 'n_estimators': 336, 'max_depth': 16}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:09:39,025] Trial 25 finished with value: 0.8442498206457912 and parameters: {'learning_rate': 0.08004445892114719, 'n_estimators': 300, 'max_depth': 18}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.356785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.355684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:11:32,331] Trial 26 finished with value: 0.8443520143219992 and parameters: {'learning_rate': 0.09129181679212865, 'n_estimators': 392, 'max_depth': 15}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.347717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:13:00,755] Trial 27 finished with value: 0.8442838549796859 and parameters: {'learning_rate': 0.09998569040210374, 'n_estimators': 252, 'max_depth': 20}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.362742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.368131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:14:30,086] Trial 28 finished with value: 0.6656212975150549 and parameters: {'learning_rate': 0.0036436847776480583, 'n_estimators': 339, 'max_depth': 10}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.627152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.365291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:16:54,530] Trial 29 finished with value: 0.8331003416560271 and parameters: {'learning_rate': 0.03232553522432352, 'n_estimators': 451, 'max_depth': 17}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.362448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.653792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:19:11,355] Trial 30 finished with value: 0.8453408304703899 and parameters: {'learning_rate': 0.06640344899137701, 'n_estimators': 403, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:21:27,326] Trial 31 finished with value: 0.8450339495183398 and parameters: {'learning_rate': 0.0906706108190915, 'n_estimators': 403, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.371609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.351957 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:23:37,725] Trial 32 finished with value: 0.8444202294640197 and parameters: {'learning_rate': 0.06435306149201302, 'n_estimators': 402, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.354743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:25:56,402] Trial 33 finished with value: 0.8442838096424237 and parameters: {'learning_rate': 0.07713067325238049, 'n_estimators': 452, 'max_depth': 18}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.479111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353963 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:28:22,156] Trial 34 finished with value: 0.8428859432781469 and parameters: {'learning_rate': 0.06022995729281782, 'n_estimators': 498, 'max_depth': 16}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.657090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:30:37,055] Trial 35 finished with value: 0.8404651497069874 and parameters: {'learning_rate': 0.04356470682743795, 'n_estimators': 416, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.347460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:32:54,319] Trial 36 finished with value: 0.8415562118438115 and parameters: {'learning_rate': 0.050750953002049785, 'n_estimators': 446, 'max_depth': 17}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.371515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.356575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:34:23,709] Trial 37 finished with value: 0.8428518043198013 and parameters: {'learning_rate': 0.08396331164235982, 'n_estimators': 314, 'max_depth': 15}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.461090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.424322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:35:11,764] Trial 38 finished with value: 0.8276109199134144 and parameters: {'learning_rate': 0.0908065574216212, 'n_estimators': 167, 'max_depth': 14}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.368782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.375692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:37:13,094] Trial 39 finished with value: 0.8421699214356863 and parameters: {'learning_rate': 0.05489338850555802, 'n_estimators': 379, 'max_depth': 18}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.348506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:38:20,044] Trial 40 finished with value: 0.8277132147265918 and parameters: {'learning_rate': 0.0694251945539678, 'n_estimators': 282, 'max_depth': 11}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.345634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.373042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:40:16,796] Trial 41 finished with value: 0.8449316756300527 and parameters: {'learning_rate': 0.09525714893081973, 'n_estimators': 348, 'max_depth': 20}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.365136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:42:29,486] Trial 42 finished with value: 0.8450339181310045 and parameters: {'learning_rate': 0.08955000501456749, 'n_estimators': 399, 'max_depth': 20}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.346411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:44:39,994] Trial 43 finished with value: 0.845511277650917 and parameters: {'learning_rate': 0.08928022185852715, 'n_estimators': 404, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.354738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:46:48,688] Trial 44 finished with value: 0.8454771456675348 and parameters: {'learning_rate': 0.08765604089948664, 'n_estimators': 400, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.626919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:47:51,028] Trial 45 finished with value: 0.8300999042142226 and parameters: {'learning_rate': 0.08338466712071566, 'n_estimators': 380, 'max_depth': 7}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.365239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.350738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:50:20,054] Trial 46 finished with value: 0.8449316651676075 and parameters: {'learning_rate': 0.08646421907539809, 'n_estimators': 468, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.573343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.627397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-10-22 12:52:32,698] Trial 47 finished with value: 0.8441133589744148 and parameters: {'learning_rate': 0.07466553147921594, 'n_estimators': 433, 'max_depth': 17}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.350448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:53:05,125] Trial 48 finished with value: 0.7891166095863561 and parameters: {'learning_rate': 0.07803835425652624, 'n_estimators': 405, 'max_depth': 3}. Best is trial 22 with value: 0.8459545749370815.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.622534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83962
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.635332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83736
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-10-22 12:55:03,439] Trial 49 finished with value: 0.8440111129859812 and parameters: {'learning_rate': 0.08353109113081106, 'n_estimators': 366, 'max_depth': 19}. Best is trial 22 with value: 0.8459545749370815.


In [None]:
# Extract the hyperparameters of the best-scoring trial from the Optuna `study`
# and display them as the cell output.
best_params = study.best_params
best_params

{'learning_rate': 0.08081298097796712, 'n_estimators': 367, 'max_depth': 20}

In [None]:
# Final classifier, configured with the (rounded) hyperparameters shown in the
# `best_params` output above plus fixed regularization settings.
best_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    # `is_unbalance` is deliberately not passed: LightGBM only uses it for
    # binary / multiclassova objectives, so with objective='multiclass' it has
    # no effect. Class imbalance is handled by `class_weight` instead.
    class_weight="balanced",
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    learning_rate=0.08,
    max_depth=20,
    n_estimators=367,
)

In [None]:
# Fit the final model on the TF-IDF features of the training split.
# NOTE(review): no resampling is applied to these inputs in this cell; the
# earlier "resampled" wording looks stale — confirm whether resampling was intended.
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.267752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 131883
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 4437
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [None]:
# Predict labels for the training set (scored in the following cells so the
# training fit can be compared against the test-set results).
y_train_pred = best_model.predict(X_train_tfidf_cleaned)

NameError: name 'best_model' is not defined

In [None]:
# Calculate accuracy on the training set (not the test set) and display it.
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.9276143066589383

In [None]:
# Generate the per-class precision / recall / F1 report for the training set.
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.91      0.90      0.91      6601
           0       0.88      0.98      0.93     10134
           1       0.98      0.90      0.94     12594

    accuracy                           0.93     29329
   macro avg       0.93      0.93      0.92     29329
weighted avg       0.93      0.93      0.93     29329



In [None]:
# Predict labels for the held-out test set, using features produced by the
# TF-IDF vectorizer that was fitted on the training split only.
y_pred = best_model.predict(X_test_tfidf_cleaned)

In [None]:
# Calculate accuracy on the held-out test set and display it.
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.8658120823673804

In [None]:
# Generate the per-class precision / recall / F1 report for the held-out test set.
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.81      0.78      0.79      1647
           0       0.84      0.97      0.90      2510
           1       0.92      0.83      0.87      3176

    accuracy                           0.87      7333
   macro avg       0.86      0.86      0.86      7333
weighted avg       0.87      0.87      0.86      7333



In [None]:
import re
import numpy as np

# Assumes a fitted TF-IDF vectorizer and a trained LightGBM model are already in memory.
# tfidf_vectorizer: the fitted TF-IDF vectorizer (`tfidf_cleaned` in this notebook)
# lgbm_model: the trained LightGBM model (`best_model` in this notebook)

def preprocess_comment(comment):
    """Normalize a raw comment into lowercase, single-space-separated word tokens.

    The text is lowercased, URL-like tokens are removed, every non-word
    character is turned into a space, and whitespace runs are collapsed.

    NOTE(review): this is meant to mirror the cleaning applied to the training
    data; that pipeline is not visible here, so confirm the two stay in sync.

    Args:
        comment: Raw comment text (str).

    Returns:
        The cleaned comment string, without leading/trailing whitespace.
    """
    # Ordered passes: (pattern, replacement, regex flags).
    substitutions = (
        (r"http\S+|www\S+|https\S+", '', re.MULTILINE),  # drop URLs
        (r'\W', ' ', 0),                                   # non-word chars -> space
        (r'\s+', ' ', 0),                                  # collapse whitespace/newlines
    )

    text = comment.lower()
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    """Predict the sentiment label of a single raw comment.

    Args:
        comment: Raw comment text; it is cleaned with ``preprocess_comment``
            before vectorization.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        lgbm_model: Fitted classifier exposing ``predict`` and
            ``predict_proba``.

    Returns:
        dict with two keys:
            'sentiment_class': the label returned by ``lgbm_model.predict``,
                cast to ``int``.
            'confidence': the highest class probability reported by
                ``lgbm_model.predict_proba`` for this comment, as a ``float``.
    """
    # Step 1: clean the raw text.
    cleaned_comment = preprocess_comment(comment)

    # Step 2: vectorize; transform expects an iterable of documents, hence the list.
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: predicted label and per-class probabilities for the single row.
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)

    # The label is taken from predict(): the argmax of predict_proba is only a
    # column index, not the class label itself, so it is not used here.
    return {
        'sentiment_class': int(prediction[0]),
        'confidence': float(np.max(prediction_proba)),
    }

# Example usage: hand-written sample comments for a manual smoke test.
# Only one of them is scored below; change the first argument of
# predict_sentiment to try another.
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
# The next three appear to be code-mixed (romanized Hindi + English) samples.
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."
# Score comment10 with the notebook's fitted vectorizer and trained model.
result = predict_sentiment(comment10, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

Predicted Sentiment: 0, Confidence: 0.7993257111935146
