In [1]:
import pandas as pd
import numpy as np
import wandb
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV

In [2]:
# Initialize Weights & Biases run for tracking
wandb.init(project="useful_comments_sorter", name="run_initial")

[34m[1mwandb[0m: Currently logged in as: [33mmichelmortier1213[0m ([33mmichelmortier1213-hes-so-gen-ve[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Load comment data with embeddings from JSON file
df = pd.read_json("data/comments_with_embeddings.json", lines=False)

In [3]:
df = pd.json_normalize(df["comments"])
df.head()

Unnamed: 0,at,score,content,reviewCreatedVersion,relevance_score,vector
0,2024-02-21T20:52:42,3,Je suis sur facebook depuis 2013 he vous adore...,451.0.0.45.109,85,"[0.021481344476342, 0.07381305843591601, -0.00..."
1,2020-05-22T09:53:04,5,J'adore cette application sauf que pour se met...,6.10.0.541,50,"[0.013015237636864001, 0.08942589908838201, -0..."
2,2020-06-18T20:17:52,5,Il s'agit là d'une très bonne application et d...,135.0.0.28.119,75,"[0.019543664529919, 0.047836035490036004, -0.0..."
3,2021-03-09T07:28:29,5,"depuis la mise à jour, l'appli s'ouvre et se r...",22.5.0.100,85,"[-0.06765174865722601, -0.114827893674373, -0...."
4,2021-01-19T20:22:29,1,Très déçue du sav amazon j'attends un rembours...,16.02.0.100,0,"[0.13606958091259, 0.09106799960136401, -0.014..."


In [4]:
X = df["vector"].tolist()
X = np.array(X, dtype=np.float32)
X = np.array(X).astype(np.float32)
y = df["relevance_score"].astype(float)

In [5]:
y_binary = (y >= 80).astype(int)

In [6]:

# Split the data into train and test sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

In [7]:
n_pos = sum(y_train == 1)
n_neg = sum(y_train == 0)

# Compute class imbalance ratio for scale_pos_weight
ratio = n_neg / n_pos

In [17]:
clf = xgb.XGBClassifier(
    tree_method="hist",          # GPU-compatible method (used instead of 'gpu_hist')
    device="cuda",               # Enable GPU acceleration
    verbosity=1,                 # Show training progress
    eval_metric="logloss",       # Metric for binary classification
    use_label_encoder=False,     # Disable label encoder (deprecated behavior)
    scale_pos_weight=ratio,       # Handle class imbalance
)

# Hyperparameter grid for tuning
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0],
}

# Grid search with cross-validation (parallelized over all CPU cores)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=2
)

In [132]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END learning_rate=0.05, max_depth=4, n_estimators=100, subsample=0.8; total time=   3.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END learning_rate=0.05, max_depth=4, n_estimators=100, subsample=0.8; total time=   3.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END learning_rate=0.05, max_depth=4, n_estimators=100, subsample=0.8; total time=   3.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


KeyboardInterrupt: 

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Define the search space for Bayesian optimization
search_space = {
    "n_estimators": Integer(100, 300),
    "max_depth": Integer(3, 10),
    "learning_rate": Real(0.01, 0.2, prior="log-uniform"),
    "subsample": Real(0.5, 1.0),
    "colsample_bytree": Real(0.5, 1.0),
    "gamma": Real(0, 5),
    "min_child_weight": Integer(1, 10),
    "reg_alpha": Real(0, 2),     # L1 regularization
    "reg_lambda": Real(0, 2),    # L2 regularization
}

# Bayesian hyperparameter search
bayes_search = BayesSearchCV(
    estimator=clf,
    search_spaces=search_space,
    n_iter=30,
    scoring="f1",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1
)

bayes_search.fit(X_train, y_train)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.705051979426657, gamma=3.6386287158866253, learning_rate=0.16356457461011642, max_depth=5, min_child_weight=7, n_estimators=183, reg_alpha=0.701862669798288, reg_lambda=1.479008468380841, subsample=0.6522316555182531; total time=   8.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.705051979426657, gamma=3.6386287158866253, learning_rate=0.16356457461011642, max_depth=5, min_child_weight=7, n_estimators=183, reg_alpha=0.701862669798288, reg_lambda=1.479008468380841, subsample=0.6522316555182531; total time=   5.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.705051979426657, gamma=3.6386287158866253, learning_rate=0.16356457461011642, max_depth=5, min_child_weight=7, n_estimators=183, reg_alpha=0.701862669798288, reg_lambda=1.479008468380841, subsample=0.6522316555182531; total time=   5.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9186941777766422, gamma=4.416576386904311, learning_rate=0.024816792756368698, max_depth=10, min_child_weight=9, n_estimators=112, reg_alpha=0.27661707655715034, reg_lambda=0.7071747952568362, subsample=0.8178645509395852; total time= 1.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9186941777766422, gamma=4.416576386904311, learning_rate=0.024816792756368698, max_depth=10, min_child_weight=9, n_estimators=112, reg_alpha=0.27661707655715034, reg_lambda=0.7071747952568362, subsample=0.8178645509395852; total time= 1.0min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9186941777766422, gamma=4.416576386904311, learning_rate=0.024816792756368698, max_depth=10, min_child_weight=9, n_estimators=112, reg_alpha=0.27661707655715034, reg_lambda=0.7071747952568362, subsample=0.8178645509395852; total time= 1.1min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7224162561505759, gamma=4.593612608346885, learning_rate=0.013690676564850015, max_depth=6, min_child_weight=3, n_estimators=191, reg_alpha=0.31089610810108664, reg_lambda=1.503105062680993, subsample=0.7786702115169006; total time=   9.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7224162561505759, gamma=4.593612608346885, learning_rate=0.013690676564850015, max_depth=6, min_child_weight=3, n_estimators=191, reg_alpha=0.31089610810108664, reg_lambda=1.503105062680993, subsample=0.7786702115169006; total time=   9.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7224162561505759, gamma=4.593612608346885, learning_rate=0.013690676564850015, max_depth=6, min_child_weight=3, n_estimators=191, reg_alpha=0.31089610810108664, reg_lambda=1.503105062680993, subsample=0.7786702115169006; total time=  10.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9061979941786817, gamma=0.8593578069828035, learning_rate=0.059989768591227254, max_depth=9, min_child_weight=6, n_estimators=119, reg_alpha=1.5116010656717631, reg_lambda=1.7452607066198842, subsample=0.9559644307534418; total time=  21.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9061979941786817, gamma=0.8593578069828035, learning_rate=0.059989768591227254, max_depth=9, min_child_weight=6, n_estimators=119, reg_alpha=1.5116010656717631, reg_lambda=1.7452607066198842, subsample=0.9559644307534418; total time=  22.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9061979941786817, gamma=0.8593578069828035, learning_rate=0.059989768591227254, max_depth=9, min_child_weight=6, n_estimators=119, reg_alpha=1.5116010656717631, reg_lambda=1.7452607066198842, subsample=0.9559644307534418; total time=  22.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8997767208035865, gamma=2.190145932204617, learning_rate=0.04843382015645565, max_depth=8, min_child_weight=9, n_estimators=243, reg_alpha=0.8483561449213641, reg_lambda=1.3005682034955253, subsample=0.6765419227639857; total time=  22.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8997767208035865, gamma=2.190145932204617, learning_rate=0.04843382015645565, max_depth=8, min_child_weight=9, n_estimators=243, reg_alpha=0.8483561449213641, reg_lambda=1.3005682034955253, subsample=0.6765419227639857; total time=  23.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8997767208035865, gamma=2.190145932204617, learning_rate=0.04843382015645565, max_depth=8, min_child_weight=9, n_estimators=243, reg_alpha=0.8483561449213641, reg_lambda=1.3005682034955253, subsample=0.6765419227639857; total time=  23.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8670140089927842, gamma=4.6968486880138585, learning_rate=0.016325175054961123, max_depth=4, min_child_weight=8, n_estimators=175, reg_alpha=0.9180490283016114, reg_lambda=1.0695303202422712, subsample=0.8926419685712479; total time=   6.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8670140089927842, gamma=4.6968486880138585, learning_rate=0.016325175054961123, max_depth=4, min_child_weight=8, n_estimators=175, reg_alpha=0.9180490283016114, reg_lambda=1.0695303202422712, subsample=0.8926419685712479; total time=   4.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8670140089927842, gamma=4.6968486880138585, learning_rate=0.016325175054961123, max_depth=4, min_child_weight=8, n_estimators=175, reg_alpha=0.9180490283016114, reg_lambda=1.0695303202422712, subsample=0.8926419685712479; total time=   5.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8085396792511581, gamma=3.876821894139413, learning_rate=0.02931565510026712, max_depth=9, min_child_weight=6, n_estimators=219, reg_alpha=1.2975550194074517, reg_lambda=0.8441651719013947, subsample=0.866941242634403; total time=  34.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8085396792511581, gamma=3.876821894139413, learning_rate=0.02931565510026712, max_depth=9, min_child_weight=6, n_estimators=219, reg_alpha=1.2975550194074517, reg_lambda=0.8441651719013947, subsample=0.866941242634403; total time=  34.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8085396792511581, gamma=3.876821894139413, learning_rate=0.02931565510026712, max_depth=9, min_child_weight=6, n_estimators=219, reg_alpha=1.2975550194074517, reg_lambda=0.8441651719013947, subsample=0.866941242634403; total time=  34.3s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7717015338451563, gamma=4.600046132186582, learning_rate=0.04425048888775232, max_depth=9, min_child_weight=4, n_estimators=110, reg_alpha=1.1415556053577964, reg_lambda=0.05160217010704217, subsample=0.6928647954923324; total time=  18.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7717015338451563, gamma=4.600046132186582, learning_rate=0.04425048888775232, max_depth=9, min_child_weight=4, n_estimators=110, reg_alpha=1.1415556053577964, reg_lambda=0.05160217010704217, subsample=0.6928647954923324; total time=  19.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7717015338451563, gamma=4.600046132186582, learning_rate=0.04425048888775232, max_depth=9, min_child_weight=4, n_estimators=110, reg_alpha=1.1415556053577964, reg_lambda=0.05160217010704217, subsample=0.6928647954923324; total time=  17.6s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9777389931549642, gamma=3.50310722310683, learning_rate=0.13615867129890064, max_depth=6, min_child_weight=4, n_estimators=142, reg_alpha=0.0001221015053469721, reg_lambda=0.7413709753254089, subsample=0.7971894567956928; total time=   8.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9777389931549642, gamma=3.50310722310683, learning_rate=0.13615867129890064, max_depth=6, min_child_weight=4, n_estimators=142, reg_alpha=0.0001221015053469721, reg_lambda=0.7413709753254089, subsample=0.7971894567956928; total time=   7.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9777389931549642, gamma=3.50310722310683, learning_rate=0.13615867129890064, max_depth=6, min_child_weight=4, n_estimators=142, reg_alpha=0.0001221015053469721, reg_lambda=0.7413709753254089, subsample=0.7971894567956928; total time=   6.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5018151536273716, gamma=4.089150098318758, learning_rate=0.09213636741693375, max_depth=5, min_child_weight=6, n_estimators=155, reg_alpha=0.6256786124692563, reg_lambda=0.30518917367688864, subsample=0.9469190305262598; total time=   4.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5018151536273716, gamma=4.089150098318758, learning_rate=0.09213636741693375, max_depth=5, min_child_weight=6, n_estimators=155, reg_alpha=0.6256786124692563, reg_lambda=0.30518917367688864, subsample=0.9469190305262598; total time=   4.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5018151536273716, gamma=4.089150098318758, learning_rate=0.09213636741693375, max_depth=5, min_child_weight=6, n_estimators=155, reg_alpha=0.6256786124692563, reg_lambda=0.30518917367688864, subsample=0.9469190305262598; total time=   6.6s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.622383276443231, gamma=1.6502846559696036, learning_rate=0.2, max_depth=10, min_child_weight=4, n_estimators=300, reg_alpha=0.536806873305035, reg_lambda=0.904548893838038, subsample=0.9772379130495001; total time= 2.0min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.622383276443231, gamma=1.6502846559696036, learning_rate=0.2, max_depth=10, min_child_weight=4, n_estimators=300, reg_alpha=0.536806873305035, reg_lambda=0.904548893838038, subsample=0.9772379130495001; total time=  58.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.622383276443231, gamma=1.6502846559696036, learning_rate=0.2, max_depth=10, min_child_weight=4, n_estimators=300, reg_alpha=0.536806873305035, reg_lambda=0.904548893838038, subsample=0.9772379130495001; total time= 1.1min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=1.594575848327178, learning_rate=0.05786312817965926, max_depth=10, min_child_weight=2, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=0.5; total time= 1.4min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=1.594575848327178, learning_rate=0.05786312817965926, max_depth=10, min_child_weight=2, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=0.5; total time= 1.1min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=1.594575848327178, learning_rate=0.05786312817965926, max_depth=10, min_child_weight=2, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=0.5; total time= 1.1min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8643135010173961, gamma=0.5457976444818942, learning_rate=0.011086738648924616, max_depth=10, min_child_weight=2, n_estimators=295, reg_alpha=1.207535107247104, reg_lambda=1.4358228832996576, subsample=0.8987566530459745; total time= 2.1min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8643135010173961, gamma=0.5457976444818942, learning_rate=0.011086738648924616, max_depth=10, min_child_weight=2, n_estimators=295, reg_alpha=1.207535107247104, reg_lambda=1.4358228832996576, subsample=0.8987566530459745; total time= 3.4min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8643135010173961, gamma=0.5457976444818942, learning_rate=0.011086738648924616, max_depth=10, min_child_weight=2, n_estimators=295, reg_alpha=1.207535107247104, reg_lambda=1.4358228832996576, subsample=0.8987566530459745; total time= 3.1min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5995008610124359, gamma=2.761448038322376, learning_rate=0.19893422120677123, max_depth=3, min_child_weight=6, n_estimators=107, reg_alpha=1.5168763417266078, reg_lambda=1.9027792869972682, subsample=0.5970612446418292; total time=   2.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5995008610124359, gamma=2.761448038322376, learning_rate=0.19893422120677123, max_depth=3, min_child_weight=6, n_estimators=107, reg_alpha=1.5168763417266078, reg_lambda=1.9027792869972682, subsample=0.5970612446418292; total time=   2.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5995008610124359, gamma=2.761448038322376, learning_rate=0.19893422120677123, max_depth=3, min_child_weight=6, n_estimators=107, reg_alpha=1.5168763417266078, reg_lambda=1.9027792869972682, subsample=0.5970612446418292; total time=   2.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7666479412463671, gamma=0.4959070959124613, learning_rate=0.16346730389263162, max_depth=3, min_child_weight=3, n_estimators=298, reg_alpha=1.789944739488713, reg_lambda=0.4698703782640893, subsample=0.600065921154478; total time=   5.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7666479412463671, gamma=0.4959070959124613, learning_rate=0.16346730389263162, max_depth=3, min_child_weight=3, n_estimators=298, reg_alpha=1.789944739488713, reg_lambda=0.4698703782640893, subsample=0.600065921154478; total time=   5.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7666479412463671, gamma=0.4959070959124613, learning_rate=0.16346730389263162, max_depth=3, min_child_weight=3, n_estimators=298, reg_alpha=1.789944739488713, reg_lambda=0.4698703782640893, subsample=0.600065921154478; total time=   5.1s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6669077926103604, gamma=4.855652804768744, learning_rate=0.11907412870240315, max_depth=7, min_child_weight=4, n_estimators=293, reg_alpha=0.739778787683102, reg_lambda=1.2325684457629336, subsample=0.9666138306192789; total time=  11.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6669077926103604, gamma=4.855652804768744, learning_rate=0.11907412870240315, max_depth=7, min_child_weight=4, n_estimators=293, reg_alpha=0.739778787683102, reg_lambda=1.2325684457629336, subsample=0.9666138306192789; total time=  10.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6669077926103604, gamma=4.855652804768744, learning_rate=0.11907412870240315, max_depth=7, min_child_weight=4, n_estimators=293, reg_alpha=0.739778787683102, reg_lambda=1.2325684457629336, subsample=0.9666138306192789; total time=  13.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7060233204976595, gamma=4.950967166067057, learning_rate=0.19802283878142263, max_depth=10, min_child_weight=3, n_estimators=102, reg_alpha=0.07647819168572705, reg_lambda=1.0683603627265907, subsample=0.9152925911351995; total time= 1.9min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7060233204976595, gamma=4.950967166067057, learning_rate=0.19802283878142263, max_depth=10, min_child_weight=3, n_estimators=102, reg_alpha=0.07647819168572705, reg_lambda=1.0683603627265907, subsample=0.9152925911351995; total time= 1.7min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7060233204976595, gamma=4.950967166067057, learning_rate=0.19802283878142263, max_depth=10, min_child_weight=3, n_estimators=102, reg_alpha=0.07647819168572705, reg_lambda=1.0683603627265907, subsample=0.9152925911351995; total time=  25.1s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5991465512780847, gamma=1.1834850485139006, learning_rate=0.05295426293551907, max_depth=5, min_child_weight=10, n_estimators=300, reg_alpha=1.4339616136848514, reg_lambda=0.392609403794172, subsample=0.9908964636121995; total time=   8.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5991465512780847, gamma=1.1834850485139006, learning_rate=0.05295426293551907, max_depth=5, min_child_weight=10, n_estimators=300, reg_alpha=1.4339616136848514, reg_lambda=0.392609403794172, subsample=0.9908964636121995; total time=   8.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5991465512780847, gamma=1.1834850485139006, learning_rate=0.05295426293551907, max_depth=5, min_child_weight=10, n_estimators=300, reg_alpha=1.4339616136848514, reg_lambda=0.392609403794172, subsample=0.9908964636121995; total time=   9.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.052524493386849294, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0; total time=  30.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.052524493386849294, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0; total time=  30.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.052524493386849294, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0; total time=  29.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5819120888377877, gamma=4.424966790433941, learning_rate=0.03635065275717406, max_depth=3, min_child_weight=9, n_estimators=294, reg_alpha=1.4397449843866927, reg_lambda=1.2172386820108185, subsample=0.5249423696411073; total time=   5.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5819120888377877, gamma=4.424966790433941, learning_rate=0.03635065275717406, max_depth=3, min_child_weight=9, n_estimators=294, reg_alpha=1.4397449843866927, reg_lambda=1.2172386820108185, subsample=0.5249423696411073; total time=   5.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5819120888377877, gamma=4.424966790433941, learning_rate=0.03635065275717406, max_depth=3, min_child_weight=9, n_estimators=294, reg_alpha=1.4397449843866927, reg_lambda=1.2172386820108185, subsample=0.5249423696411073; total time=   5.2s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6208005651026717, gamma=0.46698199695805526, learning_rate=0.010249331750121257, max_depth=3, min_child_weight=9, n_estimators=293, reg_alpha=0.853090392478929, reg_lambda=1.8995872530859046, subsample=0.7100222646364696; total time=   5.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6208005651026717, gamma=0.46698199695805526, learning_rate=0.010249331750121257, max_depth=3, min_child_weight=9, n_estimators=293, reg_alpha=0.853090392478929, reg_lambda=1.8995872530859046, subsample=0.7100222646364696; total time=   9.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6208005651026717, gamma=0.46698199695805526, learning_rate=0.010249331750121257, max_depth=3, min_child_weight=9, n_estimators=293, reg_alpha=0.853090392478929, reg_lambda=1.8995872530859046, subsample=0.7100222646364696; total time=   5.5s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9794420598368971, gamma=1.6973830432585615, learning_rate=0.010380074614328166, max_depth=10, min_child_weight=4, n_estimators=110, reg_alpha=0.27974631609460215, reg_lambda=0.230183401643743, subsample=0.5678180390786722; total time= 1.5min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9794420598368971, gamma=1.6973830432585615, learning_rate=0.010380074614328166, max_depth=10, min_child_weight=4, n_estimators=110, reg_alpha=0.27974631609460215, reg_lambda=0.230183401643743, subsample=0.5678180390786722; total time= 1.0min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9794420598368971, gamma=1.6973830432585615, learning_rate=0.010380074614328166, max_depth=10, min_child_weight=4, n_estimators=110, reg_alpha=0.27974631609460215, reg_lambda=0.230183401643743, subsample=0.5678180390786722; total time= 1.1min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9682608285738958, gamma=0.9844632708525323, learning_rate=0.19525717758041605, max_depth=6, min_child_weight=5, n_estimators=300, reg_alpha=0.8936359806821186, reg_lambda=1.9586586877536047, subsample=0.7372290568197504; total time=  14.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9682608285738958, gamma=0.9844632708525323, learning_rate=0.19525717758041605, max_depth=6, min_child_weight=5, n_estimators=300, reg_alpha=0.8936359806821186, reg_lambda=1.9586586877536047, subsample=0.7372290568197504; total time=  13.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9682608285738958, gamma=0.9844632708525323, learning_rate=0.19525717758041605, max_depth=6, min_child_weight=5, n_estimators=300, reg_alpha=0.8936359806821186, reg_lambda=1.9586586877536047, subsample=0.7372290568197504; total time=  13.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8049058673050258, gamma=0.17605297742226303, learning_rate=0.1533843138597975, max_depth=8, min_child_weight=9, n_estimators=206, reg_alpha=1.5413327297963468, reg_lambda=0.2010171325045978, subsample=0.6130980319293741; total time=  19.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8049058673050258, gamma=0.17605297742226303, learning_rate=0.1533843138597975, max_depth=8, min_child_weight=9, n_estimators=206, reg_alpha=1.5413327297963468, reg_lambda=0.2010171325045978, subsample=0.6130980319293741; total time=  17.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8049058673050258, gamma=0.17605297742226303, learning_rate=0.1533843138597975, max_depth=8, min_child_weight=9, n_estimators=206, reg_alpha=1.5413327297963468, reg_lambda=0.2010171325045978, subsample=0.6130980319293741; total time=  20.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8892191244338314, gamma=4.836467201829181, learning_rate=0.018506685856437003, max_depth=8, min_child_weight=7, n_estimators=299, reg_alpha=1.585250512600822, reg_lambda=0.29403476076224716, subsample=0.5354894829340601; total time=  34.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8892191244338314, gamma=4.836467201829181, learning_rate=0.018506685856437003, max_depth=8, min_child_weight=7, n_estimators=299, reg_alpha=1.585250512600822, reg_lambda=0.29403476076224716, subsample=0.5354894829340601; total time=  31.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8892191244338314, gamma=4.836467201829181, learning_rate=0.018506685856437003, max_depth=8, min_child_weight=7, n_estimators=299, reg_alpha=1.585250512600822, reg_lambda=0.29403476076224716, subsample=0.5354894829340601; total time=  33.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, gamma=0.0, learning_rate=0.05495627199474921, max_depth=9, min_child_weight=10, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  40.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, gamma=0.0, learning_rate=0.05495627199474921, max_depth=9, min_child_weight=10, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  39.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, gamma=0.0, learning_rate=0.05495627199474921, max_depth=9, min_child_weight=10, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  40.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8458431488190256, gamma=0.008170219561988159, learning_rate=0.06590024654274663, max_depth=3, min_child_weight=6, n_estimators=116, reg_alpha=0.06544135617423953, reg_lambda=1.6131061281772625, subsample=0.9281454408890097; total time=   4.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8458431488190256, gamma=0.008170219561988159, learning_rate=0.06590024654274663, max_depth=3, min_child_weight=6, n_estimators=116, reg_alpha=0.06544135617423953, reg_lambda=1.6131061281772625, subsample=0.9281454408890097; total time=   3.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8458431488190256, gamma=0.008170219561988159, learning_rate=0.06590024654274663, max_depth=3, min_child_weight=6, n_estimators=116, reg_alpha=0.06544135617423953, reg_lambda=1.6131061281772625, subsample=0.9281454408890097; total time=   3.2s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5657010818865271, gamma=0.2032829764792016, learning_rate=0.040247974328028936, max_depth=7, min_child_weight=8, n_estimators=105, reg_alpha=0.8479029296245675, reg_lambda=1.9148696443875421, subsample=0.9770458780747879; total time=   8.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5657010818865271, gamma=0.2032829764792016, learning_rate=0.040247974328028936, max_depth=7, min_child_weight=8, n_estimators=105, reg_alpha=0.8479029296245675, reg_lambda=1.9148696443875421, subsample=0.9770458780747879; total time=   8.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5657010818865271, gamma=0.2032829764792016, learning_rate=0.040247974328028936, max_depth=7, min_child_weight=8, n_estimators=105, reg_alpha=0.8479029296245675, reg_lambda=1.9148696443875421, subsample=0.9770458780747879; total time=  10.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7468283533956264, gamma=4.895688493883358, learning_rate=0.02526206172212955, max_depth=10, min_child_weight=1, n_estimators=299, reg_alpha=1.6774505711636998, reg_lambda=1.539819962375651, subsample=0.8354735811768219; total time= 2.6min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7468283533956264, gamma=4.895688493883358, learning_rate=0.02526206172212955, max_depth=10, min_child_weight=1, n_estimators=299, reg_alpha=1.6774505711636998, reg_lambda=1.539819962375651, subsample=0.8354735811768219; total time= 2.5min


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7468283533956264, gamma=4.895688493883358, learning_rate=0.02526206172212955, max_depth=10, min_child_weight=1, n_estimators=299, reg_alpha=1.6774505711636998, reg_lambda=1.539819962375651, subsample=0.8354735811768219; total time= 2.6min
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=5.0, learning_rate=0.05123396574074597, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  20.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=5.0, learning_rate=0.05123396574074597, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  19.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.5, gamma=5.0, learning_rate=0.05123396574074597, max_depth=8, min_child_weight=1, n_estimators=300, reg_alpha=2.0, reg_lambda=2.0, subsample=0.5; total time=  22.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
model_params = {
    'learning_rate': 0.1,
    'max_depth': 8,
    'n_estimators': 200,
    'eval_metric': 'logloss',
    'use_label_encoder': False
}

In [19]:
run = wandb.init(
    project="useful_comments_sorter",
    name="xgb_1_cross_export",
    config=model_params,       # Logged under the “Config” tab in W&B
    reinit='finish_previous'
          # Restart the run if a previous one was active
)

In [9]:
from wandb.integration.xgboost import WandbCallback
import xgboost as xgb

# # W&B callback for logging model and feature importance
# wb_cb = WandbCallback(
#     log_model=True,
#     log_feature_importance=True
# )

In [10]:
model = xgb.XGBClassifier(
    **model_params,
    tree_method="gpu_hist",       # Use GPU-accelerated histogram-based algorithm
    device="cuda",                # Run on GPU
    # callbacks=[wb_cb],            # Log metrics and feature importance to W&B
    scale_pos_weight=ratio        # Handle class imbalance
)

In [11]:
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-logloss:0.64734	validation_1-logloss:0.65061
[1]	validation_0-logloss:0.60830	validation_1-logloss:0.61433
[2]	validation_0-logloss:0.57476	validation_1-logloss:0.58365
[3]	validation_0-logloss:0.54578	validation_1-logloss:0.55714
[4]	validation_0-logloss:0.52046	validation_1-logloss:0.53493
[5]	validation_0-logloss:0.49810	validation_1-logloss:0.51512
[6]	validation_0-logloss:0.47820	validation_1-logloss:0.49776
[7]	validation_0-logloss:0.46057	validation_1-logloss:0.48282
[8]	validation_0-logloss:0.44494	validation_1-logloss:0.46949
[9]	validation_0-logloss:0.43090	validation_1-logloss:0.45762
[10]	validation_0-logloss:0.41839	validation_1-logloss:0.44741
[11]	validation_0-logloss:0.40627	validation_1-logloss:0.43760
[12]	validation_0-logloss:0.39568	validation_1-logloss:0.42913
[13]	validation_0-logloss:0.38553	validation_1-logloss:0.42145
[14]	validation_0-logloss:0.37660	validation_1-logloss:0.41452
[15]	validation_0-logloss:0.36834	validation_1-logloss:0.40851
[1

In [12]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score

y_pred = model.predict(X_test)
recall = recall_score(y_test, y_pred)
y_proba = model.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class


    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [None]:
# wandb.log({
#     "confusion_matrix": wandb.plot.confusion_matrix(
#         preds=y_pred.tolist(),
#         y_true=y_test.tolist(),
#         class_names=["negative", "positive"]  # Adjust if class labels differ
#     ),
#     "f1": f1_score(y_test, y_pred),
#     "acc": accuracy_score(y_test, y_pred),
#     "auc": roc_auc_score(y_test, y_proba),
#     "recall": recall
# })

# run.finish()


0,1
acc,▁
auc,▁
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇████
f1,▁
recall,▁
validation_0-logloss,█▅▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_1-logloss,█▅▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
acc,0.85836
auc,0.93572
epoch,199.0
f1,0.8713
recall,0.88328


In [None]:
import joblib

joblib.dump(model, "models/useful-comments-sorter_model_xgb1.joblib")

['models/useful-comments-sorter_model_.joblib']