In [None]:
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from cuml.linear_model import LogisticRegression as cuLogisticRegression

from tqdm import tqdm

In [10]:
# Conduct proper data scaling based on the model requirements
data = pd.read_csv('/content/drive/MyDrive/yrbss/balanced_dataset.csv')
X = data.drop('suicide_attempt', axis=1)
y = data['suicide_attempt']

In [27]:
regression_data = pd.read_csv('/content/drive/MyDrive/yrbss/regression_df.csv')
X_reg = regression_data.drop('suicide_attempt', axis=1)
y_reg = regression_data['suicide_attempt']
scaler = StandardScaler()
X_st = pd.DataFrame(scaler.fit_transform(X_reg), columns=X_reg.columns)
X_st.head()

Unnamed: 0,BMIPCT,q14_skipped_school_bc_insecure_0.0,q14_skipped_school_bc_insecure_1.0,q14_skipped_school_bc_insecure_2.5,q14_skipped_school_bc_insecure_4.5,q14_skipped_school_bc_insecure_6.0,q15_been_threatened_at_school_0.0,q15_been_threatened_at_school_1.0,q15_been_threatened_at_school_2.5,q15_been_threatened_at_school_4.5,...,q66_weight_perception_2.0,q66_weight_perception_3.0,q66_weight_perception_4.0,q86_unstable_housing_0.0,q86_unstable_housing_1.0,q86_unstable_housing_2.0,q86_unstable_housing_3.0,q86_unstable_housing_4.0,q86_unstable_housing_5.0,q86_unstable_housing_6.0
0,0.449118,0.486156,-0.28612,-0.2573,-0.133692,-0.192811,0.418413,-0.247563,-0.21497,-0.126497,...,-0.842744,1.469141,-0.305306,0.29455,-0.183001,-0.106542,-0.094603,-0.070084,-0.082117,-0.126497
1,-0.586702,0.486156,-0.28612,-0.2573,-0.133692,-0.192811,0.418413,-0.247563,-0.21497,-0.126497,...,1.186601,-0.68067,-0.305306,0.29455,-0.183001,-0.106542,-0.094603,-0.070084,-0.082117,-0.126497
2,-2.188734,0.486156,-0.28612,-0.2573,-0.133692,-0.192811,0.418413,-0.247563,-0.21497,-0.126497,...,1.186601,-0.68067,-0.305306,0.29455,-0.183001,-0.106542,-0.094603,-0.070084,-0.082117,-0.126497
3,1.308666,0.486156,-0.28612,-0.2573,-0.133692,-0.192811,0.418413,-0.247563,-0.21497,-0.126497,...,-0.842744,1.469141,-0.305306,0.29455,-0.183001,-0.106542,-0.094603,-0.070084,-0.082117,-0.126497
4,1.227148,0.486156,-0.28612,-0.2573,-0.133692,-0.192811,0.418413,-0.247563,-0.21497,-0.126497,...,-0.842744,1.469141,-0.305306,0.29455,-0.183001,-0.106542,-0.094603,-0.070084,-0.082117,-0.126497


In [None]:
# # Solving corrlinearity
# X["dating_physical_hurting"] = X["dating_physical_hurting_1.0"].fillna(0)
# # Drop the two dummy columns to avoid collinearity
# X.drop(columns=["dating_physical_hurting_0.0", "dating_physical_hurting_1.0"],inplace=True)

# X.drop(columns=["healthy_diet_0", "healthy_diet_1"],inplace=True)
# X.drop(columns=["std_checked_0", "std_checked_1"], inplace=True)
# X.drop(columns=["substance_use_0", "substance_use_1"], inplace=True)
# X.drop(columns=["unsafe_driving_0", "unsafe_driving_1"], inplace=True)
# X.drop(columns=["unsafe_sex_0", "unsafe_sex_1"], inplace=True)

In [29]:
# 确保输入数据为 NumPy 数组
X_np = X.to_numpy() if hasattr(X_st, 'to_numpy') else X_st
y_np = y.to_numpy() if hasattr(y_reg, 'to_numpy') else y_reg

In [30]:
scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
}

param_grid_lr = {
    'C': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13],
    'penalty': ['l2']  # cuML only support l2 regularization
}

log_reg_gpu = cuLogisticRegression()

grid_search_lr_gpu = GridSearchCV(
    log_reg_gpu,
    param_grid_lr,
    cv=5,
    scoring=scoring_metrics,
    refit='Accuracy',
    verbose=3
)

grid_search_lr_gpu.fit(X_np, y_np)

print(f"Best parameters for GPU Logistic Regression: {grid_search_lr_gpu.best_params_}")

results = {}
for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[metric] = grid_search_lr_gpu.cv_results_[mean_key][grid_search_lr_gpu.best_index_]
    results[f'std_{metric}'] = grid_search_lr_gpu.cv_results_[std_key][grid_search_lr_gpu.best_index_]

print("GPU Logistic Regression:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[metric]:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END C=0.05, penalty=l2; Accuracy: (test=0.750) F1: (test=0.750) Precision: (test=0.750) Recall: (test=0.750) total time=   0.5s
[CV 2/5] END C=0.05, penalty=l2; Accuracy: (test=0.751) F1: (test=0.751) Precision: (test=0.752) Recall: (test=0.751) total time=   0.4s
[CV 3/5] END C=0.05, penalty=l2; Accuracy: (test=0.727) F1: (test=0.727) Precision: (test=0.727) Recall: (test=0.727) total time=   0.5s
[CV 4/5] END C=0.05, penalty=l2; Accuracy: (test=0.749) F1: (test=0.749) Precision: (test=0.749) Recall: (test=0.749) total time=   0.5s
[CV 5/5] END C=0.05, penalty=l2; Accuracy: (test=0.762) F1: (test=0.762) Precision: (test=0.762) Recall: (test=0.762) total time=   0.5s
[CV 1/5] END C=0.06, penalty=l2; Accuracy: (test=0.751) F1: (test=0.751) Precision: (test=0.751) Recall: (test=0.751) total time=   0.4s
[CV 2/5] END C=0.06, penalty=l2; Accuracy: (test=0.753) F1: (test=0.753) Precision: (test=0.754) Recall: (test=0.753) 

In [31]:
log_reg = LogisticRegression()
param_grid_lr = {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1],
                 'penalty': ['l1'],
                 'solver': ['liblinear']}

scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_lr.fit(X_np, y_np)

# print best parameters
print(f"Best parameters for Logistic Regression: {grid_search_lr.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_lr.cv_results_[mean_key][grid_search_lr.best_index_]
    results[f'std_{metric}'] = grid_search_lr.cv_results_[std_key][grid_search_lr.best_index_]

# Print results for the current model
print(f"Logistic Regression:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END C=0.01, penalty=l1, solver=liblinear; Accuracy: (test=0.749) F1: (test=0.749) Precision: (test=0.750) Recall: (test=0.749) total time=   0.1s
[CV 2/5] END C=0.01, penalty=l1, solver=liblinear; Accuracy: (test=0.722) F1: (test=0.722) Precision: (test=0.723) Recall: (test=0.722) total time=   0.1s
[CV 3/5] END C=0.01, penalty=l1, solver=liblinear; Accuracy: (test=0.713) F1: (test=0.713) Precision: (test=0.713) Recall: (test=0.713) total time=   0.1s
[CV 4/5] END C=0.01, penalty=l1, solver=liblinear; Accuracy: (test=0.720) F1: (test=0.720) Precision: (test=0.720) Recall: (test=0.720) total time=   0.2s
[CV 5/5] END C=0.01, penalty=l1, solver=liblinear; Accuracy: (test=0.745) F1: (test=0.745) Precision: (test=0.746) Recall: (test=0.745) total time=   0.1s
[CV 1/5] END C=0.02, penalty=l1, solver=liblinear; Accuracy: (test=0.748) F1: (test=0.747) Precision: (test=0.748) Recall: (test=0.747) total time=   0.1s
[CV 2/5] 

In [32]:
lda = LinearDiscriminantAnalysis()
param_grid_lda = {'solver': ['svd', 'lsqr']}

scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_lda = GridSearchCV(lda, param_grid_lda, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_lda.fit(X_np, y_np)

# print best parameters
print(f"Best parameters for LDA: {grid_search_lda.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_lda.cv_results_[mean_key][grid_search_lda.best_index_]
    results[f'std_{metric}'] = grid_search_lda.cv_results_[std_key][grid_search_lda.best_index_]

# Print results for the current model
print(f"LDA:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END solver=svd; Accuracy: (test=0.756) F1: (test=0.756) Precision: (test=0.756) Recall: (test=0.756) total time=   0.1s
[CV 2/5] END solver=svd; Accuracy: (test=0.750) F1: (test=0.750) Precision: (test=0.751) Recall: (test=0.750) total time=   0.1s
[CV 3/5] END solver=svd; Accuracy: (test=0.731) F1: (test=0.731) Precision: (test=0.732) Recall: (test=0.731) total time=   0.3s
[CV 4/5] END solver=svd; Accuracy: (test=0.740) F1: (test=0.740) Precision: (test=0.740) Recall: (test=0.740) total time=   0.4s
[CV 5/5] END solver=svd; Accuracy: (test=0.757) F1: (test=0.757) Precision: (test=0.758) Recall: (test=0.757) total time=   0.1s
[CV 1/5] END solver=lsqr; Accuracy: (test=0.756) F1: (test=0.756) Precision: (test=0.756) Recall: (test=0.756) total time=   0.2s
[CV 2/5] END solver=lsqr; Accuracy: (test=0.750) F1: (test=0.750) Precision: (test=0.751) Recall: (test=0.750) total time=   0.1s
[CV 3/5] END solver=lsqr; Accuracy:

In [33]:
qda = QuadraticDiscriminantAnalysis()
param_grid_qda = {'reg_param': [0.0, 0.1, 0.2]}

scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_qda = GridSearchCV(qda, param_grid_qda, cv=5, scoring=scoring_metrics, refit='Accuracy')
grid_search_qda.fit(X_np, y_np)

# print best parameters
print(f"Best parameters for QDA: {grid_search_qda.best_params_}")
# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_qda.cv_results_[mean_key][grid_search_qda.best_index_]
    results[f'std_{metric}'] = grid_search_qda.cv_results_[std_key][grid_search_qda.best_index_]

# Print results for the current model
print(f"QDA:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")



Best parameters for QDA: {'reg_param': 0.2}
QDA:
  Mean Accuracy: 0.6694, Standard Deviation: 0.0175
  Mean Precision: 0.6959, Standard Deviation: 0.0214
  Mean Recall: 0.6694, Standard Deviation: 0.0175
  Mean F1: 0.6580, Standard Deviation: 0.0179


In [34]:
knn = KNeighborsClassifier()
param_grid_KNN = {'n_neighbors': [2, 3, 4], 'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan']}
scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_KNN = GridSearchCV(knn, param_grid_KNN, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_KNN.fit(X_np, y_np)

# print best parameters
print(f"Best parameters for KNN: {grid_search_KNN.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_KNN.cv_results_[mean_key][grid_search_KNN.best_index_]
    results[f'std_{metric}'] = grid_search_KNN.cv_results_[std_key][grid_search_KNN.best_index_]

# Print results for the current model
print(f"KNN:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END metric=euclidean, n_neighbors=2, weights=uniform; Accuracy: (test=0.586) F1: (test=0.550) Precision: (test=0.627) Recall: (test=0.586) total time=   0.1s
[CV 2/5] END metric=euclidean, n_neighbors=2, weights=uniform; Accuracy: (test=0.589) F1: (test=0.564) Precision: (test=0.615) Recall: (test=0.589) total time=   0.1s
[CV 3/5] END metric=euclidean, n_neighbors=2, weights=uniform; Accuracy: (test=0.586) F1: (test=0.561) Precision: (test=0.610) Recall: (test=0.586) total time=   0.1s
[CV 4/5] END metric=euclidean, n_neighbors=2, weights=uniform; Accuracy: (test=0.587) F1: (test=0.565) Precision: (test=0.610) Recall: (test=0.588) total time=   0.1s
[CV 5/5] END metric=euclidean, n_neighbors=2, weights=uniform; Accuracy: (test=0.572) F1: (test=0.542) Precision: (test=0.596) Recall: (test=0.572) total time=   0.1s
[CV 1/5] END metric=euclidean, n_neighbors=2, weights=distance; Accuracy: (test=0.605) F1: (test=0.604) 

In [19]:
dt = DecisionTreeClassifier()
param_grid_dt = {'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring=scoring_metrics, refit='Accuracy')
grid_search_dt.fit(X, y)

# print best parameters
print(f"Best parameters for Decision Tree: {grid_search_dt.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_dt.cv_results_[mean_key][grid_search_dt.best_index_]
    results[f'std_{metric}'] = grid_search_dt.cv_results_[std_key][grid_search_dt.best_index_]

# Print results for the current model
print(f"Decision Tree:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree:
  Mean Accuracy: 0.6990, Standard Deviation: 0.0117
  Mean Precision: 0.7004, Standard Deviation: 0.0115
  Mean Recall: 0.6989, Standard Deviation: 0.0117
  Mean F1: 0.6984, Standard Deviation: 0.0119


In [20]:
rf = RandomForestClassifier()
param_grid_rf = {'n_estimators': [50, 100, 200],
                 'max_depth': [None, 10, 20],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4],
                 'bootstrap': [True, False]}
scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
    }

# GridSearchCV
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_rf.fit(X, y)

# print best parameters
print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_rf.cv_results_[mean_key][grid_search_rf.best_index_]
    results[f'std_{metric}'] = grid_search_rf.cv_results_[std_key][grid_search_rf.best_index_]

# Print results for the current model
print(f"Random Forest:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV 1/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; Accuracy: (test=0.773) F1: (test=0.773) Precision: (test=0.773) Recall: (test=0.773) total time=   0.4s
[CV 2/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; Accuracy: (test=0.764) F1: (test=0.764) Precision: (test=0.764) Recall: (test=0.764) total time=   0.4s
[CV 3/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; Accuracy: (test=0.737) F1: (test=0.737) Precision: (test=0.737) Recall: (test=0.737) total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; Accuracy: (test=0.751) F1: (test=0.751) Precision: (test=0.751) Recall: (test=0.751) total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; Accura

In [21]:
# lightgbm
lightgbm = LGBMClassifier()
param_grid_lightgbm = {'n_estimators': [70,80,90,100,110,120,130,140,150],
                       'max_depth': [3,4,5,6,7,8,9],
                       'learning_rate': [0.05,0.06,0.07,0.08,0.09,0.1,1]}
scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
}

# GridSearchCV
grid_search_lightgbm = GridSearchCV(lightgbm, param_grid_lightgbm, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_lightgbm.fit(X, y)

# print best parameters
print(f"Best parameters for LightGBM: {grid_search_lightgbm.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_lightgbm.cv_results_[mean_key][grid_search_lightgbm.best_index_]
    results[f'std_{metric}'] = grid_search_lightgbm.cv_results_[std_key][grid_search_lightgbm.best_index_]

# Print results for the current model
print(f"LightGBM:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 1/5] END learning_rate=1, max_depth=5, n_estimators=130; Accuracy: (test=0.734) F1: (test=0.734) Precision: (test=0.734) Recall: (test=0.734) total time=   0.2s
[LightGBM] [Info] Number of positive: 2210, number of negative: 2209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 4419, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500113 -> initscore=0.000453
[LightGBM] [Info] Start training from score 0.000453
[CV 2/5] END learning_rate=1, max_depth=5, n_estimators=130; Accuracy: (test=0.710) F1: (test=0.710) Precision: (test=0.710) Recall: (test=0.710) total time=   0.2s
[LightGBM] [Info] Number of positive: 2209, number o

In [22]:
# xgboost
xgboost = XGBClassifier()
param_grid_xgboost = {'n_estimators': [70,80,90,100,110,120,130,140,150],
                      'max_depth': [2,3,4,5,6,7,8,9],
                      'learning_rate': [0.05,0.06,0.07,0.08,0.09,0.1,1]}
scoring_metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='macro', zero_division=1),
    'Recall': make_scorer(recall_score, average='macro'),
    'F1': make_scorer(f1_score, average='macro')
}

# GridSearchCV
grid_search_xgboost = GridSearchCV(xgboost, param_grid_xgboost, cv=5, scoring=scoring_metrics, refit='Accuracy', verbose=3)
grid_search_xgboost.fit(X, y)

# print best parameters
print(f"Best parameters for XGBoost: {grid_search_xgboost.best_params_}")

# print scores
results = {}

for metric in scoring_metrics:
    mean_key = f'mean_test_{metric}'
    std_key = f'std_test_{metric}'
    results[f'{metric}'] = grid_search_xgboost.cv_results_[mean_key][grid_search_xgboost.best_index_]
    results[f'std_{metric}'] = grid_search_xgboost.cv_results_[std_key][grid_search_xgboost.best_index_]

# Print results for the current model
print(f"XGBoost:")
for metric in scoring_metrics:
    print(f"  Mean {metric}: {results[f'{metric}']:.4f}, Standard Deviation: {results[f'std_{metric}']:.4f}")



Fitting 5 folds for each of 504 candidates, totalling 2520 fits
[CV 1/5] END learning_rate=0.05, max_depth=2, n_estimators=70; Accuracy: (test=0.753) F1: (test=0.753) Precision: (test=0.753) Recall: (test=0.753) total time=   0.2s
[CV 2/5] END learning_rate=0.05, max_depth=2, n_estimators=70; Accuracy: (test=0.741) F1: (test=0.741) Precision: (test=0.741) Recall: (test=0.741) total time=   0.1s
[CV 3/5] END learning_rate=0.05, max_depth=2, n_estimators=70; Accuracy: (test=0.729) F1: (test=0.729) Precision: (test=0.730) Recall: (test=0.729) total time=   0.1s
[CV 4/5] END learning_rate=0.05, max_depth=2, n_estimators=70; Accuracy: (test=0.729) F1: (test=0.729) Precision: (test=0.730) Recall: (test=0.729) total time=   0.1s
[CV 5/5] END learning_rate=0.05, max_depth=2, n_estimators=70; Accuracy: (test=0.752) F1: (test=0.752) Precision: (test=0.752) Recall: (test=0.752) total time=   0.1s
[CV 1/5] END learning_rate=0.05, max_depth=2, n_estimators=80; Accuracy: (test=0.754) F1: (test=0.754