In [19]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
sklearn.set_config(transform_output='pandas')
from sklearn.metrics import mean_squared_error, mean_absolute_error

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder


from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from functools import partial
import optuna
from sklearn.compose import ColumnTransformer
import joblib

import warnings; warnings.filterwarnings('ignore')


### 1. Abstract
In this notebook we will consider 3 ways of hyperparameters optimization:

- by Grid Search
- by Random Search
- by Optuna optimization module


For testing purposes we will use the Student Classification Dataset from Kaggle (https://www.kaggle.com/datasets/jacksondivakarr/student-classification-dataset).

About Dataset
This dataset encompasses various aspects related to student performance. Each entry is uniquely identified by an 'Id'. The dataset includes demographic information such as 'Student_Age' and 'Sex'. 'High_School_Type' categorizes the type of high school attended, while 'Scholarship' indicates whether the student has a scholarship. Details about 'Additional_Work' and involvement in 'Sports_activity' provide insights into extracurricular commitments.

'Transportation' outlines the mode of commuting for each student. Academic aspects are captured through 'Weekly_Study_Hours', 'Attendance', and evaluations of 'Reading', 'Notes', and 'Listening_in_Class'. The culmination of these factors is reflected in the 'Grade' column, providing a comprehensive overview of student performance. This dataset serves as a valuable resource for exploring the multifaceted dynamics influencing academic outcomes. 

We will try to solve classification problem for "Grade" data

In [2]:
data = pd.read_csv("student.csv")
data = data.drop(['Unnamed: 0', 'Id'],axis=1)
data

Unnamed: 0,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
0,21,Male,Other,50%,Yes,No,Private,0,Always,Yes,Yes,No,No,AA
1,20,Male,Other,50%,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA
2,21,Male,State,50%,No,No,Private,2,Never,No,No,No,Yes,AA
3,18,Female,Private,50%,Yes,No,Bus,2,Always,No,Yes,No,No,AA
4,22,Male,Private,50%,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,22,Female,State,50%,Yes,Yes,Private,0,Always,No,Yes,No,Yes,CC
141,18,Female,State,75%,No,No,Private,0,Never,No,Yes,Yes,No,CC
142,18,Female,Private,75%,No,No,Private,0,Always,Yes,No,No,No,AA
143,22,Female,State,75%,Yes,Yes,Bus,12,Sometimes,No,Yes,No,Yes,CB


In [3]:

X = data.drop('Grade',axis=1)
y = data['Grade']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, train_size=0.7)


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Student_Age         145 non-null    int64 
 1   Sex                 145 non-null    object
 2   High_School_Type    145 non-null    object
 3   Scholarship         144 non-null    object
 4   Additional_Work     145 non-null    object
 5   Sports_activity     145 non-null    object
 6   Transportation      145 non-null    object
 7   Weekly_Study_Hours  145 non-null    int64 
 8   Attendance          145 non-null    object
 9   Reading             145 non-null    object
 10  Notes               145 non-null    object
 11  Listening_in_Class  145 non-null    object
 12  Project_work        145 non-null    object
 13  Grade               145 non-null    object
dtypes: int64(2), object(12)
memory usage: 16.0+ KB


In [6]:
X_train

Unnamed: 0,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work
14,26,Male,State,75%,Yes,Yes,Private,12,Never,No,No,Yes,Yes
87,20,Male,State,50%,No,Yes,Private,2,Always,Yes,No,No,Yes
1,20,Male,Other,50%,Yes,No,Private,0,Always,Yes,No,Yes,Yes
91,20,Male,State,100%,Yes,Yes,Bus,0,Always,No,Yes,Yes,No
92,18,Male,State,50%,No,No,Private,0,Always,Yes,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,25,Male,State,50%,Yes,No,Private,0,Always,No,Yes,No,No
52,22,Female,State,50%,Yes,No,Private,2,Always,Yes,Yes,Yes,No
77,18,Male,Private,25%,No,No,Bus,2,Sometimes,No,6,No,Yes
50,21,Male,State,50%,No,No,Bus,2,Sometimes,Yes,Yes,Yes,Yes


In [7]:
data['Student_Age'].unique()

array([21, 20, 18, 22, 19, 26, 25, 24, 23], dtype=int64)

In [20]:
categorical_features = ["Sex","High_School_Type","Scholarship",
                        "Additional_Work","Sports_activity","Transportation","Attendance",
                        "Reading","Notes","Listening_in_Class","Project_work"]

numeric_features = [i for i in X_train.columns if i not in categorical_features]

# Пайплайн для числовых признаков
pipe_num = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power_tr', PowerTransformer()),
    ('scaler', StandardScaler()) ])

pipe_cat = Pipeline([
    ('imputer', SimpleImputer(strategy= 'constant', fill_value='unknown' )),
    ('encoding', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

ct = ColumnTransformer([
    ('pipe_num', pipe_num, numeric_features),
    ('pipe_cat', pipe_cat, categorical_features),
])

# Полный пайплайн
pipe = Pipeline([
    ('column_transformer', ct),
    ('model', xgb.XGBRegressor()),  # It denotes the fraction of observations to be randomly samples for each tree. 
])

# здесь добавляем энкодер для целевых признаков, потому что randomsearch не работает без этого преобразования
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
joblib.dump(label_encoder,'label_enc.joblib')



['label_enc.joblib']

#### 1. eta

eta [default=0.3, alias: learning_rate]

It is analogous to learning rate in GBM.
It is the step size shrinkage used in update to prevent overfitting.
After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
It makes the model more robust by shrinking the weights on each step.
range : [0,1]
Typical final values : 0.01-0.2.

#### 2.  gamma

gamma [default=0, alias: min_split_loss]

A node is split only when the resulting split gives a positive reduction in the loss function.
Gamma specifies the minimum loss reduction required to make a split.
It makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
The larger gamma is, the more conservative the algorithm will be.
Range: [0,∞]



#### 3. max_depth

max_depth [default=6]

The maximum depth of a tree, same as GBM.
It is used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.
Increasing this value will make the model more complex and more likely to overfit.
The value 0 is only accepted in lossguided growing policy when tree_method is set as hist and it indicates no limit on depth.
We should be careful when setting large value of max_depth because XGBoost aggressively consumes memory when training a deep tree.
range: [0,∞] (0 is only accepted in lossguided growing policy when tree_method is set as hist.
Should be tuned using CV.
Typical values: 3-10

#### 4. min_child_weight

min_child_weight [default=1]

It defines the minimum sum of weights of all observations required in a child.
This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
It is used to control over-fitting.
Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
    Too high values can lead to under-fitting.
Hence, it should be tuned using CV.
The larger min_child_weight is, the more conservative the algorithm will be.
range: [0,∞]

#### 5. max_delta_step

max_delta_step [default=0]

In maximum delta step we allow each tree’s weight estimation to be.
If the value is set to 0, it means there is no constraint.
If it is set to a positive value, it can help making the update step more conservative.
Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced.
Set it to value of 1-10 might help control the update.
range: [0,∞]

#### 6. subsample

subsample [default=1]

It denotes the fraction of observations to be randomly samples for each tree.
    Subsample ratio of the training instances.
Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. - This will prevent overfitting.
Subsampling will occur once in every boosting iteration.
Lower values make the algorithm more conservative and prevents overfitting but too small values might lead to under-fitting.
Typical values: 0.5-1
range: (0,1]


#### 7. colsample_bytree, colsample_bylevel, colsample_bynode
Table of Contents

colsample_bytree, colsample_bylevel, colsample_bynode [default=1]

This is a family of parameters for subsampling of columns.

All colsample_by parameters have a range of (0, 1], the default value of 1, and specify the fraction of columns to be subsampled.

colsample_bytree is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.

colsample_bylevel is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree.

colsample_bynode is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level.

colsample_by* parameters work cumulatively. For instance, the combination {'colsample_bytree':0.5, 'colsample_bylevel':0.5, 'colsample_bynode':0.5} with 64 features will leave 8 features to choose from at each split.

#### 8. lambda

        lambda [default=1, alias: reg_lambda]

L2 regularization term on weights (analogous to Ridge regression).
This is used to handle the regularization part of XGBoost.
Increasing this value will make model more conservative.


#### 9. alpha

alpha [default=0, alias: reg_alpha]

L1 regularization term on weights (analogous to Lasso regression).
It can be used in case of very high dimensionality so that the algorithm runs faster when implemented.
Increasing this value will make model more conservative.


#### 10. tree_method

tree_method string [default= auto]

The tree construction algorithm used in XGBoost.

XGBoost supports approx, hist and gpu_hist for distributed training. Experimental support for external memory is available for approx and gpu_hist.

Choices: auto, exact, approx, hist, gpu_hist

auto: Use heuristic to choose the fastest method.

For small to medium dataset, exact greedy (exact) will be used.

For very large dataset, approximate algorithm (approx) will be chosen.

Because old behavior is always use exact greedy in single machine, user will get a message when approximate algorithm is chosen to notify this choice.

exact: Exact greedy algorithm.

approx: Approximate greedy algorithm using quantile sketch and gradient histogram.

hist: Fast histogram optimized approximate greedy algorithm. It uses some performance improvements such as bins caching.

gpu_hist: GPU implementation of hist algorithm.

#### 11. scale_pos_weight

scale_pos_weight [default=1]

It controls the balance of positive and negative weights,
It is useful for imbalanced classes.
    A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
A typical value to consider: sum(negative instances) / sum(positive instances).


#### 12. max_leaves

max_leaves [default=0]

Maximum number of nodes to be added.
Only relevant when grow_policy=lossguide is set.
There are other hyperparameters like sketch_eps,updater, refresh_leaf, process_type, grow_policy, max_bin, predictor and num_parallel_tree.

(source: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning#2.-XGBoost-hyperparameters- )


In [9]:
params = { 'model__eta': (0.01, 0.2),
    'model__gamma': (0, 5),
    'model__max_depth': (3, 10),
    'model__min_child_weight': (1, 10),
    'model__max_delta_step': (0, 10),
    'model__subsample': (0.1,1),
    'model__colsample_bytree': (0.5, 1),
    'model__colsample_bylevel': (0.5, 1),
    'model__colsample_bynode': (0.5, 1),
    'model__lambda': (0, 5),
    'model__alpha': (0, 5),
    'model__scale_pos_weight': (1, 10),
    'model__max_leaves': (0, 100)
           }

In [10]:
random_search = RandomizedSearchCV(pipe, param_distributions=params, n_iter=10, cv=5)
random_search.fit(X_train, y_train_encoded)


In [11]:
random_search.best_params_

{'model__subsample': 1,
 'model__scale_pos_weight': 10,
 'model__min_child_weight': 1,
 'model__max_leaves': 0,
 'model__max_depth': 3,
 'model__max_delta_step': 0,
 'model__lambda': 5,
 'model__gamma': 5,
 'model__eta': 0.2,
 'model__colsample_bytree': 1,
 'model__colsample_bynode': 0.5,
 'model__colsample_bylevel': 0.5,
 'model__alpha': 0}

In [18]:
result_predicted = np.round(random_search.predict(X_test))
label_encoder.inverse_transform(result_predicted.astype(int))

array(['CC', 'CC', 'BB', 'CC', 'BA', 'CB', 'BB', 'BA', 'CC', 'BA', 'BB',
       'CB', 'BB', 'BB', 'CB', 'BB', 'BA', 'DC', 'CC', 'BB', 'CB', 'DC',
       'CC', 'BA', 'BA', 'BB', 'CB', 'BA', 'BA', 'DC', 'BB', 'BA', 'BB',
       'CC', 'BA', 'BA', 'BA', 'DC', 'BB', 'BA', 'BA', 'BA', 'CB', 'BA'],
      dtype=object)

In [21]:
label_encoder = joblib.load('label_enc.joblib')

In [157]:
grid_search = GridSearchCV(pipe,  param_grid=params, cv=3,n_jobs=-1)
grid_search.fit(X_train, y_train_encoded)


KeyboardInterrupt: 

In [None]:
ml

In [None]:
# Функция для оптимизации с помощью Optuna
def objective(trial):

    # Параметры для оптимизации
    params = {
        'model__eta': trial.suggest_float('model__eta', 0.01, 0.2),
        'model__gamma': trial.suggest_float('model__gamma', 0, 5),
        'model__max_depth': trial.suggest_int('model__max_depth', 3, 10),
        'model__min_child_weight': trial.suggest_float('model__min_child_weight', 1, 10),
        'model__max_delta_step': trial.suggest_float('model__max_delta_step', 0, 10),
        'model__subsample': trial.suggest_float('model__subsample', 0.1,1),
        'model__colsample_bytree': trial.suggest_float('model__colsample_bytree', 0.5, 1),
        'model__colsample_bylevel': trial.suggest_float('model__colsample_bylevel', 0.5, 1),
        'model__colsample_bynode': trial.suggest_float('model__colsample_bynode', 0.5, 1),
        'model__lambda': trial.suggest_float('model__lambda', 0, 5),
        'model__alpha': trial.suggest_float('model__alpha', 0, 5),
        'model__scale_pos_weight': trial.suggest_float('model__scale_pos_weight', 1, 10),
        'model__max_leaves': trial.suggest_int('model__max_leaves', 0, 100),
               #  'model__reg_alpha': trial.suggest_float('model__reg_alpha', 1e-5, 1e2),
               # 'model__reg_lambda': trial.suggest_float('model__reg_lambda', 1e-5, 1e2)
               }

    pipe.set_params(**params)

    # Обучение модели
    cross_val_mse = cross_val_score(pipe,X_train, y_train_encoded).mean()
    # pipe.fit(X_train, y_train)

    return cross_val_mse

# Запуск оптимизации
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Указать желаемое количество итераций

In [None]:
# Accessing the results from RandomizedSearchCV
results = pd.DataFrame()
results['RandomizedSearchCV'] = random_search.best_params_
results['GreedSearchCV'] = grid_search.best_params_
results['Optuna'] = study.best_params_

print(results)

План - найти параметры тремя способами, затем сравнить их и попытаться сравнить качество модели с каждым набором гиперпараметров


Затем проверить по метрикам и построить графики
