## Import libraries


In [13]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from bayes_opt import BayesianOptimization
from sklearn.utils import shuffle 


## Data loading

In [6]:
file_path  = os.path.join(os.getcwd(), '../data', 'processed_data.tsv')
df = pd.read_csv(file_path, sep='\t')
df

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,189,190,191,192,193,194,195,196,197,198
0,SERVICE,-0.128368,-0.469331,-0.177878,-0.192006,-0.222916,-0.093777,0.025445,-0.832899,-0.436870,...,-0.062861,-0.510515,-0.763284,-0.378731,0.528817,-0.049521,-0.329912,0.620485,-0.031838,-0.507967
1,SERVICE,-0.163475,-0.396297,-0.157724,-0.146050,-0.182638,-0.081966,0.078536,-0.641592,-0.299122,...,-0.295817,0.414417,-0.198974,0.003079,0.043172,-0.196704,-0.235280,0.157981,-0.224459,1.148332
2,NON_FOOD_GOODS,-0.327237,-0.892746,-0.421341,-0.114922,0.145603,0.178709,-0.678991,0.014879,-0.191826,...,-2.362266,-2.083801,4.348409,2.111854,8.086853,0.882947,1.015955,0.833849,1.022884,-0.328521
3,LOAN,-0.282064,-0.650740,-0.145728,-0.065063,-0.387599,-0.455714,0.259525,-1.033327,-0.479710,...,-0.021922,0.351193,-0.078121,-0.190551,0.202207,-0.030612,-0.136432,0.240200,0.016642,-0.120984
4,NOT_CLASSIFIED,-0.494004,-1.070177,-0.595625,-0.433807,-0.470061,-0.099003,0.713364,-0.616545,-0.117873,...,-0.618260,0.101671,-0.824624,-0.097199,0.223601,-0.150128,-0.369099,1.199207,0.386687,-0.350164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,TAX,-0.265042,-0.515122,-0.227587,-0.170178,-0.028201,0.028267,-0.192035,-0.286995,-0.204795,...,0.400836,-0.280797,0.010601,0.159663,-0.035141,-0.065599,-0.141998,0.008215,-0.165131,0.121020
496,BANK_SERVICE,-2.048056,20.911902,-10.326095,-3.513630,0.001023,-0.810541,-0.704414,0.892176,-4.624454,...,0.003252,0.008277,0.006612,-0.018563,-0.003845,-0.003299,-0.004852,0.004102,0.007205,0.026897
497,SERVICE,-0.131671,-0.358712,-0.142820,-0.118663,-0.166825,-0.083802,0.085220,-0.600441,-0.291151,...,-0.249828,1.571986,0.073639,-0.104947,0.662228,-0.267097,-1.113358,-0.172560,0.554598,-0.264534
498,FOOD_GOODS,-0.624792,-1.234122,-0.523791,-0.253499,-0.185988,-0.144191,0.002712,-0.402693,0.017636,...,-0.112007,-0.201682,0.092288,0.080106,-0.013846,0.029819,0.083273,-0.144005,-0.165861,-0.013142


In [7]:
X = df.drop(columns=['target'])
y = df['target']

X, y = shuffle(X, y, random_state=42)


## Model

In [8]:
def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap, oob_score):

    bootstrap = bool(bootstrap)
    oob_score = bool(oob_score) if bootstrap else False

    model = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        max_features=max_features,
        bootstrap=bool(bootstrap),  
        oob_score=bool(oob_score), 
        n_jobs=-1,
        random_state=42
    )
    
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        accuracies.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
        recalls.append(recall_score(y_val, y_pred, average='macro'))
        f1_scores.append(f1_score(y_val, y_pred, average='macro'))
    
    return np.mean(accuracies)



## Selection of hyperparameters

In [None]:
param_bounds = {
    'n_estimators': (10, 200), # Количество деревьев
    'max_depth': (5, 50), # Максимальная глубина дерева
    'min_samples_split': (2, 20), # Минимальное количество выборок для разделения узла
    'min_samples_leaf': (1, 20), # Минимальное количество выборок в листе
    'max_features': (0.1, 1.0), # Количество признаков, используемых для построения дерева
    'bootstrap': (0, 1), # Использовать ли bootstrap (0 - False, 1 - True)
    'oob_score': (0, 1) # Использовать ли out-of-bag выборку (0 - False, 1 - True)
}

optimizer = BayesianOptimization(
    f=rf_cv,          
    pbounds=param_bounds,            
    random_state=42
)

optimizer.maximize(
    init_points=5,   
    n_iter=45,   
)

print("Best hyperparameters found: ", optimizer.max)


|   iter    |  target   | bootstrap | max_depth | max_fe... | min_sa... | min_sa... | n_esti... | oob_score |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.85     [0m | [0m0.3745   [0m | [0m47.78    [0m | [0m0.7588   [0m | [0m12.37    [0m | [0m4.808    [0m | [0m39.64    [0m | [0m0.05808  [0m |
| [95m2        [0m | [95m0.908    [0m | [95m0.8662   [0m | [95m32.05    [0m | [95m0.7373   [0m | [95m1.391    [0m | [95m19.46    [0m | [95m168.2    [0m | [95m0.2123   [0m |
| [0m3        [0m | [0m0.874    [0m | [0m0.1818   [0m | [0m13.25    [0m | [0m0.3738   [0m | [0m10.97    [0m | [0m9.775    [0m | [0m65.33    [0m | [0m0.6119   [0m |
| [0m4        [0m | [0m0.868    [0m | [0m0.1395   [0m | [0m18.15    [0m | [0m0.4297   [0m | [0m9.665    [0m | [0m16.13    [0m | [0m47.94    [0m | [0m0.5142   [0m |
| [0m5        [0m | [0m0.898    [0m | 

## Best model

In [12]:
best_params = optimizer.max['params']

n_estimators_best = int(best_params['n_estimators'])
max_depth_best = int(best_params['max_depth'])
min_samples_split_best = int(best_params['min_samples_split'])
min_samples_leaf_best = int(best_params['min_samples_leaf'])
max_features_best = best_params['max_features']
bootstrap_best = bool(best_params['bootstrap'])
oob_score_best = bool(best_params['oob_score']) if bootstrap_best else False

best_model = RandomForestClassifier(
    n_estimators=n_estimators_best,
    max_depth=max_depth_best,
    min_samples_split=min_samples_split_best,
    min_samples_leaf=min_samples_leaf_best,
    max_features=max_features_best,
    bootstrap=bootstrap_best,
    oob_score=oob_score_best,
    n_jobs=-1,
    random_state=42
)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recalls.append(recall_score(y_val, y_pred, average='macro'))
    f1_scores.append(f1_score(y_val, y_pred, average='macro'))

print("Mean Accuracy: ", round(np.mean(accuracies), 3))
print("Mean Precision: ", round(np.mean(precisions), 3))
print("Mean Recall: ", round(np.mean(recalls), 3))
print("Mean F1-Score: ", round(np.mean(f1_scores), 3))


Mean Accuracy:  0.934
Mean Precision:  0.962
Mean Recall:  0.945
Mean F1-Score:  0.948


## Saving parameters

In [None]:
best_params = optimizer.max['params']
output_file_path  = os.path.join(os.getcwd(), '../best_parameters', 'lgbm_params.json')

with open(output_file_path, "w") as file:
    json.dump(best_params, file, indent=4)  

print(f"Best parameters saved to {output_file_path}")
