## Import libraries

In [None]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.utils import shuffle 


## Loading hyperparameters

In [None]:
lgbm_params_path = os.path.join(os.getcwd(), '../best_parameters', 'lgbm_params.json')
catboost_params_path = os.path.join(os.getcwd(), '../best_parameters', 'catboost_params.json')

with open(lgbm_params_path, 'r') as f:
    lgbm_params = json.load(f)

with open(catboost_params_path, 'r') as f:
    catboost_params = json.load(f)

In [4]:
lgbm_params

{'bootstrap': 0.0,
 'max_depth': 20.063756403314922,
 'max_features': 0.1,
 'min_samples_leaf': 1.8751829403343598,
 'min_samples_split': 10.94043278785658,
 'n_estimators': 179.40950184603062,
 'oob_score': 1.0}

In [5]:
catboost_params

{'depth': 4.091961642353418,
 'iterations': 152.27525095137952,
 'learning_rate': 0.08795585311974417,
 'max_bin': 157.27317787708617,
 'min_data_in_leaf': 35.69555631200623}

## Models

In [None]:
lgbm_model = LGBMClassifier(
    n_estimators=int(lgbm_params['n_estimators']),
    max_depth=int(lgbm_params['max_depth']),
    min_samples_split=int(lgbm_params['min_samples_split']),
    min_samples_leaf=int(lgbm_params['min_samples_leaf']),
    max_features=lgbm_params['max_features'],
    bootstrap=bool(lgbm_params['bootstrap']),
    oob_score=bool(lgbm_params['oob_score']) if lgbm_params.get('bootstrap', False) else False,
    random_state=42
)

In [16]:
catboost_params['depth'] = int(catboost_params['depth'])
catboost_params['iterations'] = int(catboost_params['iterations'])
catboost_params['min_data_in_leaf'] = int(catboost_params['min_data_in_leaf'])
catboost_params['max_bin'] = int(catboost_params['max_bin'])

catboost_model = CatBoostClassifier(
    learning_rate=catboost_params['learning_rate'],
    depth=int(catboost_params['depth']),
    iterations=int(catboost_params['iterations']),
    min_data_in_leaf=int(catboost_params['min_data_in_leaf']),
    max_bin=int(catboost_params['max_bin']),
    random_state=42,
    verbose=0  
)


## Data loading

In [8]:
file_path  = os.path.join(os.getcwd(), '../data', 'processed_data.tsv')
df = pd.read_csv(file_path, sep='\t')
df

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,189,190,191,192,193,194,195,196,197,198
0,SERVICE,-0.128368,-0.469331,-0.177878,-0.192006,-0.222916,-0.093777,0.025445,-0.832899,-0.436870,...,-0.062861,-0.510515,-0.763284,-0.378731,0.528817,-0.049521,-0.329912,0.620485,-0.031838,-0.507967
1,SERVICE,-0.163475,-0.396297,-0.157724,-0.146050,-0.182638,-0.081966,0.078536,-0.641592,-0.299122,...,-0.295817,0.414417,-0.198974,0.003079,0.043172,-0.196704,-0.235280,0.157981,-0.224459,1.148332
2,NON_FOOD_GOODS,-0.327237,-0.892746,-0.421341,-0.114922,0.145603,0.178709,-0.678991,0.014879,-0.191826,...,-2.362266,-2.083801,4.348409,2.111854,8.086853,0.882947,1.015955,0.833849,1.022884,-0.328521
3,LOAN,-0.282064,-0.650740,-0.145728,-0.065063,-0.387599,-0.455714,0.259525,-1.033327,-0.479710,...,-0.021922,0.351193,-0.078121,-0.190551,0.202207,-0.030612,-0.136432,0.240200,0.016642,-0.120984
4,NOT_CLASSIFIED,-0.494004,-1.070177,-0.595625,-0.433807,-0.470061,-0.099003,0.713364,-0.616545,-0.117873,...,-0.618260,0.101671,-0.824624,-0.097199,0.223601,-0.150128,-0.369099,1.199207,0.386687,-0.350164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,TAX,-0.265042,-0.515122,-0.227587,-0.170178,-0.028201,0.028267,-0.192035,-0.286995,-0.204795,...,0.400836,-0.280797,0.010601,0.159663,-0.035141,-0.065599,-0.141998,0.008215,-0.165131,0.121020
496,BANK_SERVICE,-2.048056,20.911902,-10.326095,-3.513630,0.001023,-0.810541,-0.704414,0.892176,-4.624454,...,0.003252,0.008277,0.006612,-0.018563,-0.003845,-0.003299,-0.004852,0.004102,0.007205,0.026897
497,SERVICE,-0.131671,-0.358712,-0.142820,-0.118663,-0.166825,-0.083802,0.085220,-0.600441,-0.291151,...,-0.249828,1.571986,0.073639,-0.104947,0.662228,-0.267097,-1.113358,-0.172560,0.554598,-0.264534
498,FOOD_GOODS,-0.624792,-1.234122,-0.523791,-0.253499,-0.185988,-0.144191,0.002712,-0.402693,0.017636,...,-0.112007,-0.201682,0.092288,0.080106,-0.013846,0.029819,0.083273,-0.144005,-0.165861,-0.013142


In [11]:
X = df.drop(columns=['target'])
y = df['target']

X, y = shuffle(X, y, random_state=42)


## Ensemble work

### №1

In [20]:
estimators = [
    (f'lgbm_{i}', lgbm_model) for i in range(5)
] + [
    (f'catboost_{i}', catboost_model) for i in range(5)
]

ensemble = VotingClassifier(
    estimators=estimators,
    voting='soft'  
)

In [21]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_val)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recalls.append(recall_score(y_val, y_pred, average='macro'))
    f1_scores.append(f1_score(y_val, y_pred, average='macro'))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26474
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 199
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -1.714798
[LightGBM] [Info] Start training from score -2.590267
[LightGBM] [Info] Start training from score -2.525729
[LightGBM] [Info] Start training from score -1.647659
[LightGBM] [Info] Start training from score -3.101093
[LightGBM] [Info] Start training from score -2.900422
[LightGBM] [Info] Start training from score -1.742969
[LightGBM] [Info] Start training from score -2.327903
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26474
[LightGBM] [Info] Number of 

In [22]:
print("Mean Accuracy: ", round(np.mean(accuracies), 3))
print("Mean Precision: ", round(np.mean(precisions), 3))
print("Mean Recall: ", round(np.mean(recalls), 3))
print("Mean F1-Score: ", round(np.mean(f1_scores), 3))


Mean Accuracy:  0.92
Mean Precision:  0.942
Mean Recall:  0.929
Mean F1-Score:  0.932


### №2

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

class CompatibleCatBoostClassifier(CatBoostClassifier, BaseEstimator, ClassifierMixin):
    def predict(self, X):
        return super().predict(X).astype(int)
    
    def predict_proba(self, X):
        return super().predict_proba(X)

compatible_catboost_model = CompatibleCatBoostClassifier(**catboost_params, verbose=0)

estimators = [
    (f'lgbm_{i}', lgbm_model) for i in range(5)
] + [
    (f'catboost_{i}', compatible_catboost_model) for i in range(5)
]

ensemble = VotingClassifier(
    estimators=estimators,
    voting='soft'
)


In [26]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_val)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
    recalls.append(recall_score(y_val, y_pred, average='macro'))
    f1_scores.append(f1_score(y_val, y_pred, average='macro'))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26474
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 199
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -1.714798
[LightGBM] [Info] Start training from score -2.590267
[LightGBM] [Info] Start training from score -2.525729
[LightGBM] [Info] Start training from score -1.647659
[LightGBM] [Info] Start training from score -3.101093
[LightGBM] [Info] Start training from score -2.900422
[LightGBM] [Info] Start training from score -1.742969
[LightGBM] [Info] Start training from score -2.327903
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26474
[LightGBM] [Info] Number of 

In [27]:
print("Mean Accuracy: ", round(np.mean(accuracies), 3))
print("Mean Precision: ", round(np.mean(precisions), 3))
print("Mean Recall: ", round(np.mean(recalls), 3))
print("Mean F1-Score: ", round(np.mean(f1_scores), 3))


Mean Accuracy:  0.92
Mean Precision:  0.944
Mean Recall:  0.933
Mean F1-Score:  0.936
