## Импорт необходимого функционала

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import notebook
from xgboost import XGBClassifier

## Загрузка данных

In [2]:
# Read the training table from the CSV file shipped in the data/ directory.
train = pd.read_csv("data/mars-train-class.csv")
# Sanity check shown as the cell output: (rows, columns) and the total
# number of missing cells across the whole frame.
(train.shape, train.isna().to_numpy().sum())

((11915, 13), 0)

In [3]:
# Pairwise Pearson correlation between all columns (pandas' default method,
# spelled out here so the reader does not have to look it up).
train.corr(method="pearson")

Unnamed: 0,№ испытания,Модуль сигнала,Тип_измерения,Количество импульсов,Фаза Hor,Фаза Ver,Уровень шума,Азимут,У.М.,Секунда,Дальность (м),Доля сигнала в ВП,Тип марсианина
№ испытания,1.0,-0.02502,-0.017452,-0.020188,0.002079,-0.008238,-0.130296,-0.116182,0.141618,0.998006,0.038307,0.124126,-0.169779
Модуль сигнала,-0.02502,1.0,-0.000286,-0.028387,-0.00697,0.002894,0.77885,-0.122965,0.378563,-0.023539,-0.337502,0.097808,0.02158
Тип_измерения,-0.017452,-0.000286,1.0,-0.797615,-0.192059,-0.194575,-0.238472,-0.038155,0.128842,-0.030021,-0.131973,-0.038494,0.019426
Количество импульсов,-0.020188,-0.028387,-0.797615,1.0,0.159007,0.156408,0.259612,0.028902,-0.121065,-0.010993,0.01541,0.022666,0.123542
Фаза Hor,0.002079,-0.00697,-0.192059,0.159007,1.0,0.166253,0.040993,0.011824,-0.040392,0.005382,0.025235,-0.002086,0.003897
Фаза Ver,-0.008238,0.002894,-0.194575,0.156408,0.166253,1.0,0.046959,0.022025,-0.027857,-0.004896,0.023368,0.015075,0.002198
Уровень шума,-0.130296,0.77885,-0.238472,0.259612,0.040993,0.046959,1.0,-0.103142,0.151747,-0.125473,-0.139898,0.049155,0.042167
Азимут,-0.116182,-0.122965,-0.038155,0.028902,0.011824,0.022025,-0.103142,1.0,-0.351181,-0.115784,0.298235,-0.048005,0.023581
У.М.,0.141618,0.378563,0.128842,-0.121065,-0.040392,-0.027857,0.151747,-0.351181,1.0,0.136634,-0.664329,0.030407,0.049693
Секунда,0.998006,-0.023539,-0.030021,-0.010993,0.005382,-0.004896,-0.125473,-0.115784,0.136634,1.0,0.039631,0.129095,-0.173169


## Обучение классификатора и оценка предсказательной способности

In [4]:
# One seed shared by NumPy's global RNG, the CV splitter and both boosters,
# so that Restart & Run All reproduces the same folds and the same models.
SEED = 5432
np.random.seed(SEED)

# Stratified, shuffled 5-fold split; the same splitter is reused for every trial.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Unfitted base learners handed to every VotingClassifier below.
ESTIMATORS = [
    ("cb", CatBoostClassifier(random_seed=SEED, depth=10)),
    # Seeded explicitly: XGBoost takes its seed from `random_state`,
    # not from np.random.seed().
    ("xgb", XGBClassifier(n_estimators=800, n_jobs=-1, random_state=SEED)),
]

# Fitted fold models per trial: {trial number: [VotingClassifier, ...]}.
all_models = {}
for trial_id in range(1, 4):
    # Keep only the rows of the current trial. The trial column is constant
    # inside the subset, so it is dropped; the index is reset to 0..n-1.
    trial = (
        train[train["№ испытания"] == trial_id]
        .drop(columns=["№ испытания"])
        .reset_index(drop=True)
    )
    y = trial["Тип марсианина"].to_numpy()
    x = trial.drop(columns=["Тип марсианина"])

    fold_scores, models = [], []
    # kfold.split() is a generator without a length, so the number of folds is
    # passed to tqdm explicitly; otherwise the bar cannot show progress.
    folds = notebook.tqdm(kfold.split(x, y), total=kfold.n_splits)
    for train_idx, val_idx in folds:
        clf = VotingClassifier(estimators=ESTIMATORS, n_jobs=-1, verbose=1)
        # split() yields positional indices, hence .iloc rather than .loc.
        clf.fit(x.iloc[train_idx], y[train_idx])
        models.append(clf)

        preds = clf.predict(x.iloc[val_idx])
        f1 = f1_score(y[val_idx], preds)
        fold_scores.append(f1)
        print(f1)

    all_models[trial_id] = models
    print()
    # Mean F1 over the folds of this trial.
    print(sum(fold_scores) / kfold.n_splits)
    print(50 * '=')

0it [00:00, ?it/s]

0.9995114802149487
0.9990234375
1.0
1.0
1.0

0.9997069835429897


0it [00:00, ?it/s]

0.9970731707317073
0.9970674486803518
0.9990224828934506
0.9990224828934506
0.9980468750000001

0.9980464920397921


0it [00:00, ?it/s]

0.999137187230371
1.0
1.0
0.9991386735572783
0.9982788296041308

0.999310938078356


## Сохранение обученных моделей

In [5]:
# Persist every fitted fold model as models/<trial>_model_<fold>.pkl.
# NOTE: pickle files execute arbitrary code on load; only unpickle these
# artifacts from a trusted location.
MODELS_DIR = "models"
# open(..., "wb") does not create missing parent directories, so make sure
# the target directory exists before writing (no-op if it already does).
os.makedirs(MODELS_DIR, exist_ok=True)

for key, estimators in all_models.items():
    for i, estimator in enumerate(estimators):
        with open(f"{MODELS_DIR}/{key}_model_{i}.pkl", "wb") as file:
            pickle.dump(estimator, file)

---