In [1]:
from typing import Tuple, List

import joblib
import pickle

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
#import xgboost as xgb

from preprocessing import CategoricalPreprocessor
from preprocessing import DinamChangesInserter
from preprocessing import Splitter
from cfg import *

In [2]:
# Load the raw dataset from a local pickle file into `data`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
# `data` is presumably a pandas DataFrame (later cells read .index and .columns) — confirm.
with open("./data/raw/covid_flow.pkl", 'rb') as data_file:
    data = pickle.load(data_file)
#data = data.reset_index(drop=True)

In [3]:
# Number of distinct index values in the loaded data.
np.unique(data.index).size

1992

In [4]:
# Columns whose name contains the dynamic-fact marker from cfg.
dinam_fact_columns = [
    name for name in data.columns if DINAM_FACT_PREFIX in name
]
dinam_fact_columns

['Температура_dinam_fact',
 'Лимфоциты#_dinam_fact',
 'АСТ_dinam_fact',
 'ЧСС_dinam_fact',
 'ЧДД_dinam_fact',
 'Билирубин общий_dinam_fact',
 'MPV- Средн.объем тромбоцитов_dinam_fact',
 'PCT- Тромбокрит_dinam_fact',
 'Лимфоциты%_dinam_fact',
 'снижение_сознания_dinam_fact',
 'Cтепень тяжести по КТ_dinam_fact',
 'Лактатдегидрогеназа_dinam_fact',
 'PDW- Индекс расп.по объему тр._dinam_fact']

In [5]:
# Columns whose name contains the dynamic-control marker; these are used as
# prediction targets in the cells below.
target_columns = [name for name in data if DINAM_CONTROL_PREFIX in name]
target_columns

['трансфузия_dinam_control',
 'оксигенотерапия_dinam_control',
 'НИВЛ_dinam_control',
 'ИВЛ_dinam_control']

In [6]:
# Feature columns: names carrying any of the three feature markers, excluding
# anything that also carries the dynamic-control (target) marker.
feature_markers = (STAT_CONTROL_PREFIX, STAT_FACT_PREFIX, DINAM_FACT_PREFIX)

feat_columns = []
for column in data:
    if DINAM_CONTROL_PREFIX in column:
        continue
    if any(marker in column for marker in feature_markers):
        feat_columns.append(column)

In [15]:
# Preprocessing chain applied to the feature columns, in this order:
# project encoder -> project change inserter -> KNN imputation -> min-max scaling.
# NOTE(review): the chain is fitted on every row of `data`, i.e. before the
# train/test split done in the model cells, so imputer/scaler statistics include
# rows that later end up in the test sets — confirm this is acceptable.
preprocessing_steps = [
    ('encoder', CategoricalPreprocessor()),
    ('change_inserter', DinamChangesInserter(use_columns=dinam_fact_columns)),
    ('imputer', KNNImputer(n_neighbors=20)),
    ('scaler', MinMaxScaler()),
]
preprocessors = Pipeline(preprocessing_steps)

raw_features = data[feat_columns]
feat_data = preprocessors.fit_transform(raw_features)

In [16]:
# Build one class-balanced matrix per target column.
#
# Each matrix holds the preprocessed features with the target appended as the
# last column. All rows labelled 1 are kept and rows labelled 0 are randomly
# undersampled to at most NEGATIVES_PER_POSITIVE times the number of positives.
# Rows whose label is neither 0 nor 1 (e.g. missing) match neither mask and are
# therefore left out.
NEGATIVES_PER_POSITIVE = 2

# Dedicated seeded generator: the undersampling (and every score computed from
# it further down) is now identical on each Restart & Run All.
sampling_rng = np.random.default_rng(42)

column_train_data = {}
for column in target_columns:
    temp_data = np.concatenate([feat_data, data[column].values.reshape(-1, 1)], axis=1)
    positive_label = temp_data[temp_data[:, -1] == 1]
    negative_label = temp_data[temp_data[:, -1] == 0]

    # Cap the request at the number of negatives actually available; asking for
    # more rows than exist without replacement would raise ValueError.
    n_negative = min(len(negative_label), NEGATIVES_PER_POSITIVE * len(positive_label))
    keep_rows = sampling_rng.permutation(len(negative_label))[:n_negative]
    negative_label = negative_label[keep_rows]

    temp_data = np.concatenate([positive_label, negative_label], axis=0)
    column_train_data[column] = temp_data


In [17]:
# Per target: name - sum of the label column (number of positives) - total rows.
for column, balanced in column_train_data.items():
    positives = balanced[:, -1].sum()
    print(f'{column} - {positives} - {len(balanced)}')

трансфузия_dinam_control - 157.0 - 471
оксигенотерапия_dinam_control - 360.0 - 1080
НИВЛ_dinam_control - 57.0 - 171
ИВЛ_dinam_control - 250.0 - 750


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from catboost import CatBoostClassifier
import xgboost as xgb


In [19]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV

# CatBoost hyper-parameter grid; single-element lists pin a value for all candidates.
params = dict(
    iterations=[500, 1000, 2000],
    depth=[5, 6],
    loss_function=['Logloss', 'CrossEntropy'],
    l2_leaf_reg=[1e-20],
    learning_rate=[0.01, 0.001, 0.05, 0.005],
    leaf_estimation_iterations=[5, 10, 15],
    logging_level=['Silent'],
    random_seed=[42],
)

# F1 is used both for model selection inside the grid search and for the
# hold-out score printed per target.
scorer = make_scorer(f1_score)

# Fitted GridSearchCV object per target column.
grid_dict = {}
for column in target_columns:
    print(column)

    balanced = column_train_data[column]
    # Last column is the label, everything before it is features.
    X_train, X_test, y_train, y_test = train_test_split(
        balanced[:, :-1], balanced[:, -1], random_state=42
    )

    clf = CatBoostClassifier()
    clf_grid = GridSearchCV(
        estimator=clf,
        param_grid=params,
        scoring=scorer,
        cv=3,
        verbose=0,
    )
    clf_grid.fit(X_train, y_train)

    # Score the refitted best model on the held-out part.
    prediction = clf_grid.best_estimator_.predict(X_test)
    print(f1_score(y_test, prediction))

    grid_dict[column] = clf_grid

трансфузия_dinam_control
0.7654320987654322
оксигенотерапия_dinam_control
0.44755244755244755
НИВЛ_dinam_control
0.6153846153846154
ИВЛ_dinam_control
0.7819548872180451


In [20]:
for column in target_columns:
    print(grid_dict[column].best_params_)

{'depth': 5, 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'learning_rate': 0.001, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
{'depth': 5, 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 5, 'learning_rate': 0.05, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
{'depth': 5, 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
{'depth': 5, 'iterations': 1000, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 15, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}


In [21]:
# XGBoost hyper-parameter grid.
# NOTE: this rebinds `params` (and also binds `parameters`), replacing the
# CatBoost grid defined in the earlier search cell.
params = parameters = dict(
    max_depth=[4, 5, 6, 7, 8],
    n_estimators=[50, 100, 200, 400, 1000],
    learning_rate=[0.01, 0.001, 0.05, 0.005],
)

# F1 drives model selection and is also the hold-out metric printed below.
scorer = make_scorer(f1_score)

# Fitted GridSearchCV object per target column.
grid_dict2 = {}
for column in target_columns:
    print(column)

    balanced = column_train_data[column]
    # Last column is the label, everything before it is features.
    X_train, X_test, y_train, y_test = train_test_split(
        balanced[:, :-1], balanced[:, -1], random_state=42
    )

    clf = xgb.XGBClassifier()
    clf_grid = GridSearchCV(
        estimator=clf,
        param_grid=params,
        scoring=scorer,
        cv=3,
        verbose=0,
    )
    clf_grid.fit(X_train, y_train)

    # Score the refitted best model on the held-out part.
    prediction = clf_grid.best_estimator_.predict(X_test)
    print(f1_score(y_test, prediction))

    grid_dict2[column] = clf_grid

трансфузия_dinam_control
0.6206896551724138
оксигенотерапия_dinam_control
0.4840764331210191
НИВЛ_dinam_control
0.7741935483870969
ИВЛ_dinam_control
0.7681159420289856
