In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score 
import shap

# import warnings
# warnings.filterwarnings("ignore")


!mkdir data
!wget https://lodmedia.hb.bizmrg.com/case_files/776881/train_dataset_train.csv -P data
!wget https://lodmedia.hb.bizmrg.com/case_files/776881/test_dataset_test.csv -P data
!wget https://lodmedia.hb.bizmrg.com/cases/776881/%D0%9D%D0%BE%D0%B2%D0%B3%D0%BE%D1%80%D0%BE%D0%B4.zip -P data
!unzip data/Новгород.zip -d data

In [3]:
def prep(df):
    df = df.fillna(0)
    df['dayofweek'] = pd.to_datetime(df.Дата).dt.dayofweek
    df['hour'] = pd.to_datetime(df.Время).dt.hour
    df['dayofyear'] = pd.to_datetime(df.Дата).dt.dayofyear
    df['week'] = pd.to_datetime(df.Дата).dt.isocalendar().week.astype(int)
    df['day'] = pd.to_datetime(df.Дата).dt.day
    df['month'] = pd.to_datetime(df.Дата).dt.month
    df['year'] = pd.to_datetime(df.Дата).dt.year
    df['km'] = df.Километр.astype('int64')
    df['meter'] = df.Метр.astype('int64')
    df['address'] = df['Улица'].astype('str') + df['Дом'].astype('str')
    dropcols = ["Погибло детей","Дата", "Время", "id", 'Улица', 'Дом']
    return df.drop(dropcols, axis=1, errors='ignore')

In [4]:
test = pd.read_csv('data/test_dataset_test.csv')
train = pd.read_csv('data/train_dataset_train.csv')
sample_sub = pd.read_csv('data/sample_solution.csv')

In [5]:
train = prep(train)
test = prep(test)

encoder = ce.OrdinalEncoder(cols=['Место', 'Address', 'Дорога', 'Вид ДТП',], return_df=True , verbose = None)
train = encoder.fit_transform(train)
test = encoder.fit_transform(test)

target_cols = ["Погибло", "Ранено", "Ранено детей"] 
train[target_cols] = (train[target_cols] != 0).astype(int) # Приведение целевых переменных к бинарному виду

In [6]:
def plot_feature_importance(importance, names):
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    plt.show()

In [7]:
recall_all = 0
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns= target_cols), train[target_cols], test_size=0.3, random_state=42)
for i in target_cols:
    train_pool = Pool(X_train, y_train[i])
    val_pool = Pool(X_test, y_test[i])
    test_pool = Pool(test)
    base_model = CatBoostClassifier(random_seed=123, custom_metric=['Recall'], auto_class_weights='Balanced', \
                                   depth=4, learning_rate=0.015)
    base_model.fit(train_pool, verbose=500, eval_set=val_pool, use_best_model=True) 
    base_model.save_model(f'data/{i}_MODEL.cbm')
    
    val_pred = base_model.predict(val_pool)
    recall = recall_score(y_test[i].values, val_pred , average='macro')
    recall_all += recall
    print('recall: ', recall)
    
    sample_sub[i] = base_model.predict(test_pool)
    
    plot_feature_importance(base_model.get_feature_importance(), X_train.columns)
    
    explainer = shap.Explainer(base_model)
    shap_values = explainer(X_train)
    shap.plots.beeswarm(shap_values, max_display=40, order=shap_values.abs.max(0))
    plt.show()

print(f'Mean recall: {recall_all / 3}')

In [8]:
# Mean recall: 0.6113447572556285 = 0.466679 on LB
sample_sub.to_csv("sub.csv", index=False)