In [22]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)



In [65]:
df = pd.read_pickle('../data/input/owid-covid-data.pickle')

In [66]:
russia_covid_df = df.loc[df['iso_code']=='RUS']

In [67]:
(~df[['date',
       'aged_70_older',
       'people_fully_vaccinated_per_hundred',
       'new_deaths_per_million']].isnull()).sum(0)

date                                   161618
aged_70_older                          133347
people_fully_vaccinated_per_hundred     38292
new_deaths_per_million                 140496
dtype: int64

In [68]:
isnull_vacc = (df['people_fully_vaccinated_per_hundred'].isnull()) | (df['new_deaths_per_million'].isnull())
df = df[~isnull_vacc]

In [108]:
df['new_deaths_per_million_shift1'] = df['new_deaths_per_million'].shift(1)
df['new_deaths_per_million_delta'] = df['new_deaths_per_million'] - df['new_deaths_per_million_shift1']

In [110]:
df = df.iloc[1:]

In [111]:
df.reset_index(drop=True, inplace=True)
df.shape

(37602, 69)

### Model check

In [126]:
train_cols = ['aged_70_older', 
              'people_fully_vaccinated_per_hundred',
             'new_deaths_per_million_shift1']
label_col = 'new_deaths_per_million'
train_val_border = 200

In [117]:
train, val = train_test_split(df, train_size=.7)

In [128]:
kfold = KFold(n_splits=3)

train_params = {
    'num_boost_round': 200,
    'verbose_eval': 10,
    'early_stopping_rounds': 10
}

booster_params = {
    'objective': 'reg:squarederror',
    
    'nthread': -1,
    'eta': .1,
    'max_depth': 5
}

log_params = {
    'description': 'proba',
    'result_path': '../data/result'
}

In [129]:
res = run_xgb(train=train,
              val=val,
              train_cols=train_cols,
              train_labels=train[label_col],
              val_labels=val[label_col],
              booster_params=booster_params,
              train_params=train_params,
              log_params=log_params,
              kfold=kfold,
              metric=mean_absolute_error)

../data/result/02-14-19
[0]	train-rmse:4.82223	val-rmse:4.88351
[10]	train-rmse:3.49878	val-rmse:3.67075
[20]	train-rmse:3.23747	val-rmse:3.46259
[30]	train-rmse:3.14927	val-rmse:3.41304
[40]	train-rmse:3.07737	val-rmse:3.38714
[50]	train-rmse:3.03749	val-rmse:3.37577
[60]	train-rmse:3.01350	val-rmse:3.36737
[70]	train-rmse:3.00122	val-rmse:3.36514
[80]	train-rmse:2.96591	val-rmse:3.34484
[90]	train-rmse:2.93172	val-rmse:3.33270
[100]	train-rmse:2.91478	val-rmse:3.32655
[110]	train-rmse:2.87696	val-rmse:3.31937
[120]	train-rmse:2.84361	val-rmse:3.32048
[130]	train-rmse:2.82230	val-rmse:3.31624
[140]	train-rmse:2.81061	val-rmse:3.31407
[150]	train-rmse:2.78085	val-rmse:3.30885
[160]	train-rmse:2.75028	val-rmse:3.30495
[170]	train-rmse:2.72840	val-rmse:3.30031
[180]	train-rmse:2.71565	val-rmse:3.29960
[190]	train-rmse:2.69238	val-rmse:3.29616
[199]	train-rmse:2.67289	val-rmse:3.29466
[0]	train-rmse:4.85430	val-rmse:4.80436
[10]	train-rmse:3.49606	val-rmse:3.61901
[20]	train-rmse:3.22306	

### Feature importance checker

In [130]:
with open('../data/result/02-14-19/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [131]:
model.feature_names

['aged_70_older',
 'people_fully_vaccinated_per_hundred',
 'new_deaths_per_million_shift1']

In [132]:
from run_xgb.check_importance import check_importance

In [138]:
result = check_importance(df,
                          model,
                          input_col='people_fully_vaccinated_per_hundred',
                          input_change_method='rel',
                          target_measure_method='mass_rel',
                          agg_method='mean',
                          increase_koeff=25)

Start preds mean --> 2.56    changed preds mean --> 2.43
-0.0474


In [135]:
df[label_col].mean()

2.687699909579278