In [14]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)

In [2]:
df = pd.read_pickle('../data/input/owid-covid-data.pickle')

In [3]:
russia_covid_df = df.loc[df['iso_code']=='RUS']

In [4]:
(~df[['date',
       'aged_70_older',
       'people_fully_vaccinated_per_hundred',
       'new_deaths_per_million']].isnull()).sum(0)

date                                   161618
aged_70_older                          133347
people_fully_vaccinated_per_hundred     38292
new_deaths_per_million                 140496
dtype: int64

In [5]:
isnull_vacc = (df['people_fully_vaccinated_per_hundred'].isnull()) | (df['new_deaths_per_million'].isnull())
df = df[~isnull_vacc]

In [6]:
df['new_deaths_per_million_shift1'] = df['new_deaths_per_million'].shift(1)
df['new_deaths_per_million_delta'] = df['new_deaths_per_million'] - df['new_deaths_per_million_shift1']

In [7]:
df = df.iloc[1:]

In [8]:
df.reset_index(drop=True, inplace=True)
df.shape

(37602, 69)

### Model check

In [9]:
train_cols = ['aged_70_older', 
              'people_fully_vaccinated_per_hundred',
             'new_deaths_per_million_shift1']
label_col = 'new_deaths_per_million'
train_val_border = 200

In [10]:
train, val = train_test_split(df, train_size=.7)

In [17]:
kfold = KFold(n_splits=3)
config = read_config()

In [19]:
res = run_xgb(train=train,
              val=val,
              train_cols=train_cols,
              train_labels=train[label_col],
              val_labels=val[label_col],
              booster_params=config['booster_params'],
              train_params=config['train_params'],
              log_params=config['log_params'],
              kfold=kfold,
              metric=mean_absolute_error)

../data/result/02-15-14
[0]	train-rmse:4.88282	val-rmse:5.00077
[10]	train-rmse:3.62698	val-rmse:3.75716
[20]	train-rmse:3.37292	val-rmse:3.53228
[30]	train-rmse:3.29367	val-rmse:3.49107
[40]	train-rmse:3.23158	val-rmse:3.46930
[50]	train-rmse:3.14154	val-rmse:3.42503
[60]	train-rmse:3.08331	val-rmse:3.40896
[70]	train-rmse:3.05690	val-rmse:3.40928
[0]	train-rmse:4.78858	val-rmse:5.18683
[10]	train-rmse:3.48352	val-rmse:4.00596
[20]	train-rmse:3.21842	val-rmse:3.81156
[30]	train-rmse:3.12279	val-rmse:3.75327
[40]	train-rmse:3.07467	val-rmse:3.73816
[50]	train-rmse:3.04166	val-rmse:3.73194
[60]	train-rmse:3.02256	val-rmse:3.72822
[70]	train-rmse:2.96666	val-rmse:3.72121
[80]	train-rmse:2.94869	val-rmse:3.72135
[0]	train-rmse:5.08681	val-rmse:4.56649
[10]	train-rmse:3.78761	val-rmse:3.32931
[20]	train-rmse:3.52284	val-rmse:3.13613
[30]	train-rmse:3.43127	val-rmse:3.10847
[40]	train-rmse:3.38479	val-rmse:3.11503
[50]	train-rmse:3.34351	val-rmse:3.10726
[60]	train-rmse:3.31578	val-rmse:3.1

### Feature importance checker

In [130]:
with open('../data/result/02-14-19/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [131]:
model.feature_names

['aged_70_older',
 'people_fully_vaccinated_per_hundred',
 'new_deaths_per_million_shift1']

In [132]:
from run_xgb.check_importance import check_importance

In [138]:
result = check_importance(df,
                          model,
                          input_col='people_fully_vaccinated_per_hundred',
                          input_change_method='rel',
                          target_measure_method='mass_rel',
                          agg_method='mean',
                          increase_koeff=25)

Start preds mean --> 2.56    changed preds mean --> 2.43
-0.0474


In [135]:
df[label_col].mean()

2.687699909579278