In [6]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from run_xgb.check_importance import check_importance
from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)

In [7]:
df = pd.read_pickle('../data/input/owid-covid-data.pickle')
config = read_config()

In [8]:
isnull_vacc_or_death = (df['people_fully_vaccinated_per_hundred'].isnull()) \
                        | (df['new_deaths_per_million'].isnull())

df = df[~isnull_vacc_or_death]
print(df.shape)

(37603, 67)


In [9]:
def get_tomorrow_label(df, config):
    
    df_lst = []
    for country in df['iso_code'].unique():
        country_df = df[df['iso_code']==country]
        country_df['new_deaths_per_million_tomorrow'] = country_df['new_deaths_per_million'].shift(-1).values
        country_df = country_df.iloc[:-1]

        df_lst.append(country_df)
    
    res_df = pd.concat(df_lst)[['date'] + config['train_cols'] + [config['label_col']]]
    res_df = pd.concat(df_lst)
    res_df.reset_index(drop=True, inplace=True)
    return res_df

In [10]:
shift_df = get_tomorrow_label(df, config)
shift_df['date'] = pd.to_datetime(shift_df['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
print(shift_df.shape)

(37389, 68)


### Model check

In [12]:
train, val = train_test_split(shift_df, train_size=.7)

In [13]:
class Splitter():
    
    def __init__(self):
        self.n_splits = 1
    
    def split(self, df):
        q7 = df['date'].quantile(.7)
        df.reset_index(inplace=True)
        yield (df[df['date'] <= q7].index.values, df[df['date'] > q7].index.values)


In [14]:
for train_index, test_index in Splitter().split(shift_df):
    pass

In [15]:
res = run_xgb(train=train,
              val=val,
              train_cols=config['train_cols'],
              train_labels=train[config['label_col']],
              val_labels=val[config['label_col']],
              booster_params=config['booster_params'],
              train_params=config['train_params'],
              log_params=config['log_params'],
              kfold=Splitter(),
              metric=mean_absolute_error)

../data/result/02-16-12
[0]	train-rmse:5.27703	val-rmse:4.89049
[10]	train-rmse:4.53096	val-rmse:4.35526
[20]	train-rmse:4.04349	val-rmse:4.00874
[30]	train-rmse:3.72854	val-rmse:3.78802
[40]	train-rmse:3.52800	val-rmse:3.64872
[50]	train-rmse:3.40154	val-rmse:3.56589
[60]	train-rmse:3.31752	val-rmse:3.51657
[70]	train-rmse:3.26127	val-rmse:3.48734
[80]	train-rmse:3.21676	val-rmse:3.46551
[90]	train-rmse:3.18589	val-rmse:3.45497
[100]	train-rmse:3.16185	val-rmse:3.44928
[110]	train-rmse:3.14269	val-rmse:3.44525
[120]	train-rmse:3.12576	val-rmse:3.44271
[126]	train-rmse:3.11553	val-rmse:3.44352


### Feature importance checker

In [16]:
with open('../data/result/02-16-12/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [17]:
result = check_importance(shift_df,
                          model,
                          input_col='people_fully_vaccinated_per_hundred',
                          input_change_method='rel',
                          target_measure_method='mass_rel',
                          agg_method='mean',
                          increase_koeff=25)

Start preds mean --> 2.52    changed preds mean --> 2.40
-0.0487


In [18]:
shift_df[config['label_col']].mean()

2.6841397202385724

In [None]:
d = model.get_score(importance_type='gain')
pd.DataFrame({'feature': d.keys(), 'score': d.values()}).sort_values(by='score', ascending=False)[:20]