In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from run_xgb.check_importance import check_importance
from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)

In [3]:
df = pd.read_pickle('../data/input/owid-covid-data.pickle')
config = read_config()

In [4]:
isnull_vacc_or_death = (df['people_fully_vaccinated_per_hundred'].isnull()) \
                        | (df['new_deaths_per_million'].isnull())

df = df[~isnull_vacc_or_death]
print(df.shape)

(37603, 67)


In [5]:
def get_tomorrow_label(df, config):
    
    df_lst = []
    for country in df['iso_code'].unique():
        country_df = df[df['iso_code']==country]
        country_df['new_deaths_per_million_tomorrow'] = country_df['new_deaths_per_million'].shift(-1).values
        country_df = country_df.iloc[:-1]

        df_lst.append(country_df)
    
    res_df = pd.concat(df_lst)[['date'] + config['train_cols'] + [config['label_col']]]
    res_df = pd.concat(df_lst)
    res_df.reset_index(drop=True, inplace=True)
    return res_df

In [6]:
shift_df = get_tomorrow_label(df, config)
shift_df['date'] = pd.to_datetime(shift_df['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
print(shift_df.shape)

(37389, 68)


### Model check

In [8]:
train, val = train_test_split(shift_df, train_size=.7)

In [9]:
class Splitter():
    
    def __init__(self):
        self.n_splits = 1
    
    def split(self, df):
        q7 = df['date'].quantile(.7)
        df.reset_index(inplace=True)
        yield (df[df['date'] <= q7].index.values, df[df['date'] > q7].index.values)


In [10]:
for train_index, test_index in Splitter().split(shift_df):
    pass

In [11]:
res = run_xgb(train=train,
              val=val,
              train_cols=config['train_cols'],
              train_labels=train[config['label_col']],
              val_labels=val[config['label_col']],
              booster_params=config['booster_params'],
              train_params=config['train_params'],
              log_params=config['log_params'],
              kfold=Splitter(),
              metric=mean_absolute_error)

../data/result/02-16-12
[0]	train-rmse:5.27017	val-rmse:4.90504
[10]	train-rmse:4.50284	val-rmse:4.26119
[20]	train-rmse:3.99540	val-rmse:3.87766
[30]	train-rmse:3.66484	val-rmse:3.66140
[40]	train-rmse:3.44251	val-rmse:3.54730
[50]	train-rmse:3.28857	val-rmse:3.49151
[60]	train-rmse:3.18076	val-rmse:3.47552
[70]	train-rmse:3.09768	val-rmse:3.46372
[78]	train-rmse:3.04068	val-rmse:3.46949


### Feature importance checker

In [14]:
with open('../data/result/02-16-12/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [15]:
result = check_importance(shift_df,
                          model,
                          input_col='people_fully_vaccinated_per_hundred',
                          input_change_method='rel',
                          target_measure_method='mass_rel',
                          agg_method='mean',
                          increase_koeff=25)

Start preds mean --> 2.61    changed preds mean --> 2.56
-0.0189


In [17]:
shift_df[config['label_col']].mean()

2.6841397202385724

In [18]:
d = model.get_score(importance_type='gain')
pd.DataFrame({'feature': d.keys(), 'score': d.values()}).sort_values(by='score', ascending=False)[:20]

Unnamed: 0,feature,score
7,new_deaths_per_million,6795.056152
9,new_cases_per_million,1387.654907
5,positive_rate,800.651917
11,handwashing_facilities,783.147095
6,people_fully_vaccinated_per_hundred,766.122742
4,diabetes_prevalence,710.852478
8,total_deaths_per_million,584.596008
3,hospital_beds_per_thousand,526.571411
10,total_cases_per_million,361.474182
1,cardiovasc_death_rate,350.229462
