In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from run_xgb.check_importance import check_importance
from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)

In [2]:
df = pd.read_pickle('../data/input/owid-covid-data.pickle')
config = read_config()

In [3]:
isnull_vacc_or_death = (df['people_fully_vaccinated_per_hundred'].isnull()) \
                        | (df['new_deaths_per_million'].isnull())

df = df[~isnull_vacc_or_death]
print(df.shape)

(37603, 67)


In [4]:
def get_yesterday_feat(df, config):
    
    df_lst = []
    for country in df['iso_code'].unique():
        country_df = df[df['iso_code']==country]
        country_df['new_deaths_per_million_shift1'] = country_df['new_deaths_per_million'].shift(1).values
        country_df = country_df.iloc[1:]

        df_lst.append(country_df)
    
#     res_df = pd.concat(df_lst)[['date'] + config['train_cols'] + [config['label_col']]]
    res_df = pd.concat(df_lst)
    res_df.reset_index(drop=True, inplace=True)
    return res_df

In [5]:
train_cols = list(df.dtypes[df.dtypes != 'object'].index) + ['new_deaths_per_million_shift1']

train_cols.remove(config['label_col'])
config['train_cols'] = train_cols

print(config['label_col'] in config['train_cols'])

False


In [6]:
shift_df = get_yesterday_feat(df, config)
shift_df['date'] = pd.to_datetime(shift_df['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
print(shift_df.shape)

(37389, 68)


### Model check

In [8]:
train, val = train_test_split(shift_df, train_size=.7)

In [9]:
class Splitter():
    
    def __init__(self):
        self.n_splits = 1
    
    def split(self, df):
        q7 = df['date'].quantile(.7)
        df.reset_index(inplace=True)
        yield (df[df['date'] <= q7].index.values, df[df['date'] > q7].index.values)


In [10]:
for train_index, test_index in Splitter().split(shift_df):
    pass

In [11]:
res = run_xgb(train=train,
              val=val,
              train_cols=config['train_cols'],
              train_labels=train[config['label_col']],
              val_labels=val[config['label_col']],
              booster_params=config['booster_params'],
              train_params=config['train_params'],
              log_params=config['log_params'],
              kfold=Splitter(),
              metric=mean_absolute_error)

../data/result/02-16-10
[0]	train-rmse:5.25245	val-rmse:4.98284
[10]	train-rmse:4.18347	val-rmse:3.98728
[20]	train-rmse:3.40583	val-rmse:3.29928
[30]	train-rmse:2.85216	val-rmse:2.86131
[40]	train-rmse:2.46102	val-rmse:2.59426
[50]	train-rmse:2.18621	val-rmse:2.46437
[60]	train-rmse:1.98617	val-rmse:2.40377
[69]	train-rmse:1.85401	val-rmse:2.42558


### Feature importance checker

In [12]:
with open('../data/result/02-16-10/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [14]:
result = check_importance(shift_df,
                          model,
                          input_col='people_fully_vaccinated_per_hundred',
                          input_change_method='rel',
                          target_measure_method='mass_rel',
                          agg_method='mean',
                          increase_koeff=25)

Start preds mean --> 2.47    changed preds mean --> 2.47
0.0000


In [15]:
df[config['label_col']].mean()

2.687636438582028