In [3]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from dateutil.relativedelta import relativedelta
from run_xgb.check_importance import check_importance
from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
# Print numpy arrays with three digits of precision.
np.set_printoptions(precision=3)

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas' chained-assignment warnings -- consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [4]:
# Raw input frame, stored as a pickle next to the notebook's data directory.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
owid_pickle_path = '../data/input/owid-covid-data.pickle'
df = pd.read_pickle(owid_pickle_path)
# Project configuration; later cells read 'train_cols', 'label_col',
# 'booster_params', 'train_params' and 'log_params' from it.
config = read_config()

In [5]:
# Keep only rows where both the vaccination figure and the death figure are present.
required_cols = ['people_fully_vaccinated_per_hundred', 'new_deaths_per_million']
isnull_vacc_or_death = df[required_cols].isnull().any(axis=1)

df = df.loc[~isnull_vacc_or_death]
print(df.shape)

(37603, 67)


In [6]:
def get_tomorrow_label(df, config):
    """Attach the next row's ``new_deaths_per_million`` as a label, per country.

    Rows are grouped by ``iso_code`` in order of first appearance. Within each
    group the value of ``new_deaths_per_million`` from the following row is
    stored in ``new_deaths_per_million_tomorrow``, and the group's last row
    (which has no following row) is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``iso_code`` and ``new_deaths_per_million``. Rows are taken
        in their existing order; the frame is not sorted and not modified.
    config : dict
        Kept for backward compatibility; it is not read. (The previous
        implementation only used it for an intermediate result that was
        immediately discarded.)

    Returns
    -------
    pandas.DataFrame
        All original columns plus ``new_deaths_per_million_tomorrow``, with a
        fresh 0..n-1 index. An empty input yields an empty frame that still has
        the label column.
    """
    source_col = 'new_deaths_per_million'
    label_col = 'new_deaths_per_million_tomorrow'

    shifted_parts = []
    # sort=False keeps countries in order of first appearance, matching
    # iteration over df['iso_code'].unique().
    for _, country_rows in df.groupby('iso_code', sort=False):
        # Work on an explicit copy so the new column never touches `df`.
        country_rows = country_rows.copy()
        country_rows[label_col] = country_rows[source_col].shift(-1).values
        # The last row of each country has no "tomorrow" value.
        shifted_parts.append(country_rows.iloc[:-1])

    if not shifted_parts:
        # pd.concat([]) raises; return an empty frame with the expected columns.
        empty_rows = df.iloc[0:0]
        shifted_parts.append(empty_rows.assign(**{label_col: empty_rows[source_col]}))

    return pd.concat(shifted_parts).reset_index(drop=True)

In [7]:
# Add the next-day label, then parse the 'date' column into datetimes.
shift_df = get_tomorrow_label(df, config)
shift_df = shift_df.assign(date=pd.to_datetime(shift_df['date']))

print(shift_df.shape)

(37389, 68)


In [8]:
# Ten iso_code values with the most remaining rows.
rows_per_iso_code = shift_df['iso_code'].value_counts()
rows_per_iso_code.head(10)

OWID_HIC    427
OWID_WRL    427
OWID_NAM    427
USA         427
ISR         421
OWID_ASI    421
OWID_EUR    419
CAN         417
CHE         416
OWID_EUN    413
Name: iso_code, dtype: int64

In [9]:
# True for rows with at least one missing training feature (not used further in this section).
is_null_mask = shift_df[config['train_cols']].isnull().any(axis=1)
# The experiments below run on a single country.
is_country = shift_df['iso_code'].eq('RUS')

country_df = shift_df.loc[is_country]
print(country_df.shape)

(305, 68)


### Last time validation

In [10]:
# Execute time_val.py and load the names it defines into the notebook session.
# NOTE(review): presumably this is where eat_by_points_generator (called in the
# cells below) comes from -- confirm against the script.
%run time_val.py

In [11]:
class Splitter():
    """Single-fold splitter driven by a boolean ``is_train_val`` column.

    Exposes ``n_splits`` and a ``split`` generator, so it can be passed where a
    KFold-like object is expected (here: the ``kfold`` argument of ``run_xgb``).
    """

    def __init__(self):
        # split() only ever produces one (train, validation) pair.
        self.n_splits = 1

    def split(self, df):
        """Yield one ``(train_idx, train_val_idx)`` pair of index arrays.

        ``train_idx`` holds the rows where ``is_train_val`` is False and
        ``train_val_idx`` the rows where it is True. Because the index is reset
        first, the values are 0-based row positions.
        """
        # NOTE(review): this resets the caller's frame in place and moves the
        # old index into a new column. Callers may rely on the reset index, so
        # the side effect is kept -- confirm before changing it.
        df.reset_index(inplace=True)

        flag = df['is_train_val']
        train_idx = df[flag == False].index.values
        train_val_idx = df[flag == True].index.values
        yield (train_idx, train_val_idx)

In [12]:
# Settings passed to eat_by_points_generator in the validation cells below;
# 'n_points_in_train_val' is also the expected number of flagged train-val rows.
val_params = dict(
    n_points_in_val=10,
    n_points_in_train_val=10,
    starting_point=150,
    date_col='date',
)

In [13]:
# One run_xgb result per validation window produced by the generator.
res_lst_simple = []

generator = eat_by_points_generator(country_df,
                                    n_points_in_val=val_params['n_points_in_val'],
                                    n_points_in_train_val=val_params['n_points_in_train_val'],
                                    starting_point=val_params['starting_point'],
                                    date_col=val_params['date_col'])

for (val_mask, train_val_mask) in generator:
    # val_mask selects the validation rows of country_df; everything else is train.
    val = country_df[val_mask]
    # Explicit copy: a column is added below, and Splitter.split() later resets
    # this frame's index in place. Neither should depend on whether the
    # selection happens to be a view of country_df.
    train = country_df[~val_mask].copy()

    # Flag the rows chosen by the generator as the train-val part read by Splitter.
    # NOTE(review): assumes train_val_mask is a valid .loc row selector for
    # `train` -- confirm against eat_by_points_generator.
    train['is_train_val'] = False
    train.loc[train_val_mask, 'is_train_val'] = True

    res = run_xgb(train=train,
                  val=val,
                  train_cols=config['train_cols'],
                  train_labels=train[config['label_col']],
                  val_labels=val[config['label_col']],
                  booster_params=config['booster_params'],
                  train_params=config['train_params'],
                  log_params=config['log_params'],
                  kfold=Splitter(),
                  metric=mean_absolute_error)

    res_lst_simple.append(res)

Val-part starts with --> 2021-08-12
../data/result/02-23-16
[0]	train-rmse:4.59482	val-rmse:4.64233
[100]	train-rmse:0.27688	val-rmse:0.23962
[200]	train-rmse:0.10081	val-rmse:0.08167
[205]	train-rmse:0.09927	val-rmse:0.08156
Val-part starts with --> 2021-08-22
../data/result/02-23-16
[0]	train-rmse:4.59309	val-rmse:4.70972
[100]	train-rmse:0.27656	val-rmse:0.25982
[200]	train-rmse:0.10065	val-rmse:0.09313
[226]	train-rmse:0.09061	val-rmse:0.09283
Val-part starts with --> 2021-09-01
../data/result/02-23-16
[0]	train-rmse:4.59402	val-rmse:4.69231
[100]	train-rmse:0.27685	val-rmse:0.27883
[178]	train-rmse:0.11029	val-rmse:0.11115
Val-part starts with --> 2021-09-11
../data/result/02-23-16
[0]	train-rmse:4.59565	val-rmse:4.68286
[100]	train-rmse:0.27611	val-rmse:0.23955
[200]	train-rmse:0.09818	val-rmse:0.03169
[205]	train-rmse:0.09677	val-rmse:0.03194
Val-part starts with --> 2021-09-22
../data/result/02-23-16
[0]	train-rmse:4.58098	val-rmse:4.64616
[100]	train-rmse:0.27385	val-rmse:0.27

In [14]:
# Average of the per-window results collected in res_lst_simple.
mean_err_simple = np.mean(res_lst_simple)
print('Mean error through validation --> {:.4f}'.format(mean_err_simple))

Mean error through validation --> 0.1800


### Metric validation

In [15]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Disabled experiment: fixed per-feature weights.
# weights_hardcode = [0.16, .07, 0.77]
# Reducer applied along axis 0 of the scaled validation features below and
# also handed to metric_val.
time_reduce_func = np.median

In [18]:
# Execute metric_val.py and load the names it defines into the notebook session.
# NOTE(review): presumably this is where metric_val (called in the next cell)
# comes from -- confirm against the script.
%run metric_val.py

In [19]:
# One run_xgb result per validation window; here the train-val rows are picked
# by metric_val instead of by the generator's second mask.
res_lst_metric = []

generator = eat_by_points_generator(country_df,
                                    n_points_in_val=val_params['n_points_in_val'],
                                    n_points_in_train_val=val_params['n_points_in_train_val'],
                                    starting_point=val_params['starting_point'],
                                    date_col=val_params['date_col'])

# The generator's train_val_mask is intentionally unused in this loop.
for (val_mask, train_val_mask) in generator:
    val = country_df[val_mask]
    # Explicit copy: the flag column is added and written below, and
    # Splitter.split() later resets this frame's index in place.
    train = country_df[~val_mask].copy()
    train['is_train_val'] = False

    # Scale features with ranges learned on the training rows only.
    scaler = MinMaxScaler().fit(train[config['train_cols']])

    # Collapse the validation window to one feature vector with time_reduce_func.
    val_scaled = scaler.transform(val[config['train_cols']])
    val_sc_mean = time_reduce_func(val_scaled, 0)
    #     input_vector *= weights_hardcode

    train_scaled = scaler.transform(train[config['train_cols']])
    #     train_sc_weighted *= weights_hardcode

    # NOTE(review): train_val_point is used as the exclusive end position of the
    # train-val window inside `train` -- confirm against metric_val.
    train_val_point, _ = metric_val(val_sc_mean.reshape(1, -1),
                                    train_scaled,
                                    val_params['n_points_in_train_val'],
                                    time_reduce_func)

    start_of_train_val = train_val_point - val_params['n_points_in_train_val']
    # Write through one positional indexer. The chained form
    # train['is_train_val'].iloc[...] = True may modify a temporary and is a
    # no-op under pandas Copy-on-Write.
    flag_pos = train.columns.get_loc('is_train_val')
    train.iloc[start_of_train_val:train_val_point, flag_pos] = True

    # Guards against a window that falls (partly) outside the training rows.
    assert train['is_train_val'].sum() == val_params['n_points_in_train_val']

    res = run_xgb(train=train,
                  val=val,
                  train_cols=config['train_cols'],
                  train_labels=train[config['label_col']],
                  val_labels=val[config['label_col']],
                  booster_params=config['booster_params'],
                  train_params=config['train_params'],
                  log_params=config['log_params'],
                  kfold=Splitter(),
                  metric=mean_absolute_error)

    res_lst_metric.append(res)

Val-part starts with --> 2021-08-12
../data/result/02-23-16
[0]	train-rmse:4.59332	val-rmse:4.68581
[100]	train-rmse:0.27718	val-rmse:0.24201
[200]	train-rmse:0.09916	val-rmse:0.06308
[259]	train-rmse:0.08488	val-rmse:0.06118
Val-part starts with --> 2021-08-22
../data/result/02-23-16
[0]	train-rmse:4.59473	val-rmse:4.66237
[100]	train-rmse:0.27696	val-rmse:0.23364
[200]	train-rmse:0.10018	val-rmse:0.06899
[205]	train-rmse:0.09843	val-rmse:0.06905
Val-part starts with --> 2021-09-01
../data/result/02-23-16
[0]	train-rmse:4.59643	val-rmse:4.62227
[100]	train-rmse:0.27562	val-rmse:0.23317
[172]	train-rmse:0.10967	val-rmse:0.14029
Val-part starts with --> 2021-09-11
../data/result/02-23-16
[0]	train-rmse:4.59404	val-rmse:4.72902
[100]	train-rmse:0.27581	val-rmse:0.26703
[200]	train-rmse:0.09858	val-rmse:0.07452
[235]	train-rmse:0.08731	val-rmse:0.07394
Val-part starts with --> 2021-09-22
../data/result/02-23-16
[0]	train-rmse:4.56683	val-rmse:5.03864
[100]	train-rmse:0.27325	val-rmse:0.41

In [20]:
# Print the metric-based mean first, then the simple one, for comparison.
for caption, errors in (('Mean error through validation --> {:.4f}', res_lst_metric),
                        ('Mean error through validation simple--> {:.4f}', res_lst_simple)):
    print(caption.format(np.mean(errors)))

Mean error through validation --> 0.1672
Mean error through validation simple--> 0.1800
