In [18]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from dateutil.relativedelta import relativedelta
from run_xgb.check_importance import check_importance
from run_xgb.reads import read_config
from run_xgb.run import run_xgb
from scipy import stats
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
# Print numpy floats with three digits of precision.
np.set_printoptions(precision=3)

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas' SettingWithCopyWarning.
warnings.filterwarnings('ignore')

In [19]:
# Load the pickled input frame (path is relative to the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
df = pd.read_pickle('../data/input/owid-covid-data.pickle')
# Project helper from run_xgb.reads; later cells read the keys 'train_cols',
# 'label_col', 'booster_params', 'train_params' and 'log_params' from its result.
config = read_config()

In [20]:
# Rows missing either the vaccination rate or the daily death rate are unusable:
# flag every row where at least one of the two columns is null, then drop them.
isnull_vacc_or_death = df[
    ['people_fully_vaccinated_per_hundred', 'new_deaths_per_million']
].isnull().any(axis=1)

df = df.loc[~isnull_vacc_or_death]
print(df.shape)

(37603, 67)


In [21]:
def get_tomorrow_label(df, config):
    """Attach the next row's death rate as the prediction target, per country.

    For every distinct ``iso_code`` the rows are taken in their existing order,
    ``new_deaths_per_million`` is shifted up by one row into a new column
    ``new_deaths_per_million_tomorrow``, and the last row of each country
    (which has no following row) is dropped.

    NOTE(review): "tomorrow" is only correct if rows are already sorted by date
    within each country and have no gaps — this function does not check either.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``iso_code`` and ``new_deaths_per_million``.
        It is not modified.
    config : dict
        Accepted for backward compatibility; not used. (The original code built
        a column subset from ``config`` and immediately discarded it.)

    Returns
    -------
    pandas.DataFrame
        All input columns plus ``new_deaths_per_million_tomorrow``, countries
        concatenated in order of first appearance, with a fresh 0..n-1 index.
    """
    label_name = 'new_deaths_per_million_tomorrow'

    df_lst = []
    for country in df['iso_code'].unique():
        # Explicit copy: writing a new column onto a filtered slice of `df` is
        # chained assignment (SettingWithCopyWarning, unreliable under copy-on-write).
        country_df = df[df['iso_code'] == country].copy()
        country_df[label_name] = country_df['new_deaths_per_million'].shift(-1).values
        # The final row has no successor, so its label is NaN — drop it.
        df_lst.append(country_df.iloc[:-1])

    if not df_lst:
        # pd.concat([]) raises; return an empty frame with the expected schema instead.
        res_df = df.iloc[:0].copy()
        res_df[label_name] = pd.Series(dtype='float64')
        return res_df.reset_index(drop=True)

    res_df = pd.concat(df_lst)
    res_df.reset_index(drop=True, inplace=True)
    return res_df

In [22]:
# Build the next-row target frame and parse its 'date' column to datetimes
# in a single chained expression.
shift_df = get_tomorrow_label(df, config).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

print(shift_df.shape)

(37389, 68)


In [23]:
# Show the ten iso_code values with the most rows left after filtering and shifting.
shift_df['iso_code'].value_counts().head(10)

OWID_HIC    427
OWID_WRL    427
OWID_NAM    427
USA         427
ISR         421
OWID_ASI    421
OWID_EUR    419
CAN         417
CHE         416
OWID_EUN    413
Name: iso_code, dtype: int64

In [24]:
# True for rows where at least one training feature is missing.
# NOTE(review): this mask is not used in the cells visible here.
is_null_mask = shift_df[config['train_cols']].isnull().any(axis=1)
# Restrict the experiment to a single country (hard-coded iso_code 'RUS').
is_country = shift_df['iso_code'].eq('RUS')

country_df = shift_df.loc[is_country]
print(country_df.shape)

(305, 68)


### Last time validation

In [25]:
# Execute time_val.py and import its top-level names into the notebook namespace.
# NOTE(review): `eat_by_points_generator`, used below, is not defined or imported
# anywhere else in this notebook, so it presumably comes from this script — confirm.
%run time_val.py

In [26]:
class Splitter():
    """Single-fold splitter driven by a boolean ``is_train_val`` column.

    It is passed to ``run_xgb`` as the ``kfold`` argument. Instead of creating
    folds itself, it reads the column the caller has already filled in.
    """

    def __init__(self):
        # Exactly one (fit, hold-out) pair is ever produced.
        self.n_splits = 1

    def split(self, df):
        """Yield one ``(fit_positions, holdout_positions)`` pair of integer arrays.

        Side effect: ``df`` is re-indexed IN PLACE to 0..n-1 and its previous
        index is kept as a new leading column, so the yielded values are both
        labels and positions in the mutated frame.
        NOTE(review): the caller (run_xgb) may depend on this mutation — confirm
        before making this method side-effect free.
        """
        df.reset_index(inplace=True)

        flags = df['is_train_val']
        fit_rows = (flags == False).to_numpy()
        holdout_rows = (flags == True).to_numpy()

        yield (df.index[fit_rows].values, df.index[holdout_rows].values)

In [27]:
# Settings shared by both validation loops below. All four are forwarded to
# eat_by_points_generator; n_points_in_train_val is also the length of the
# early-stopping slice chosen in the metric-based loop.
val_params = dict(
    n_points_in_val=10,
    n_points_in_train_val=10,
    starting_point=150,
    date_col='date',
)

In [28]:
# Baseline validation: for every window produced by eat_by_points_generator,
# train on the rows outside the window, using the generator's own
# `train_val_mask` rows as the inner hold-out handed to run_xgb via Splitter.
res_lst_simple = []

generator = eat_by_points_generator(country_df,
                                    n_points_in_val=val_params['n_points_in_val'],
                                    n_points_in_train_val=val_params['n_points_in_train_val'],
                                    starting_point=val_params['starting_point'],
                                    date_col=val_params['date_col'])

for (val_mask, train_val_mask) in generator:
    val = country_df[val_mask]
    # Explicit copy: `train` gets a new column below, and writing to a filtered
    # slice of `country_df` is chained assignment (SettingWithCopyWarning,
    # unreliable under pandas copy-on-write).
    train = country_df[~val_mask].copy()
    train['is_train_val'] = False
    # Label-based assignment: the mask is aligned to `train` by index.
    train.loc[train_val_mask, 'is_train_val'] = True

    res = run_xgb(train=train,
                  val=val,
                  train_cols=config['train_cols'],
                  train_labels=train[config['label_col']],
                  val_labels=val[config['label_col']],
                  booster_params=config['booster_params'],
                  train_params=config['train_params'],
                  log_params=config['log_params'],
                  kfold=Splitter(),
                  metric=mean_absolute_error)

    res_lst_simple.append(res)

Val-part starts with --> 2021-08-12
../data/result/02-18-16
[0]	train-rmse:4.59321	val-rmse:4.63587
[100]	train-rmse:0.27704	val-rmse:0.24692
[189]	train-rmse:0.10153	val-rmse:0.12210
Val-part starts with --> 2021-08-22
../data/result/02-18-16
[0]	train-rmse:4.59136	val-rmse:4.67085
[100]	train-rmse:0.27780	val-rmse:0.25889
[200]	train-rmse:0.09850	val-rmse:0.10000
[299]	train-rmse:0.07622	val-rmse:0.09865
Val-part starts with --> 2021-09-01
../data/result/02-18-16
[0]	train-rmse:4.59000	val-rmse:4.70162
[100]	train-rmse:0.27780	val-rmse:0.27137
[200]	train-rmse:0.10186	val-rmse:0.09441
[299]	train-rmse:0.08027	val-rmse:0.08917
Val-part starts with --> 2021-09-11
../data/result/02-18-16
[0]	train-rmse:4.59221	val-rmse:4.68226
[100]	train-rmse:0.27640	val-rmse:0.25994
[181]	train-rmse:0.10562	val-rmse:0.09011
Val-part starts with --> 2021-09-22
../data/result/02-18-16
[0]	train-rmse:4.57744	val-rmse:4.66570
[100]	train-rmse:0.27469	val-rmse:0.27098
[200]	train-rmse:0.09595	val-rmse:0.11

In [29]:
# Average the per-window results collected in res_lst_simple.
# NOTE(review): assumes run_xgb returns a numeric score (presumably the
# mean_absolute_error passed as `metric`) — confirm in run_xgb.
print('Mean error through validation --> {:.4f}'.format(np.mean(res_lst_simple)))

Mean error through validation --> 0.1931


### Metric validation

In [30]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.preprocessing import MinMaxScaler

In [31]:
# weights_hardcode = [0.16, .07, 0.77]

In [32]:
# Execute metric_val.py and import its top-level names into the notebook namespace.
# NOTE(review): the `metric_val(...)` function called below is not defined or
# imported anywhere else in this notebook, so it presumably comes from this script — confirm.
%run metric_val.py

In [33]:
# Metric-based validation: same outer windows as the baseline loop, but the
# inner hold-out is the slice of `train` ending at the position returned by
# metric_val for the (scaled) median feature vector of the validation window.
res_lst_metric = []

generator = eat_by_points_generator(country_df,
                                    n_points_in_val=val_params['n_points_in_val'],
                                    n_points_in_train_val=val_params['n_points_in_train_val'],
                                    starting_point=val_params['starting_point'],
                                    date_col=val_params['date_col'])

for (val_mask, train_val_mask) in generator:
    val = country_df[val_mask]
    # Explicit copy so the flag column below is written to an independent frame
    # rather than to a filtered slice of `country_df`.
    train = country_df[~val_mask].copy()
    train['is_train_val'] = False

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = MinMaxScaler().fit(train[config['train_cols']])

    val_scaled = scaler.transform(val[config['train_cols']])
    # Despite the name, this is the per-feature MEDIAN of the validation window.
    val_sc_mean = np.median(val_scaled, 0)

    train_scaled = scaler.transform(train[config['train_cols']])
    # NOTE: per-feature weighting via `weights_hardcode` is disabled
    # (it was left commented out in the original cell).

    train_val_point, _ = metric_val(val_sc_mean.reshape(1, -1),
                                    train_scaled,
                                    val_params['n_points_in_train_val'])

    start_of_train_val = train_val_point - val_params['n_points_in_train_val']
    # One positional write on the frame itself. The original
    # `train['is_train_val'].iloc[a:b] = True` is chained assignment, which
    # silently does nothing under pandas copy-on-write.
    flag_col = train.columns.get_loc('is_train_val')
    train.iloc[start_of_train_val:train_val_point, flag_col] = True

    # Guards against an empty/short slice (e.g. a negative start position).
    assert train['is_train_val'].sum() >= val_params['n_points_in_train_val']

    res = run_xgb(train=train,
                  val=val,
                  train_cols=config['train_cols'],
                  train_labels=train[config['label_col']],
                  val_labels=val[config['label_col']],
                  booster_params=config['booster_params'],
                  train_params=config['train_params'],
                  log_params=config['log_params'],
                  kfold=Splitter(),
                  metric=mean_absolute_error)

    res_lst_metric.append(res)

Val-part starts with --> 2021-08-12
../data/result/02-18-16
[0]	train-rmse:4.59240	val-rmse:4.64741
[100]	train-rmse:0.27670	val-rmse:0.23484
[171]	train-rmse:0.10948	val-rmse:0.10272
Val-part starts with --> 2021-08-22
../data/result/02-18-16
[0]	train-rmse:4.59386	val-rmse:4.63552
[100]	train-rmse:0.27673	val-rmse:0.24456
[173]	train-rmse:0.10557	val-rmse:0.10959
Val-part starts with --> 2021-09-01
../data/result/02-18-16
[0]	train-rmse:4.59386	val-rmse:4.64023
[100]	train-rmse:0.27673	val-rmse:0.26384
[189]	train-rmse:0.09777	val-rmse:0.12062
Val-part starts with --> 2021-09-11
../data/result/02-18-16
[0]	train-rmse:4.56503	val-rmse:5.05236
[100]	train-rmse:0.27344	val-rmse:0.49822
[200]	train-rmse:0.09302	val-rmse:0.29305
[299]	train-rmse:0.07246	val-rmse:0.28181
Val-part starts with --> 2021-09-22
../data/result/02-18-16
[0]	train-rmse:4.56503	val-rmse:4.83512
[100]	train-rmse:0.27344	val-rmse:0.35736
[200]	train-rmse:0.09302	val-rmse:0.19715
[299]	train-rmse:0.07246	val-rmse:0.19

In [34]:
# Compare the two strategies: first line is the metric-based hold-out selection,
# second line is the baseline ("simple") selection from the earlier loop.
print('Mean error through validation --> {:.4f}'.format(np.mean(res_lst_metric)))
print('Mean error through validation simple--> {:.4f}'.format(np.mean(res_lst_simple)))

Mean error through validation --> 0.1923
Mean error through validation simple--> 0.1931
