In [1]:
# Install CatBoost into the runtime; the captured log below shows catboost-0.23 being installed.
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/b1/61/2b8106c8870601671d99ca94d8b8d180f2b740b7cdb95c930147508abcf9/catboost-0.23-cp36-none-manylinux1_x86_64.whl (64.7MB)
[K     |████████████████████████████████| 64.8MB 65kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.23


In [2]:
# My repo with data
# Clone the repository, then extract both feature archives into the current
# working directory (the log below shows each archive also unpacks a
# __MACOSX/ metadata entry).
!git clone https://github.com/Tixonmavrin/covid-19-solution
!unzip covid-19-solution/data/data_with_features/_data_with_features.csv.zip
!unzip covid-19-solution/data/data_with_features/_data_with_features_all.csv.zip

Cloning into 'covid-19-solution'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 248 (delta 31), reused 0 (delta 0), pack-reused 186[K
Receiving objects: 100% (248/248), 8.71 MiB | 20.75 MiB/s, done.
Resolving deltas: 100% (111/111), done.
Archive:  covid-19-solution/data/data_with_features/_data_with_features.csv.zip
  inflating: _data_with_features.csv  
  inflating: __MACOSX/.__data_with_features.csv  
Archive:  covid-19-solution/data/data_with_features/_data_with_features_all.csv.zip
  inflating: _data_with_features_all.csv  
  inflating: __MACOSX/.__data_with_features_all.csv  


In [0]:
import pandas as pd
import numpy as np
import warnings
import copy
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from sklearn import metrics
from sklearn import tree, ensemble
from sklearn.linear_model import HuberRegressor, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import random
pd.set_option('display.max_columns', 100)
warnings.filterwarnings("ignore")

In [0]:
# Load the feature table used by the rest of the notebook. The other extracted
# file is kept below, commented out, as an alternative input.
#data4 = pd.read_csv('_data_with_features.csv')
data4 = pd.read_csv('_data_with_features_all.csv')

In [0]:
# Normalise both location columns to plain strings (missing values become the
# literal 'nan'), parse the dates, and derive the day of year as an int16.
for location_col in ('Country/Region', 'Province/State'):
    data4[location_col] = data4[location_col].fillna('nan').astype('str')
data4['Date'] = pd.to_datetime(data4['Date'])
data4['day'] = data4['Date'].dt.dayofyear.astype(np.int16)

In [0]:
# Build one "<country>/<province>" key per location.
# (Optional experiment: drop the US first with
#  data4 = data4[data4['Country/Region'] != 'US'].)
country_part = data4['Country/Region'].fillna('')
province_part = data4['Province/State'].fillna('')
data4['place'] = country_part + '/' + province_part
places = data4['place'].unique()

# Cummax: force both cumulative counters to be non-decreasing within a place.
# (An earlier variant applied the running maximum to non-Russian rows only.)
for counter in ('Confirmed', 'Deaths'):
    data4.loc[:, counter] = data4.groupby("place")[counter].cummax()

In [7]:
def _keep_place(place, frame):
    """Decide whether one place's rows stay in the modelling set.

    Russian regional series (exactly one 'Russia' in the key and no comma) are
    always kept; any other place is kept only when its cumulative 'Confirmed'
    series takes more than 60 distinct values.
    """
    is_russian_region = ('Russia' in place) and (',' not in place) and (place.count('Russia') == 1)
    return is_russian_region or frame['Confirmed'].nunique() > 60


# One groupby pass instead of re-masking the whole frame for every place.
# sort=False keeps places in order of first appearance, matching
# data4['place'].unique(), and rows keep their original order within a place.
data4n = [
    frame
    for place, frame in tqdm(data4.groupby('place', sort=False))
    if _keep_place(place, frame)
]
data4 = pd.concat(data4n)

100%|██████████| 2178/2178 [00:32<00:00, 67.78it/s]


In [8]:
places = data4['place'].unique()

# Daily increase = difference between consecutive cumulative counts within a
# place. The first row of every place has no predecessor and keeps its
# cumulative value, as before.
# Written as a single column assignment: the previous chained form
# `data4[col][mask] = ...` does not write through under pandas Copy-on-Write.
confirmed = data4['Confirmed']
per_day = data4.groupby('place', sort=False)['Confirmed'].diff()
is_first_row = ~data4['place'].duplicated()
per_day[is_first_row] = confirmed[is_first_row]
# diff() yields floats; restore the dtype of the cumulative column.
data4['Confirmed per day'] = per_day.astype(confirmed.dtype)

100%|██████████| 313/313 [00:02<00:00, 131.74it/s]


In [0]:
def aggregate(data_agg, column, left_right, function, name=''):
    """Add a lagged rolling statistic of `column` as a new column.

    Parameters
    ----------
    data_agg : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to aggregate.
    left_right : sequence of two ints
        ``[nearest, farthest]`` offsets (in rows) of the look-back window. The
        statistic at row ``i`` covers rows ``i - farthest .. i - nearest``.
    function : str or callable
        Aggregation passed to ``Rolling.agg`` (e.g. ``'mean'``).
    name : str
        Label embedded in the new column name.

    Returns
    -------
    pandas.DataFrame
        A copy of `data_agg` with the extra column
        ``'{column}_{name}_{nearest}_{farthest}'``. The first ``nearest`` rows
        are 0, the rows whose window is still incomplete are NaN, and every
        other row holds the statistic.
    """
    nearest, farthest = left_right[0], left_right[1]
    column_name = '{}_{}_{}_{}'.format(column, name, nearest, farthest)

    data_agg_new = data_agg.copy()
    rolled = data_agg_new[column].rolling(farthest - nearest + 1).agg(function)

    # shift() replaces the chained slice assignment `df[col][k:] = tmp[:-k]`,
    # which does not write through under Copy-on-Write and breaks for k == 0.
    shifted = rolled.shift(nearest)
    # Keep the historical convention: rows before the first usable offset are 0.
    shifted.iloc[:nearest] = 0
    data_agg_new[column_name] = shifted
    return data_agg_new

def make_features(data_agg):
    """Build the lagged model features for one place's time series.

    The frame is expected to hold the rows of a single place in chronological
    order (this is how every visible caller uses it). It must contain
    'Confirmed', 'Confirmed per day', 'day', 'place', 'population' and the
    mobility columns listed below. Every feature only looks at earlier rows,
    so the function can be re-run after a forecast has been written into
    'Confirmed' / 'Confirmed per day'.

    Returns a new frame with a fresh 0..n-1 index; the input is not modified.
    """
    # Rolling statistics of the daily increase over [nearest, farthest] rows back.
    rolling_specs = [
        ([1, 1], 'mean'),
        ([1, 7], 'mean'),
        ([8, 14], 'mean'),
        ([15, 21], 'mean'),
        ([1, 7], 'max'),
        ([1, 7], 'min'),
        ([1, 7], 'median'),
        ([1, 4], 'median'),
        ([8, 14], 'median'),
    ]
    for window, stat in rolling_specs:
        data_agg = aggregate(data_agg, 'Confirmed per day', window, stat, stat)

    # 'day' minus the number of rows still below each case threshold, floored at 0.
    for thresh in [1, 10, 100]:
        days_under_thresh = (data_agg['Confirmed'] < thresh).sum()
        data_agg['days_from_{}'.format(thresh)] = np.maximum(
            data_agg['day'].values - days_under_thresh, 0
        )

    # Cumulative confirmed cases 1..13 rows back.
    for lag in range(1, 14):
        data_agg[f"lag_{lag}_cc"] = data_agg.groupby("place")["Confirmed"].shift(lag)

    data_agg["perc_1_cc"] = data_agg["lag_1_cc"] / data_agg.population

    # First differences of the lagged cumulative counts and their ratios.
    # The ratios may be inf/NaN when a denominator is 0; that is left as is.
    data_agg["diff_1_cc"] = data_agg["lag_1_cc"] - data_agg["lag_2_cc"]
    data_agg["diff_2_cc"] = data_agg["lag_2_cc"] - data_agg["lag_3_cc"]
    data_agg["diff_3_cc"] = data_agg["lag_3_cc"] - data_agg["lag_4_cc"]

    data_agg["diff_123_cc"] = (data_agg["lag_1_cc"] - data_agg["lag_4_cc"]) / 3

    data_agg["diff_change_1_cc"] = data_agg.diff_1_cc / data_agg.diff_2_cc
    data_agg["diff_change_2_cc"] = data_agg.diff_2_cc / data_agg.diff_3_cc

    data_agg["diff_change_12_cc"] = (data_agg.diff_change_1_cc + data_agg.diff_change_2_cc) / 2

    data_agg["change_1_cc"] = data_agg["lag_1_cc"] / data_agg["lag_2_cc"]
    data_agg["change_2_cc"] = data_agg["lag_2_cc"] / data_agg["lag_3_cc"]
    data_agg["change_3_cc"] = data_agg["lag_3_cc"] / data_agg["lag_4_cc"]

    data_agg["change_1_3_cc"] = data_agg["lag_1_cc"] / data_agg["lag_4_cc"]
    data_agg["change_1_7_cc"] = data_agg["lag_1_cc"] / data_agg["lag_8_cc"]

    data_agg = data_agg.reset_index(drop=True)

    # Running maximum of the daily increase over all EARLIER rows, the distance
    # to the row where it was first reached, and today's gap to it.
    # Done on NumPy arrays: per-cell .loc reads/writes dominated the runtime,
    # and this function is re-run after every forecast step.
    per_day = data_agg['Confirmed per day'].to_numpy(dtype=float)
    n_rows = len(per_day)
    day_from_max = np.zeros(n_rows, dtype=np.int64)
    max_value = np.zeros(n_rows, dtype=float)
    delta_with_max = np.zeros(n_rows, dtype=float)
    vmax = 0.0
    imax = 0
    for i in range(1, n_rows):
        previous = per_day[i - 1]
        if previous > vmax:  # strict '>' keeps the first occurrence; NaN never wins
            vmax = previous
            imax = i - 1
        delta_with_max[i] = per_day[i] - vmax
        max_value[i] = vmax
        day_from_max[i] = i - imax
    data_agg["day_from_max"] = day_from_max
    data_agg["max_value"] = max_value
    data_agg["delta_with_max"] = delta_with_max

    # delta_with_max uses the current row, so the model features are its 1-row lag.
    for source in ("day_from_max", "max_value", "delta_with_max"):
        data_agg[source + "_1"] = data_agg.groupby("place")[source].shift(1)

    # Mobility / isolation indicators as they were 8 rows earlier.
    mobility_columns = [
        "apl-driving",
        "apl-transit",
        "apl-walking",
        "grocery-and-pharmacy",
        "parks",
        "residential",
        "retail-and-recreation",
        "transit-stations",
        "workplaces",
        "isolation",
    ]
    for source in mobility_columns:
        data_agg[source + "_8"] = data_agg.groupby("place")[source].shift(8)

    # Doubled and squared daily increase from 7 rows back.
    week_ago = data_agg['Confirmed per day'].shift(7)
    data_agg['m2_7'] = week_ago*2
    data_agg['s2_7'] = week_ago**2

    return data_agg

In [10]:
# Run the feature engineering independently for every place (each on its own
# 0-based index), then stack the per-place frames into one table.
per_place_frames = [
    make_features(data4[data4['place'] == place].reset_index(drop=True))
    for place in tqdm(places[:])
]
data5 = pd.concat(per_place_frames).reset_index(drop=True)

100%|██████████| 313/313 [01:08<00:00,  4.57it/s]


In [11]:
# Inclusive date windows for validation and test.
MIN_VALID = '2020-04-20'
MAX_VALID = '2020-04-26'
MIN_TEST = '2020-04-27'
MAX_TEST = '2020-05-03'

data5['Date'] = pd.to_datetime(data5['Date'])
data5.columns = ['Country/Region', 'Province/State', 'Date', 'Confirmed', 'Deaths'] + list(data5.columns[5:])

valid_start, valid_end = pd.to_datetime(MIN_VALID), pd.to_datetime(MAX_VALID)
test_start, test_end = pd.to_datetime(MIN_TEST), pd.to_datetime(MAX_TEST)
dates = data5['Date']

# Each training set is everything strictly before its evaluation window.
traint = data5[dates < test_start]
test = data5[dates.between(test_start, test_end)]
trainv = data5[dates < valid_start]
valid = data5[dates.between(valid_start, valid_end)]
# Optional: add noise to the target
#traint['Confirmed'] = traint['Confirmed'].apply(lambda x: x + random.random())
traint.sample(3)

Unnamed: 0,Country/Region,Province/State,Date,Confirmed,Deaths,Health_GDP,Health_USD,Physicians,Nurse,Age_old/new,Smoking,tests,testpop,gatheringlimit,hospibed,healthperpop,density_n,fertility_rate,land_area,median_age,migrants,population_coun,urban_pop_rate_c,world_share,Start_date,population,population_urban,population_rural,urban_pop_rate,lat,lon,continent,area,apl-driving,apl-transit,apl-walking,grocery-and-pharmacy,parks,residential,retail-and-recreation,transit-stations,workplaces,isolation,federal_district,geoname_name,global_vec_0,global_vec_1,global_vec_2,global_vec_3,global_vec_4,...,Confirmed per day_median_1_7,Confirmed per day_median_1_4,Confirmed per day_median_8_14,days_from_1,days_from_10,days_from_100,lag_1_cc,lag_2_cc,lag_3_cc,lag_4_cc,lag_5_cc,lag_6_cc,lag_7_cc,lag_8_cc,lag_9_cc,lag_10_cc,lag_11_cc,lag_12_cc,lag_13_cc,perc_1_cc,diff_1_cc,diff_2_cc,diff_3_cc,diff_123_cc,diff_change_1_cc,diff_change_2_cc,diff_change_12_cc,change_1_cc,change_2_cc,change_3_cc,change_1_3_cc,change_1_7_cc,day_from_max,max_value,delta_with_max,day_from_max_1,max_value_1,delta_with_max_1,apl-driving_8,apl-transit_8,apl-walking_8,grocery-and-pharmacy_8,parks_8,residential_8,retail-and-recreation_8,transit-stations_8,workplaces_8,isolation_8,m2_7,s2_7
32475,Russia,Tver oblast,2020-02-14,0,0,5.3,469.1,4.0,8.6,1.032643,39.3,116061.0,1257.394491,33.482838,8.2,9.689281,9.0,1.8,16376870.0,40.0,182456.0,145934462.0,0.74,0.0187,88.0,1260345.0,959753.0,300592.0,0.7615,60.0,90.0,Europe,17098246.0,124.25,72.823825,128.39,,,,,,,2.163219,,,0.275142,0.021197,0.045999,0.287714,0.083187,...,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,23,0.0,0.0,22.0,0.0,0.0,108.56,72.823825,104.71,,,,,,,2.163219,0.0,0.0
12070,Luxembourg,Luxembourg,2020-04-17,3480,0,6.2,6271.4,3.0,12.3,0.977199,23.5,40218.117147,10403.318753,33.482838,4.8,10195.24648,242.0,1.5,2590.0,40.0,9741.0,625978.0,0.88,0.0001,78.0,1753027.0,1312777.8,440249.282353,0.709728,,,,,33.99,18.73,44.11,,,,,,,2.163219,,,0.309397,0.064277,0.09168,0.29504,0.160987,...,47.0,40.5,81.0,70,58,53,3444.0,3373.0,3307.0,3292.0,3281.0,3270.0,3223.0,3115.0,3034.0,2970.0,2843.0,2804.0,2729.0,0.001965,71.0,66.0,15.0,50.666667,1.075758,4.4,2.737879,1.02105,1.019958,1.004557,1.046173,1.105618,23,234.0,-198.0,22.0,234.0,-163.0,30.05,21.08,47.32,,,,,,,2.163219,216.0,11664.0
26258,Russia,Kamchatskiy kray,2020-03-22,0,0,5.3,469.1,4.0,8.6,1.032643,39.3,116061.0,1257.394491,33.482838,8.2,9.689281,9.0,1.8,16376870.0,40.0,182456.0,145934462.0,0.74,0.0187,88.0,312438.0,245128.0,67310.0,0.784565,60.0,90.0,Europe,17098246.0,90.74,72.823825,76.79,,,,,,,3.3,Дальневосточный,Kamchatka,0.309397,0.064277,0.09168,0.29504,0.160987,...,0.0,0.0,0.0,7,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,60,0.0,0.0,59.0,0.0,0.0,108.95,72.823825,113.78,,,,,,,3.0,0.0,0.0


---

In [0]:
# Candidate feature columns. Entries that are commented out are excluded
# (identifiers, targets, categorical columns, and the un-lagged mobility and
# running-max columns whose lagged versions appear further down).
# NOTE(review): no visible cell passes this list to a model; the training cells
# use their own shorter per-model feature lists.
features = [
 #'Country/Region',
 #'Province/State',
 #'Date',
 #'Confirmed',
 #'Deaths',
 'Health_GDP',
 'Health_USD',
 'Physicians',
 'Nurse',
 'Age_old/new',
 'Smoking',
 'tests',
 'testpop',
 'gatheringlimit',
 'hospibed',
 'healthperpop',
 'density_n',
 'fertility_rate',
 'land_area',
 'median_age',
 'migrants',
 'population_coun',
 'urban_pop_rate_c',
 'world_share',
 'Start_date',
 'population',
 'population_urban',
 'population_rural',
 'urban_pop_rate',
 'lat',
 'lon',
 #'continent',
 'area',
 #'apl-driving',
 #'apl-transit',
 #'apl-walking',
 #'grocery-and-pharmacy',
 #'parks',
 #'residential',
 #'retail-and-recreation',
 #'transit-stations',
 #'workplaces',
 #'isolation',
 #'federal_district',
 #'geoname_name',
 'global_vec_0',
 'global_vec_1',
 'global_vec_2',
 'global_vec_3',
 'global_vec_4',
 'global_vec_5',
 'global_vec_6',
 'global_vec_7',
 'global_vec_8',
 'global_vec_9',
 'global_vec_10',
 'global_vec_11',
 'global_vec_12',
 'global_vec_13',
 'global_vec_14',
 'global_vec_15',
 'global_vec_16',
 'global_vec_17',
 'global_vec_18',
 'global_vec_19',
 'global_vec_20',
 'global_vec_21',
 'global_vec_22',
 'global_vec_23',
 'global_vec_24',
 'global_vec_25',
 'global_vec_26',
 'global_vec_27',
 'region_vec_0',
 'region_vec_1',
 'region_vec_2',
 'region_vec_3',
 'region_vec_4',
 'region_vec_5',
 'region_vec_6',
 'region_vec_7',
 'region_vec_8',
 'region_vec_9',
 'region_vec_10',
 'region_vec_11',
 'region_vec_12',
 'region_vec_13',
 'region_vec_14',
 'region_vec_15',
 'region_vec_16',
 'region_vec_17',
 'region_vec_18',
 'region_vec_19',
 'region_vec_20',
 'region_vec_21',
 'region_vec_22',
 'region_vec_23',
 'region_vec_24',
 'region_vec_25',
 'region_vec_26',
 'region_vec_27',
 'day',
 #'place',
 #'Confirmed per day',
 'Confirmed per day_mean_1_1',
 'Confirmed per day_mean_1_7',
 'Confirmed per day_mean_8_14',
 'Confirmed per day_mean_15_21',
 'Confirmed per day_max_1_7',
 'Confirmed per day_min_1_7',
 'Confirmed per day_median_1_7',
 # NOTE(review): the six ampl*/power* names below are not produced by
 # make_features as defined in this notebook; selecting them would raise a
 # KeyError unless the input CSV already provides them - confirm.
 'Confirmed per day_ampl_plus_1_7',
 'Confirmed per day_ampl_minus_1_7',
 'Confirmed per day_ampl2_plus_1_7',
 'Confirmed per day_ampl2_minus_1_7',
 'Confirmed per day_power_full_1_7',
 'Confirmed per day_power_med_1_7',
 'days_from_1',
 'days_from_10',
 'days_from_100',
 'lag_1_cc',
 'lag_2_cc',
 'lag_3_cc',
 'lag_4_cc',
 'lag_5_cc',
 'lag_6_cc',
 'lag_7_cc',
 'lag_8_cc',
 'lag_9_cc',
 'lag_10_cc',
 'lag_11_cc',
 'lag_12_cc',
 'lag_13_cc',
 'perc_1_cc',
 'diff_1_cc',
 'diff_2_cc',
 'diff_3_cc',
 'diff_123_cc',
 'diff_change_1_cc',
 'diff_change_2_cc',
 'diff_change_12_cc',
 'change_1_cc',
 'change_2_cc',
 'change_3_cc',
 'change_1_3_cc',
 'change_1_7_cc',
 #'day_from_max',
 #'max_value',
 #'delta_with_max',
 'day_from_max_1',
 'max_value_1',
 'delta_with_max_1',
 'apl-driving_8',
 'apl-transit_8',
 'apl-walking_8',
 'grocery-and-pharmacy_8',
 'parks_8',
 'residential_8',
 'retail-and-recreation_8',
 'transit-stations_8',
 'workplaces_8',
 'isolation_8',
 'm2_7',
 's2_7']

In [0]:
def get_result(model_c, features, model_name=''):
  """Forecast the 7 test days with one model, write the CSVs and print the loss.

  For every place whose key contains 'Ru', the model predicts one day at a
  time; each prediction is written back into the series and the features are
  rebuilt before the next day is predicted.

  Parameters
  ----------
  model_c : fitted model exposing ``predict``; it must have been trained on
      ``log(daily increase + 1)``, because the prediction is inverted with
      ``exp(...) - 1``.
  features : list of str
      Feature columns fed to the model, in training order.
  model_name : str
      'catboost' and 'adaboost' select the input shape those models were given
      in this notebook; any other value passes the feature row as a Series.

  Side effects: reads the module-level ``traint``, ``test``, ``MIN_TEST`` and
  ``MAX_TEST``; overwrites ``result.csv`` and ``submission.csv`` under
  ``covid-19-solution/data/result``; sets ``pd.options.display.max_rows``;
  prints the mean per-region loss. Returns None.
  """
  max_train = traint['day'].max()
  # DataFrame.append was removed in pandas 2.0; concat is the exact equivalent.
  traintest = pd.concat([traint, test])
  predictions_final = []
  places2 = [p for p in traintest['place'].unique() if 'Ru' in p]

  for place in tqdm(places2):
    df_traintest = traintest[traintest['place'] == place].reset_index(drop=True)
    # Forecasts are fractional, so make both target columns float before
    # writing into them (avoids per-cell dtype upcasts).
    df_traintest = df_traintest.astype({'Confirmed': float, 'Confirmed per day': float})
    count_know = (df_traintest['day'] <= max_train).sum()
    for j in range(7):
      row = j + count_know
      feature_row = df_traintest[features].iloc[row]
      if model_name == 'catboost':
        X_valid = feature_row.values
      elif model_name == 'adaboost':
        X_valid = [feature_row.values]
      else:
        X_valid = feature_row
      # Invert the log(y + 1) training transform; predict() yields one value.
      res_c = float(np.ravel(np.exp(model_c.predict(X_valid)) - 1)[0])

      # .loc writes through to the frame; chained `df[col][row] = ...` does
      # not under pandas Copy-on-Write.
      df_traintest.loc[row, 'Confirmed per day'] = res_c
      df_traintest.loc[row, 'Confirmed'] = df_traintest.loc[row - 1, 'Confirmed'] + res_c
      # Rebuild the lag features so the next day sees this forecast.
      df_traintest = make_features(df_traintest)
    # Despite its name, this column is the cumulative sum of the daily values.
    df_traintest['Confirmed per day pred'] = np.cumsum(df_traintest['Confirmed per day'].values)
    predictions_final.append(df_traintest)
  predictions_final = pd.concat(predictions_final)

  # Keep genuine Russian regions only (no country-level row, no composite names).
  province = predictions_final['Province/State']
  is_russian_region = (
      (predictions_final['Country/Region'] == 'Russia')
      & (province != 'nan')
      & province.apply(lambda x: (',' not in str(x)) and ('Russia' not in str(x)))
  )
  predicts = predictions_final[is_russian_region][['Province/State', 'Date', 'Confirmed per day pred']]
  predicts = predicts.merge(test[['Province/State', 'Date', 'Confirmed', 'Deaths']], how='left', on=['Province/State', 'Date'])
  # Last 7 calendar days = the forecast window.
  predicts2 = predicts[predicts['Date'] >= predicts['Date'].max() - pd.DateOffset(6)]
  # .copy() so the dtype change below does not act on a slice of `predicts2`.
  predicts3 = predicts2[['Province/State', 'Date', 'Confirmed per day pred']].copy()
  predicts3['Confirmed per day pred'] = predicts3['Confirmed per day pred'].astype('int64')
  predicts3.to_csv('covid-19-solution/data/result/result.csv')

  # Translate province names into the region codes used by the solution file.
  russia_regions = pd.read_csv('covid-19-solution/data/features/russia_regions.csv')[['iso_code', 'csse_province_state']].rename(columns={'csse_province_state':'Province/State', 'iso_code':'country'})
  predicts3 = predicts3.merge(russia_regions, how='left', on='Province/State')
  del predicts3['Province/State']
  predicts3 = predicts3.rename(columns={'Date':'date','Confirmed per day pred':'prediction_confirmed'})

  predicts3.to_csv('covid-19-solution/data/result/submission.csv')
  your_submission_file = 'covid-19-solution/data/result/submission.csv'

  # ---- score the submission against the solution file ----
  pd.options.display.max_rows = 1000

  from_date = MIN_TEST
  to_date = MAX_TEST

  all_country = ['RU-AD','RU-AL','RU-ALT','RU-AMU','RU-ARK','RU-AST','RU-BA',
                'RU-BEL','RU-BRY','RU-BU','RU-CE','RU-CHE','RU-CHU','RU-CU',
                'RU-DA','RU-IN','RU-IRK','RU-IVA','RU-KAM','RU-KB','RU-KC',
                'RU-KDA','RU-KEM','RU-KGD','RU-KGN','RU-KHA','RU-KHM','RU-KIR',
                'RU-KK','RU-KL','RU-KLU','RU-KO','RU-KOS','RU-KR','RU-KRS',
                'RU-KYA','RU-LEN','RU-LIP','RU-MAG','RU-ME','RU-MO','RU-MOS',
                'RU-MOW','RU-MUR','RU-NEN','RU-NGR','RU-NIZ','RU-NVS','RU-OMS',
                'RU-ORE','RU-ORL','RU-PER','RU-PNZ','RU-PRI','RU-PSK','RU-ROS',
                'RU-RYA','RU-SA','RU-SAK','RU-SAM','RU-SAR','RU-SE','RU-SMO',
                'RU-SPE','RU-STA','RU-SVE','RU-TA','RU-TAM','RU-TOM','RU-TUL',
                'RU-TVE','RU-TY','RU-TYU','RU-UD','RU-ULY','RU-VGG','RU-VLA',
                'RU-VLG','RU-VOR','RU-YAN','RU-YAR','RU-YEV','RU-ZAB','UA-40','UA-43']

  df_solution = pd.read_csv('covid-19-solution/data/check_submission/solution_file.csv', parse_dates=['date'])
  df_solution = df_solution[
      (df_solution['date'] >= str(from_date)) &
      (df_solution['date'] <= str(to_date))
  ]
  df_solution.set_index(['country', 'date'], inplace=True)

  df_submission = pd.read_csv(your_submission_file, parse_dates=['date'])
  df_columns = [i.lower() for i in df_submission.columns]
  # If the file carries both 'region' and 'country', 'region' is used as the key.
  if 'region' in df_columns and 'country' in df_columns:
      df_submission.drop(columns=['country'], inplace=True)
  df_submission.rename(columns={'region': 'country'}, inplace=True)
  df_submission = df_submission[
      (df_submission['date'] >= str(from_date)) &
      (df_submission['date'] <= str(to_date))
  ]
  df_submission.set_index(['country', 'date'], inplace=True)

  df_join = df_solution.join(df_submission, how='left', lsuffix='_solution')
  # Missing values on either side count as 0. Assigned back explicitly:
  # `df[col].fillna(..., inplace=True)` does not update df under Copy-on-Write.
  score_columns = ['prediction_confirmed_solution', 'prediction_confirmed']
  df_join[score_columns] = df_join[score_columns].fillna(0)

  def mean_absolute_logarithmic_error(df_actual, df_predicted):
      """
      Mean Absolute Logarithmic Error

                |        predicted + 1  |
      LOSS = SUM |log10 ---------------- |
                |          actual + 1   |

      """
      # Builtin float: the np.float alias was removed in NumPy 1.24.
      actual = df_actual.astype(float).values[:] + 1
      predicted = df_predicted.astype(float).values[:] + 1
      return sum(abs(np.log10(predicted) - np.log10(actual)))


  contry_loss = []
  for country in all_country:
      sc = mean_absolute_logarithmic_error(
          df_join.loc[country]['prediction_confirmed_solution'],
          df_join.loc[country]['prediction_confirmed']
      )
      contry_loss.append(sc)

  print('Loss - ', np.mean(contry_loss))

In [0]:
# Three LightGBM configurations; all use the same feature subset (each entry
# gets its own list so the configurations stay independently editable).
lightgbm_params = [
    {'boosting_type': 'gbdt', 'objective': 'rmse', 'metric': 'rmse', 'max_depth': 3, 'learning_rate': 0.1},
    {'num_boost_round': 200,'learning_rate': 0.05,'max_depth': 3,},
    {'num_boost_round': 250,'learning_rate': 0.04,'max_depth': 3,},
]
lightgbm_features = [
    ['lat', 'lon', 'Confirmed per day_mean_1_1','Confirmed per day_mean_1_7','Confirmed per day_mean_8_14','Confirmed per day_mean_15_21','days_from_100','Confirmed per day_median_1_7',]
    for _ in lightgbm_params
]
lightgbm_models = []

# Target: log(daily increase + 1), with negative increases clipped to 0.
target_column = 'Confirmed per day'
for params, feature_names in zip(lightgbm_params, lightgbm_features):
  X = traint[feature_names]
  y = np.log(traint[target_column].values.clip(0, 1e10)+1)
  dataset = lgb.Dataset(X, label=y, categorical_feature=[])
  model = lgb.train(params, dataset)
  lightgbm_models.append(model)

In [15]:
def _xgboost_config(max_depth, min_child_weight):
    """Return one XGBRegressor parameter set; only depth and child weight vary."""
    return {
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        # 'verbosity': 1 replaces the deprecated 'silent': False.
        'verbosity': 1,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'learning_rate': 0.07,
        'colsample_bylevel': 0.2,
    }


xgboost_params = [
    _xgboost_config(max_depth=4, min_child_weight=10),
    _xgboost_config(max_depth=5, min_child_weight=10),
    _xgboost_config(max_depth=4, min_child_weight=12),
]
# Same feature subset for every configuration (one independent list each).
xgboost_features = [
    ['lat', 'lon', 'Confirmed per day_mean_1_1','Confirmed per day_mean_1_7','Confirmed per day_mean_8_14','Confirmed per day_mean_15_21','days_from_100','Confirmed per day_median_1_7',]
    for _ in xgboost_params
]
xgboost_models = []

for params, feature_names in zip(xgboost_params, xgboost_features):
  X = traint[feature_names]
  # Target: log(daily increase + 1), with negative increases clipped to 0.
  y = np.log(traint[target_column].values.clip(0, 1e10)+1)
  model = xgb.XGBRegressor(**params)
  model = model.fit(X.values, y)
  xgboost_models.append(model)



In [0]:
# Three CatBoost configurations that differ only in tree depth.
catboost_params = [
    {'depth': depth,  'l2_leaf_reg': 1, 'verbose':False}
    for depth in (10, 12, 8)
]
catboost_features = [
    ['lat', 'lon', 'Confirmed per day_mean_1_1','Confirmed per day_mean_1_7','Confirmed per day_mean_8_14','Confirmed per day_mean_15_21','days_from_100','Confirmed per day_median_1_7',]
    for _ in catboost_params
]
catboost_models = []

for params, feature_names in zip(catboost_params, catboost_features):
  X = traint[feature_names]
  # Target: log(daily increase + 1), with negative increases clipped to 0.
  y = np.log(traint[target_column].values.clip(0, 1e10)+1)
  model = ctb.CatBoostRegressor(**params)
  model = model.fit(X, y)
  catboost_models.append(model)

In [0]:
# AdaBoost over decision trees. No configuration is registered, so the loop
# below currently trains nothing; append to both lists to enable it.
adaboost_params = []
adaboost_features = []
adaboost_models = []

for tree_params, feature_names in zip(adaboost_params, adaboost_features):
  X = traint[feature_names].fillna(0).values
  y = np.log(traint[target_column].values.clip(0, 1e10)+1)
  base_tree = tree.DecisionTreeRegressor(**tree_params)
  model = ensemble.AdaBoostRegressor(base_estimator=base_tree)
  model = model.fit(X, y)
  adaboost_models.append(model)

In [0]:
# Random forest. No configuration is registered, so the loop below currently
# trains nothing; append to both lists to enable it.
randomforest_params = []
randomforest_features = []
randomforest_models = []

for params, feature_names in zip(randomforest_params, randomforest_features):
  X = traint[feature_names].fillna(0).values
  y = np.log(traint[target_column].values.clip(0, 1e10)+1)
  model = RandomForestRegressor(**params)
  model = model.fit(X, y)
  randomforest_models.append(model)

In [19]:
#single model
# Forecast and score with the third LightGBM configuration only; get_result
# writes the result/submission CSVs and prints the loss.
get_result(lightgbm_models[2], lightgbm_features[2])

 10/86  Russia/Chukotskiy autonomous oblast, len known: 96 (103, 160)
 20/86  Russia/Kemerovo oblast, len known: 96 (103, 160)
 30/86  Russia/Magadan oblast, len known: 96 (103, 160)
 40/86  Russia/Orenburg oblast, len known: 96 (103, 160)
 50/86  Russia/Republic of Crimea, len known: 96 (103, 160)
 60/86  Russia/Republic of Tatarstan, len known: 96 (103, 160)
 70/86  Russia/Sevastopol, len known: 96 (103, 160)
 80/86  Russia/Vladimir oblast, len known: 96 (103, 160)
Loss -  0.34517662970266194


---

In [0]:
def get_pred(models):
  """Forecast the 7 test days with an averaged group of models.

  For every place whose key contains 'Ru', each model in the group predicts
  the next day; the predictions are averaged, written back into the series,
  and the features are rebuilt before the following day is predicted.

  Parameters
  ----------
  models : list of ``[model_name, fitted_model, feature_columns]``
      'catboost' and 'adaboost' as ``model_name`` select the input shape those
      models were given in this notebook; any other name passes the feature
      row as a Series. Every model must have been trained on
      ``log(daily increase + 1)``.

  Returns
  -------
  pandas.DataFrame with columns 'date', 'prediction_confirmed' (integer
  cumulative count) and 'country' (region code), one row per region and day.

  Side effects: reads the module-level ``traint`` and ``test``; overwrites
  ``covid-19-solution/data/result/result.csv``.
  """
  max_train = traint['day'].max()
  # DataFrame.append was removed in pandas 2.0; concat is the exact equivalent.
  traintest = pd.concat([traint, test])
  predictions_final = []
  places2 = [p for p in traintest['place'].unique() if 'Ru' in p]

  for place in tqdm(places2):
    df_traintest = traintest[traintest['place'] == place].reset_index(drop=True)
    # Forecasts are fractional, so make both target columns float before
    # writing into them (avoids per-cell dtype upcasts).
    df_traintest = df_traintest.astype({'Confirmed': float, 'Confirmed per day': float})
    count_know = (df_traintest['day'] <= max_train).sum()
    for j in range(7):
      row = j + count_know
      res_m = []
      for model_name, fitted_model, feature_names in models:
        feature_row = df_traintest[feature_names].iloc[row]
        if model_name == 'catboost':
          X_valid = feature_row.values
        elif model_name == 'adaboost':
          X_valid = [feature_row.values]
        else:
          X_valid = feature_row

        # Invert the log(y + 1) training transform.
        res_m.append(np.exp(fitted_model.predict(X_valid)) - 1)

      res_c = float(np.mean(res_m))

      # .loc writes through to the frame; chained `df[col][row] = ...` does
      # not under pandas Copy-on-Write.
      df_traintest.loc[row, 'Confirmed per day'] = res_c
      df_traintest.loc[row, 'Confirmed'] = df_traintest.loc[row - 1, 'Confirmed'] + res_c
      # Rebuild the lag features so the next day sees this forecast.
      df_traintest = make_features(df_traintest)
    # Despite its name, this column is the cumulative sum of the daily values.
    df_traintest['Confirmed per day pred'] = np.cumsum(df_traintest['Confirmed per day'].values)
    predictions_final.append(df_traintest)
  predictions_final = pd.concat(predictions_final)

  # Keep genuine Russian regions only (no country-level row, no composite names).
  province = predictions_final['Province/State']
  is_russian_region = (
      (predictions_final['Country/Region'] == 'Russia')
      & (province != 'nan')
      & province.apply(lambda x: (',' not in str(x)) and ('Russia' not in str(x)))
  )
  predicts = predictions_final[is_russian_region][['Province/State', 'Date', 'Confirmed per day pred']]
  predicts = predicts.merge(test[['Province/State', 'Date', 'Confirmed', 'Deaths']], how='left', on=['Province/State', 'Date'])
  # Last 7 calendar days = the forecast window.
  predicts2 = predicts[predicts['Date'] >= predicts['Date'].max() - pd.DateOffset(6)]
  # .copy() so the dtype change below does not act on a slice of `predicts2`.
  predicts3 = predicts2[['Province/State', 'Date', 'Confirmed per day pred']].copy()
  predicts3['Confirmed per day pred'] = predicts3['Confirmed per day pred'].astype('int64')
  predicts3.to_csv('covid-19-solution/data/result/result.csv')

  # Translate province names into the region codes used by the solution file.
  russia_regions = pd.read_csv('covid-19-solution/data/features/russia_regions.csv')[['iso_code', 'csse_province_state']].rename(columns={'csse_province_state':'Province/State', 'iso_code':'country'})
  predicts3 = predicts3.merge(russia_regions, how='left', on='Province/State')
  del predicts3['Province/State']
  predicts3 = predicts3.rename(columns={'Date':'date','Confirmed per day pred':'prediction_confirmed'})

  return predicts3

In [0]:
# Ensemble group k combines the k-th XGBoost, CatBoost and LightGBM models,
# each entry being [model_name, fitted_model, feature_columns].
group1, group2, group3 = [
    [
        ['xgboost', xgboost_models[k], xgboost_features[k]],
        ['catboost', catboost_models[k], catboost_features[k]],
        ['lightgbm', lightgbm_models[k], lightgbm_features[k]],
    ]
    for k in range(3)
]


In [0]:
# One forecast table per ensemble group, in group order.
preds = [get_pred(group) for group in (group1, group2, group3)]

In [0]:
# Average the group forecasts row by row and truncate to whole cases.
# Work on a copy: the previous `pred = preds[0]` aliased the first table, so
# `+=` modified preds[0] in place and re-running the cell double-counted.
pred = preds[0].copy()
total_confirmed = sum(p['prediction_confirmed'] for p in preds)
pred['prediction_confirmed'] = (total_confirmed / len(preds)).astype('int64')

In [0]:
# Persist the averaged forecast; this overwrites any submission.csv already
# at this path. The index is written as the first, unnamed CSV column.
pred.to_csv('covid-19-solution/data/result/submission.csv')

In [25]:
pred

Unnamed: 0,date,prediction_confirmed,country
0,2020-04-27,323,RU-ALT
1,2020-04-28,363,RU-ALT
2,2020-04-29,404,RU-ALT
3,2020-04-30,445,RU-ALT
4,2020-05-01,490,RU-ALT
5,2020-05-02,542,RU-ALT
6,2020-05-03,595,RU-ALT
7,2020-04-27,38,RU-AMU
8,2020-04-28,41,RU-AMU
9,2020-04-29,44,RU-AMU


In [0]:
# Path of the submission CSV that the scoring cell below reads.
your_submission_file = 'covid-19-solution/data/result/submission.csv'

In [27]:
import pandas as pd
import datetime
import numpy as np
pd.options.display.max_rows = 1000

# Score only the test window.
from_date = MIN_TEST
to_date = MAX_TEST

# Region codes that are scored.
all_country = ['RU-AD','RU-AL','RU-ALT','RU-AMU','RU-ARK','RU-AST','RU-BA',
               'RU-BEL','RU-BRY','RU-BU','RU-CE','RU-CHE','RU-CHU','RU-CU',
               'RU-DA','RU-IN','RU-IRK','RU-IVA','RU-KAM','RU-KB','RU-KC',
               'RU-KDA','RU-KEM','RU-KGD','RU-KGN','RU-KHA','RU-KHM','RU-KIR',
               'RU-KK','RU-KL','RU-KLU','RU-KO','RU-KOS','RU-KR','RU-KRS',
               'RU-KYA','RU-LEN','RU-LIP','RU-MAG','RU-ME','RU-MO','RU-MOS',
               'RU-MOW','RU-MUR','RU-NEN','RU-NGR','RU-NIZ','RU-NVS','RU-OMS',
               'RU-ORE','RU-ORL','RU-PER','RU-PNZ','RU-PRI','RU-PSK','RU-ROS',
               'RU-RYA','RU-SA','RU-SAK','RU-SAM','RU-SAR','RU-SE','RU-SMO',
               'RU-SPE','RU-STA','RU-SVE','RU-TA','RU-TAM','RU-TOM','RU-TUL',
               'RU-TVE','RU-TY','RU-TYU','RU-UD','RU-ULY','RU-VGG','RU-VLA',
               'RU-VLG','RU-VOR','RU-YAN','RU-YAR','RU-YEV','RU-ZAB','UA-40','UA-43']

df_solution = pd.read_csv('covid-19-solution/data/check_submission/solution_file.csv', parse_dates=['date'])
df_solution = df_solution[
    (df_solution['date'] >= str(from_date)) &
    (df_solution['date'] <= str(to_date))
]
df_solution.set_index(['country', 'date'], inplace=True)

df_submission = pd.read_csv(your_submission_file, parse_dates=['date'])
df_columns = [i.lower() for i in df_submission.columns]
# If the file carries both 'region' and 'country', 'region' is used as the key.
if 'region' in df_columns and 'country' in df_columns:
    df_submission.drop(columns=['country'], inplace=True)
df_submission.rename(columns={'region': 'country'}, inplace=True)
df_submission = df_submission[
    (df_submission['date'] >= str(from_date)) &
    (df_submission['date'] <= str(to_date))
]
df_submission.set_index(['country', 'date'], inplace=True)

# Left join keeps every solution row; the solution's own column gets the
# '_solution' suffix.
df_join = df_solution.join(df_submission, how='left', lsuffix='_solution')
# Missing values on either side count as 0. Assigned back explicitly:
# `df[col].fillna(..., inplace=True)` does not update df under Copy-on-Write.
score_columns = ['prediction_confirmed_solution', 'prediction_confirmed']
df_join[score_columns] = df_join[score_columns].fillna(0)

def mean_absolute_logarithmic_error(df_actual, df_predicted):
    """
    Absolute logarithmic error, summed over the rows that are passed in
    (despite the name, no averaging happens inside this function).

               |        predicted + 1  |
    LOSS = SUM |log10 ---------------- |
               |          actual + 1   |

    Both arguments are pandas Series of equal length holding non-negative
    counts; the +1 keeps log10 defined for zeros.
    """
    # Builtin float: the np.float alias was removed in NumPy 1.24.
    actual = df_actual.astype(float).values[:] + 1
    predicted = df_predicted.astype(float).values[:] + 1
    return sum(abs(np.log10(predicted) - np.log10(actual)))


# One summed log-error per region over the test window; the reported loss is
# the mean of those per-region sums.
contry_loss = [
    mean_absolute_logarithmic_error(
        df_join.loc[country]['prediction_confirmed_solution'],
        df_join.loc[country]['prediction_confirmed'],
    )
    for country in all_country
]

print('Loss - ', np.mean(contry_loss))

loss: 0.315150518318631


In [0]:
# Loss - 0.315150518318631