## Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt

## Data

In [11]:
df = pd.read_csv('icl_train.zip', 
                 compression='zip',
                 sep=',',
                 parse_dates=['date'],
                 index_col='date')
df.head()

Unnamed: 0_level_0,zone,Вызов специальной бригады,Перевозка плановая,Перевозка экстренная,авария ( сбило машиной) постр.,аллергия( задыхается),аллергия( отекло лицо),аллергия( сыпь),аритмия,без сознания,...,травма позвоночника,травма ребер( задыхается),травма ребер( не задыхается),травма руки,травма руки( кровотечение),тяж.ст.опьянения,укусила собака,умер?,умер?( хронический больной),умирает
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 07:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-01-01 08:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2019-01-01 09:00:00,П/станция 1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2019-01-01 10:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-01-01 11:00:00,П/станция 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Погода

In [23]:
weather_df = pd.read_csv('weather_prepared_interpolated.csv', 
                         encoding='cp1251',
                         parse_dates=[0],
                         index_col=0)
weather_df.head()

Unnamed: 0,weather,temp,w_spid,p
2019-01-01 03:00:00,пасмурно,-8.0,3.0,765.0
2019-01-01 04:00:00,пасмурно,-8.333333,2.666667,765.0
2019-01-01 05:00:00,пасмурно,-8.666667,2.333333,765.0
2019-01-01 06:00:00,пасмурно,-9.0,2.0,765.0
2019-01-01 07:00:00,пасмурно,-9.333333,2.0,765.0


#### Лунные фазы

In [24]:
moon_df = pd.read_csv('moon_prepared_interpolated.csv', 
                      encoding='cp1251',
                      parse_dates=[0],
                      index_col=0)
moon_df.head()

Unnamed: 0,новая луна,первая четверь,полнолуние,третья четверть
2019-01-01 00:00:00,1.0,0.0,0.0,0.0
2019-01-01 01:00:00,1.0,0.0,0.0,0.0
2019-01-01 02:00:00,1.0,0.0,0.0,0.0
2019-01-01 03:00:00,1.0,0.0,0.0,0.0
2019-01-01 04:00:00,1.0,0.0,0.0,0.0


#### Динамика по ковиду

In [43]:
covid_df = pd.read_csv('covid_interpolated.csv', 
                       encoding='cp1251',
                       parse_dates=[0],
                       index_col=0)
covid_df.head()

Unnamed: 0,Заражений за день
2019-01-01 00:00:00,0.0
2019-01-01 01:00:00,0.0
2019-01-01 02:00:00,0.0
2019-01-01 03:00:00,0.0
2019-01-01 04:00:00,0.0


### Агрегируем данные

In [72]:
df_new = df.copy()
df_new[weather_df.columns] = weather_df.loc[df.index]
df_new[moon_df.columns] = moon_df.loc[df.index]
df_new[covid_df.columns] = covid_df.loc[df.index]

In [73]:
#!pip install workalendar

In [74]:
from workalendar.europe import Russia
cal = Russia()
df_new['праздничный день'] = df_new.reset_index().date.apply(cal.is_working_day).values

In [75]:
def day_off(date):
    day_name = date.day_name()
    return (day_name in ['Sunday','Saturday'])

def salary_day(date):
    day = date.day
    return ((day>24) or (day<6))

In [76]:
df_new['выходной день'] = df_new.reset_index().date.apply(day_off).values
df_new['зарплатный день'] = df_new.reset_index().date.apply(salary_day).values

In [77]:
df_new.head()

Unnamed: 0_level_0,zone,Вызов специальной бригады,Перевозка плановая,Перевозка экстренная,авария ( сбило машиной) постр.,аллергия( задыхается),аллергия( отекло лицо),аллергия( сыпь),аритмия,без сознания,...,w_spid,p,новая луна,первая четверь,полнолуние,третья четверть,Заражений за день,праздничный день,выходной день,зарплатный день
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 07:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,2.0,765.0,1.0,0.0,0.0,0.0,0.0,False,False,True
2019-01-01 08:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,2.0,765.0,1.0,0.0,0.0,0.0,0.0,False,False,True
2019-01-01 09:00:00,П/станция 1,0,0,0,0,0,0,0,0,1,...,2.0,765.0,1.0,0.0,0.0,0.0,0.0,False,False,True
2019-01-01 10:00:00,П/станция 1,0,0,0,0,0,0,0,0,0,...,2.006102,764.666667,1.0,0.0,0.0,0.0,0.0,False,False,True
2019-01-01 11:00:00,П/станция 1,1,0,0,0,0,0,0,0,0,...,2.012205,764.333333,1.0,0.0,0.0,0.0,0.0,False,False,True


## Pipeline

In [80]:
targets = df.columns[1:]
targets

Index(['Вызов специальной бригады', 'Перевозка плановая',
       'Перевозка экстренная', 'авария ( сбило машиной) постр.',
       'аллергия( задыхается)', 'аллергия( отекло лицо)', 'аллергия( сыпь)',
       'аритмия', 'без сознания', 'без сознания( сахарный диабет)',
       ...
       'травма позвоночника', 'травма ребер( задыхается)',
       'травма ребер( не задыхается)', 'травма руки',
       'травма руки( кровотечение)', 'тяж.ст.опьянения', 'укусила собака',
       'умер?', 'умер?( хронический больной)', 'умирает'],
      dtype='object', length=153)

In [52]:
def add_lag(df,column_name,lag_length):
    for lag in range(1,lag_length+1):
        df[column_name+'_'+str(lag)] = df[column_name].shift(lag)
    return df

In [56]:
def moving_average(df, column_name, moving_length):
    df[column_name+'_ma_'+str(moving_length)] = df[column_name].rolling(window=moving_length).mean().shift(1)
    return df

In [57]:
def encode_time(df, column_name, amount):
    df[column_name+'_s'] = df[column_name].apply(lambda x: np.sin((2*np.pi)/amount*x))
    df[column_name+'_c'] = df[column_name].apply(lambda x: np.cos((2*np.pi)/amount*x))
    return df

In [81]:
df_new = pd.concat([df_new,pd.get_dummies(df_new['weather'])],axis=1)
df_new = df_new.drop(columns = ['weather'])

In [94]:
df_new['hour'] = df_new.reset_index().date.apply(lambda x: x.hour).values
df_new['day'] = df_new.reset_index().date.apply(lambda x: x.day).values
df_new['year'] = df_new.reset_index().date.apply(lambda x: x.year).values
df_new['day_year'] = df_new.reset_index().date.apply(lambda x: x.dayofyear).values

In [105]:
data = df_new.drop(targets, axis=1)

In [89]:
stations = data['zone'].unique()

In [138]:
def preprocessing(train_df, column_name, lag_length = 100):
    moving_length = [5]
    moving_column_names = []

    train_df = encode_time(train_df,'hour',24)
    train_df = encode_time(train_df,'day',30)
    train_df = encode_time(train_df,'day_year',365)

    #Все фичи для которых можно сделать average + lag
    for clmn in [column_name,'Заражений за день','temp','w_spid', 'p']:
        for i in moving_length:
            train_df = moving_average(train_df, clmn, i)
            moving_column_names.append(clmn+'_ma_'+str(i))
        moving_column_names.append(clmn)

    #Дифференциал
    for diff_feature in moving_column_names:
        train_df[diff_feature+'_diff'] = train_df[diff_feature].diff().shift(1)

    #Лаги
    for lag_feature in moving_column_names:
        train_df = add_lag(train_df,lag_feature,lag_length)

    additional_lag_columns = ['зарплатный день','выходной день']
    for lag_feature in additional_lag_columns:
        train_df = add_lag(train_df,lag_feature,3)

    train_df = train_df.dropna()
    train_df = train_df.drop(columns = ['year','day_year','day','hour','zone'])

    return train_df

In [135]:
train_df = data.query('zone == @stations[0]')

In [136]:
train_df

Unnamed: 0_level_0,zone,temp,w_spid,p,новая луна,первая четверь,полнолуние,третья четверть,Заражений за день,праздничный день,выходной день,зарплатный день,малооблачно,облачно,пасмурно,ясно,day,year,day_year,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-01 07:00:00,П/станция 1,-9.333333,2.000000,765.000000,1.0,0.0,0.0,0.0,0.0,False,False,True,0,0,1,0,1,2019,1,7
2019-01-01 08:00:00,П/станция 1,-9.666667,2.000000,765.000000,1.0,0.0,0.0,0.0,0.0,False,False,True,0,0,1,0,1,2019,1,8
2019-01-01 09:00:00,П/станция 1,-10.000000,2.000000,765.000000,1.0,0.0,0.0,0.0,0.0,False,False,True,1,0,0,0,1,2019,1,9
2019-01-01 10:00:00,П/станция 1,-9.666667,2.006102,764.666667,1.0,0.0,0.0,0.0,0.0,False,False,True,1,0,0,0,1,2019,1,10
2019-01-01 11:00:00,П/станция 1,-9.333333,2.012205,764.333333,1.0,0.0,0.0,0.0,0.0,False,False,True,1,0,0,0,1,2019,1,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00,П/станция 1,-7.200000,3.533333,761.000000,1.0,0.0,0.0,0.0,95.0,True,False,True,0,0,1,0,31,2020,366,19
2020-12-31 20:00:00,П/станция 1,-7.300000,3.466667,761.500000,1.0,0.0,0.0,0.0,95.0,True,False,True,0,0,1,0,31,2020,366,20
2020-12-31 21:00:00,П/станция 1,-7.400000,3.400000,762.000000,1.0,0.0,0.0,0.0,95.0,True,False,True,0,0,1,0,31,2020,366,21
2020-12-31 22:00:00,П/станция 1,-7.500000,3.333333,762.500000,1.0,0.0,0.0,0.0,95.0,True,False,True,0,0,1,0,31,2020,366,22


In [100]:
final_mae = dict()
for t in targets:
    final_mae[t] = dict()
    for st in stations:
        final_mae[t][st] = -1

In [141]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit


final_mae = np.array([])
for column_name in targets:
    for station in stations:
        train_df = pd.concat([data, df_new[column_name]],axis=1).query('zone == @station')
        train_df = preprocessing(train_df,column_name, 10)

        train_df = train_df.reset_index(drop=True)
        X = train_df.drop(columns = [column_name])
        Y = train_df[column_name]

        lr_rmse_list = np.array([])
        tscv = TimeSeriesSplit()
        for train_index, test_index in tscv.split(X):
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = Y.loc[train_index], Y.loc[test_index]

        # ТУТ УЧИМ И ТЕСТИМ
        #lr_model = LinearRegression()
        lr_model = Ridge(alpha=1.0)
        lr_model.fit(X_train,y_train)

        predicted = lr_model.predict(X_test).round()
        lr_rmse = mean_absolute_error(y_test,predicted)
        lr_rmse_list = np.append(lr_rmse_list,lr_rmse)

          #print('MAE = %.2f' % lr_rmse)
        final_mae = np.append(final_mae,lr_rmse.mean())
        print('\n',pd.Series(predicted).value_counts())
        print(lr_rmse_list)
        print('MAE ', column_name,' ', station,' - ', lr_rmse_list.mean())
        print('All 0', mean_absolute_error(y_test,np.zeros(len(y_test))),'\n')


 0.0    2920
dtype: int64
[0.01815068]
MAE  Вызов специальной бригады   П/станция 1  -  0.018150684931506848
All 0 0.018150684931506848 


 0.0    2920
dtype: int64
[0.01883562]
MAE  Вызов специальной бригады   П/станция 2  -  0.018835616438356163
All 0 0.018835616438356163 


 0.0    2920
dtype: int64
[0.01369863]
MAE  Вызов специальной бригады   П/станция 3  -  0.0136986301369863
All 0 0.0136986301369863 


 2.0    1085
1.0     828
3.0     749
4.0     152
0.0     100
5.0       6
dtype: int64
[1.3]
MAE  Перевозка плановая   П/станция 1  -  1.3
All 0 2.0578767123287673 


 1.0    2174
0.0     463
2.0     283
dtype: int64
[1.09246575]
MAE  Перевозка плановая   П/станция 2  -  1.0924657534246576
All 0 1.2787671232876712 


 1.0    2008
0.0     789
2.0     123
dtype: int64
[0.76952055]
MAE  Перевозка плановая   П/станция 3  -  0.7695205479452055
All 0 0.8037671232876712 


 0.0    2176
1.0     744
dtype: int64
[0.35582192]
MAE  Перевозка экстренная   П/станция 1  -  0.3558219178082192
Al

In [None]:
cut = 100
cut_df = data[time_columns+weather_columns+covid_columns+[column_name]].query('zone == @station').iloc[-cut:]


def predict_1_step(cut_df, column_name, station, lag_length, model):
  
    columns_to_save = cut_df.columns
    date = cut_df['date'].iloc[-1] + timedelta(minutes=60)
    year = date.year
    day_year = date.dayofyear
    day = date.day
    hour = date.hour
    zone = station
    day_name = date.day_name()
    cut_weather = data_weather[data_weather['date'] == date][weather_columns].to_numpy()
    cut_covid = data_covid[data_covid['day_date'] == date][covid_columns].to_numpy()
    target = -1 #потом перезаписывается

    new_row = np.array([date,year,day_year,day,hour,zone,day_name])
    new_row = np.hstack((a1,cut_weather.ravel(),cut_covid.ravel(),target))
    cut_df.loc[len(cut_df)] = new_row

    temp = preprocessing(cut_df, column_name, lag_length)

    x = np.array(temp.drop(columns = ['date','year','day_year','day','hour','zone','day_name', column_name]).iloc[-1])
    y_pred = model.predict(x.reshape(1,-1)).round()

    cut_df = cut_df.replace(-1, y)

    return cut_df[columns_to_save]