In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import datetime

plt.rcParams["figure.figsize"] = (20,10)

In [102]:
data = pd.read_csv('icl_train.csv')

In [3]:
def show_data_by_dates(data,column_name,
                       left = datetime.date(2019, 1, 1),
                       right = datetime.date(2019, 2, 1)):
  group = data[[column_name,'day_date']].groupby(by = ['day_date']).sum()
  x = group.index
  y = group[column_name]
  fig, ax = plt.subplots()
  ax.plot_date(x, y,'b-')
  fig.autofmt_xdate()
  ax.set_xlim([left, right])

In [4]:
def plot_timeseries(df, station, start_date, end_date):
    fig, ax = plt.subplots(figsize=(20,8))
    st_df = df[df.zone == station][['date', 'total']]
    st_df = st_df.loc[(st_df.date >= pd.to_datetime(start_date)) & (st_df.date <= pd.to_datetime(end_date))]
    ax.plot(st_df.date, st_df.total, '.--')
    ax.set_title(station)
    plt.show();

In [103]:
#change data
data['total'] = data.iloc[:, 2:].sum(axis=1)
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].apply(lambda x: x.year)
data['day'] = data['date'].apply(lambda x: x.day)
data['hour'] = data['date'].apply(lambda x: x.hour)
data['day_date'] = data['date'].apply(lambda x: x.date())
data['day_date'] = pd.to_datetime(data['day_date'])
data['day_name'] = data['date'].apply(lambda x: x.day_name())
data['day_year'] = data['date'].apply(lambda x: x.dayofyear)                                  


In [88]:
stations = data['zone'].unique()

In [None]:
#plot_timeseries(data, stations[0], '20190105', '20190107')

In [6]:
def add_lag(df,column_name,lag_length):
  for lag in range(1,lag_length+1):
    df[column_name+'_'+str(lag)] = df[column_name].shift(lag)
  return df

#amount - количество дней/часов в году..
def encode_time(df, column_name, amount):
  df[column_name+'_s'] = df[column_name].apply(lambda x: np.sin((2*np.pi)/amount*x))
  df[column_name+'_c'] = df[column_name].apply(lambda x: np.cos((2*np.pi)/amount*x))
  return df

def moving_average(df, column_name, moving_length):
  df[column_name+'_ma_'+str(moving_length)] = df[column_name].rolling(window=moving_length).mean().shift(1)
  return df

def gaussian_average(df, column_name, moving_length):
  df[column_name+'_ga_'+str(moving_length)] = df[column_name].rolling(window=5, win_type='gaussian').mean(std=0.5).shift(1)
  return df

def exp_average(df, column_name):
  df[column_name+'_ea_'] = df[column_name].ewm(com=0.5).mean().shift(1)
  return df

def day_off(day):
  if day in ['Sunday','Saturday']:
    return 1
  else:
    return 0

def salary_day(day):
  if ((day>24) or (day<6)):
    return 1
  else:
    return 0

In [None]:
#data

In [106]:
#column_name = 'Перевозка плановая'
#station = stations[0]
#train_df = data[['date','year','day_year','day','hour','zone','day_name',column_name]].query('zone == @station')

# Add_data

In [111]:
data_weather = pd.read_csv('weather_prepared_interpolated.csv', encoding='1251', sep = ',')
data_weather = pd.concat([data_weather,pd.get_dummies(data_weather['weather'])],axis=1)
data_weather = data_weather.drop(columns = ['weather'])
data_weather['date'] = pd.to_datetime(data_weather['Unnamed: 0'])
weather_columns = ['temp', 'w_spid', 'p', 'малооблачно', 'облачно',
       'пасмурно', 'ясно']
#data_weather = data_weather.set_index('Unnamed: 0')

#data = data.set_index('date')
#data = data.join(data_weather, lsuffix='', rsuffix='',how = 'left')
data = pd.merge(data, data_weather,  how='left', left_on=['date'], right_on = ['date'])

In [113]:
data_covid = pd.read_csv('covid_tatarstan.csv', encoding='1251', sep = ';')
data_covid['DateTime'] = pd.to_datetime(data_covid['DateTime'])
covid_columns = ['Заражений за день']
data_covid['day_date'] = data_covid['DateTime']
data_covid = data_covid.drop(columns = ['DateTime'])

data = pd.merge(data, data_covid,  how='left', left_on=['day_date'], right_on = ['day_date'])

In [115]:
data = data.fillna(0)

# Preprocessing

In [117]:
#time_columns = ['date','year','day_year','day','hour','zone','day_name','Перевозка плановая']
#train_df = data[time_columns+weather_columns+covid_columns].query('zone == @stations[0]')

In [118]:
#train_df.isnull().sum()

date                  0
year                  0
day_year              0
day                   0
hour                  0
zone                  0
day_name              0
Перевозка плановая    0
temp                  0
w_spid                0
p                     0
малооблачно           0
облачно               0
пасмурно              0
ясно                  0
Заражений за день     0
dtype: int64

In [120]:
def preprocessing(train_df, column_name, lag_length = 100):
  moving_length = [6,12,24,48]
  moving_column_names = []

  train_df = encode_time(train_df,'hour',24)
  train_df = encode_time(train_df,'day',30)
  train_df = encode_time(train_df,'day_year',365)

  train_df = pd.concat([train_df,pd.get_dummies(train_df['year'])],axis=1)
  train_df = pd.concat([train_df,pd.get_dummies(train_df['day_name'])],axis=1)
  train_df['day_off'] = train_df['day_name'].apply(day_off)
  train_df['salary_day'] = train_df['day'].apply(salary_day)

  #Все фичи для которых можно сделать average + lag
  for clmn in [column_name,'Заражений за день','temp','w_spid']:
    for i in moving_length:
      train_df = moving_average(train_df, clmn, i)
      moving_column_names.append(clmn+'_ma_'+str(i))
    for i in moving_length:
      train_df = gaussian_average(train_df, clmn, i)
      moving_column_names.append(clmn+'_ga_'+str(i))
    train_df = exp_average(train_df, clmn)
    moving_column_names.append(clmn+'_ea_')
    moving_column_names.append(clmn)

  #Дифференциал
  for diff_feature in moving_column_names:
    train_df[diff_feature+'_diff'] = train_df[diff_feature].diff().shift(1)

  #Лаги
  for lag_feature in moving_column_names:
    train_df = add_lag(train_df,lag_feature,lag_length)

  additional_lag_columns = ['salary_day','day_off']
  for lag_feature in additional_lag_columns:
    train_df = add_lag(train_df,lag_feature,3)

  train_df = train_df.dropna()
  train_df = train_df.drop(columns = ['date','year','day_year','day','hour','zone','day_name'])

  return train_df

# LinReg

In [121]:
import sklearn
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [112]:
train_df = train_df.reset_index()
train_df = train_df.drop(columns = ['index'])
X = train_df.drop(columns = [column_name])
Y = train_df[column_name]

In [13]:
time_columns = ['date','year','day_year','day','hour','zone','day_name','Перевозка плановая']

In [126]:
columns_to_test = ['Перевозка плановая','Вызов специальной бригады','Перевозка экстренная']
stations_to_test = stations[0:3]
time_columns = ['date','year','day_year','day','hour','zone','day_name']
lag_length = 100

final_mae = np.array([])

for column_name in tqdm(columns_to_test):
  for station in stations_to_test:
    train_df = data[time_columns+weather_columns+covid_columns+[column_name]].query('zone == @station')
    train_df = preprocessing(train_df,column_name, lag_length)

    train_df = train_df.reset_index()
    train_df = train_df.drop(columns = ['index'])
    X = train_df.drop(columns = [column_name])
    Y = train_df[column_name]
    
    lr_rmse_list = np.array([])
    tscv = TimeSeriesSplit()
    for train_index, test_index in tscv.split(X):
      X_train, X_test = X.loc[train_index], X.loc[test_index]
      y_train, y_test = Y.loc[train_index], Y.loc[test_index]
      
      # ТУТ УЧИМ И ТЕСТИМ
      #lr_model = LinearRegression()
      lr_model = Ridge(alpha=1.0)
      lr_model.fit(X_train,y_train)
      
      predicted = lr_model.predict(X_test).round()
      lr_rmse = mean_absolute_error(y_test,predicted)
      lr_rmse_list = np.append(lr_rmse_list,lr_rmse)
      
      #print('MAE = %.2f' % lr_rmse)
    final_mae = np.append(final_mae,lr_rmse.mean())
    print('\n',pd.Series(predicted).value_counts())
    print(lr_rmse_list)
    print('MAE ', column_name,' ', station,' - ', lr_rmse_list.mean())
    print('All 0', mean_absolute_error(y_test,np.zeros(len(y_test))),'\n')

print('*'*10+'\n')
print('MAE: ', final_mae.mean(),' lag: ', lag_length)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]


  1.0    935
 3.0    736
 2.0    723
 4.0    315
 0.0    173
 5.0     15
-1.0      1
dtype: int64
[0.71463078 0.64596273 0.64699793 1.46480331 1.2284334 ]
MAE  Перевозка плановая   П/станция 1  -  0.9401656314699792
All 0 2.067287784679089 


  1.0    1744
 2.0     781
 0.0     361
 3.0      11
-1.0       1
dtype: int64
[0.32125604 0.31918565 0.38371291 0.7926156  1.09765355]
MAE  Перевозка плановая   П/станция 2  -  0.5828847481021393
All 0 1.279848171152519 



 33%|████████████████████████████                                                        | 1/3 [01:22<02:44, 82.05s/it]


 1.0    1823
0.0     904
2.0     171
dtype: int64
[0.62249827 0.50172533 0.51069703 0.67253278 0.74223602]
MAE  Перевозка плановая   П/станция 3  -  0.6099378881987578
All 0 0.8026224982746721 


 0.0    2898
dtype: int64
[0.02346446 0.02311939 0.01897861 0.02035887 0.01828847]
MAE  Вызов специальной бригады   П/станция 1  -  0.020841959972394755
All 0 0.01828847481021394 


 -0.0    2898
dtype: int64
[0.02070393 0.01966874 0.01690821 0.01966874 0.01863354]
MAE  Вызов специальной бригады   П/станция 2  -  0.01911663216011042
All 0 0.018633540372670808 



 67%|████████████████████████████████████████████████████████                            | 2/3 [02:42<01:20, 80.88s/it]


 0.0    2898
dtype: int64
[0.01483782 0.01518288 0.00966184 0.01725328 0.01380262]
MAE  Вызов специальной бригады   П/станция 3  -  0.01414768806073154
All 0 0.013802622498274672 


 0.0    2192
1.0     706
dtype: int64
[0.28743961 0.27260179 0.26397516 0.36853002 0.36231884]
MAE  Перевозка экстренная   П/станция 1  -  0.3109730848861284
All 0 0.32988267770876467 


 0.0    2890
1.0       8
dtype: int64
[0.08661146 0.09627329 0.12111801 0.17218772 0.16873706]
MAE  Перевозка экстренная   П/станция 2  -  0.1289855072463768
All 0 0.16735679779158041 



100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:00<00:00, 80.33s/it]


 0.0    2611
1.0     287
dtype: int64
[0.19392685 0.1863354  0.21325052 0.27639752 0.25189786]
MAE  Перевозка экстренная   П/станция 3  -  0.2243616287094548
All 0 0.23360938578329882 

**********

MAE:  0.43355570891802775  lag:  100



