In [327]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import datetime
from sklearn.preprocessing import LabelEncoder

# Default figure size (width, height) applied to every matplotlib figure in this notebook
# unless a plotting call passes its own figsize.
plt.rcParams["figure.figsize"] = (20,10)

In [198]:
# Load the training table; later cells read its 'date' and 'zone' columns
# and the per-category call-count columns used as prediction targets.
data = pd.read_csv('icl_train.csv')

In [3]:
def show_data_by_dates(data,column_name,
                       left = datetime.date(2019, 1, 1),
                       right = datetime.date(2019, 2, 1)):
  """Plot the per-day sum of ``column_name`` as a line chart.

  Args:
    data: DataFrame containing ``column_name`` and a ``day_date`` column.
    column_name: name of the column to sum for each ``day_date``.
    left: left limit of the x axis.
    right: right limit of the x axis.

  Returns:
    None. A new figure is created as a side effect.
  """
  group = data[[column_name,'day_date']].groupby(by = ['day_date']).sum()
  x = group.index
  y = group[column_name]
  fig, ax = plt.subplots()
  # Axes.plot_date is deprecated; Axes.plot handles datetime x values natively.
  ax.plot(x, y, 'b-')
  ax.set_title(column_name)
  ax.set_xlabel('day_date')
  ax.set_ylabel(column_name)
  fig.autofmt_xdate()
  ax.set_xlim([left, right])

In [4]:
def plot_timeseries(df, station, start_date, end_date, value_column='total'):
    """Plot one station's values between two dates (both ends inclusive).

    Args:
        df: DataFrame with ``zone`` and ``date`` columns plus ``value_column``.
        station: value of ``zone`` to select.
        start_date: first date to show; anything ``pd.to_datetime`` accepts.
        end_date: last date to show; anything ``pd.to_datetime`` accepts.
        value_column: column to draw on the y axis. Defaults to ``'total'``,
            the column the original implementation always used.

    Raises:
        KeyError: if ``value_column`` is not a column of ``df``.
    """
    if value_column not in df.columns:
        raise KeyError(
            "column %r not found in df; pass value_column=<existing column>" % (value_column,)
        )

    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    selected = (df['zone'] == station) & (df['date'] >= start) & (df['date'] <= end)
    st_df = df.loc[selected, ['date', value_column]]

    fig, ax = plt.subplots(figsize=(20,8))
    ax.plot(st_df['date'], st_df[value_column], '.--')
    ax.set_title(station)
    ax.set_xlabel('date')
    ax.set_ylabel(value_column)
    plt.show()

In [199]:
# Derive calendar features from the timestamp column.
# The vectorized .dt accessors replace the per-row .apply(lambda ...) calls.
# The aggregate 'total' column is intentionally left disabled:
#data['total'] = data.iloc[:, 2:].sum(axis=1)
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['day'] = data['date'].dt.day
data['hour'] = data['date'].dt.hour
# Midnight of the same day, kept as datetime64 so it can be merged on later.
data['day_date'] = data['date'].dt.normalize()
data['day_name'] = data['date'].dt.day_name()
data['day_year'] = data['date'].dt.dayofyear


In [88]:
# Distinct values of the 'zone' column; sliced further down to pick the stations to evaluate.
stations = data['zone'].unique()

In [None]:
#plot_timeseries(data, stations[0], '20190105', '20190107')

In [166]:
def add_lag(df,column_name,lag_length):
  """Add lagged copies of ``column_name`` to ``df`` in place and return ``df``.

  For every k in 1..lag_length a column ``<column_name>_<k>`` is created that
  holds the source column shifted down by k rows (the first k rows are NaN).
  """
  shift_by = 1
  while shift_by <= lag_length:
    lagged_name = column_name + '_' + str(shift_by)
    df[lagged_name] = df[column_name].shift(shift_by)
    shift_by += 1
  return df

# amount - number of units in one full cycle (days in a year, hours in a day, ...)
def encode_time(df, column_name, amount):
  """Add sine/cosine encodings of a cyclic column to ``df`` in place.

  Creates ``<column_name>_s`` = sin(2*pi/amount * value) and
  ``<column_name>_c`` = cos(2*pi/amount * value).

  Args:
    df: DataFrame holding ``column_name``; it is modified and returned.
    column_name: numeric cyclic column (e.g. hour of day).
    amount: length of one full cycle, in the units of the column.

  Returns:
    The same ``df`` object with the two new columns.
  """
  # Compute the angle once for the whole column instead of calling a Python
  # lambda per row (twice). The float cast also lets object-dtype columns
  # holding plain numbers go through the numpy ufuncs.
  angle = (2*np.pi)/amount * df[column_name].astype(float)
  df[column_name+'_s'] = np.sin(angle)
  df[column_name+'_c'] = np.cos(angle)
  return df

def moving_average(df, column_name, moving_length):
  """Add a trailing moving average of ``column_name`` to ``df`` in place.

  The new column ``<column_name>_ma_<moving_length>`` is the rolling mean over
  ``moving_length`` rows, shifted down one row so a row never sees its own value.
  """
  rolling_mean = df[column_name].rolling(window=moving_length).mean()
  previous_mean = rolling_mean.shift(1)
  df[column_name+'_ma_'+str(moving_length)] = previous_mean
  return df

def gaussian_average(df, column_name, moving_length):
  """Add a Gaussian-weighted trailing average of ``column_name`` to ``df`` in place.

  The new column ``<column_name>_ga_<moving_length>`` is a rolling mean with a
  'gaussian' window (std=0.5) over ``moving_length`` rows, shifted down one row.
  """
  gaussian_window = df[column_name].rolling(window=moving_length, win_type='gaussian')
  weighted_mean = gaussian_window.mean(std=0.5)
  df[column_name+'_ga_'+str(moving_length)] = weighted_mean.shift(1)
  return df

def exp_average(df, column_name):
  """Add an exponentially weighted average of ``column_name`` to ``df`` in place.

  The new column ``<column_name>_ea_`` is the ``ewm(com=0.5)`` mean, shifted
  down one row so a row never sees its own value.
  """
  smoothed = df[column_name].ewm(com=0.5).mean()
  df[column_name+'_ea_'] = smoothed.shift(1)
  return df

def day_off(day):
  """Return 1 when ``day`` is the weekday name 'Sunday' or 'Saturday', else 0."""
  weekend_names = ['Sunday','Saturday']
  return 1 if day in weekend_names else 0

def salary_day(day):
  """Return 1 when the day of month is after the 24th or before the 6th, else 0."""
  near_month_edge = (day > 24) or (day < 6)
  return 1 if near_month_edge else 0

# Additional data (weather, COVID-19)

In [201]:
# Load the weather table and one-hot encode its categorical 'weather' column.
data_weather = pd.read_csv('weather_prepared_interpolated.csv', encoding='1251', sep = ',')
weather_dummies = pd.get_dummies(data_weather['weather'])
data_weather = pd.concat([data_weather.drop(columns = ['weather']), weather_dummies], axis=1)

# The timestamp is read from the unnamed first CSV column.
data_weather['date'] = pd.to_datetime(data_weather['Unnamed: 0'])

# Weather features used by the models (numeric readings + one-hot sky condition).
weather_columns = [
    'temp', 'w_spid', 'p',
    'малооблачно', 'облачно', 'пасмурно', 'ясно',
]

# Attach the weather row with the same timestamp to every training row.
data = data.merge(data_weather, how='left', left_on=['date'], right_on=['date'])

In [202]:
# Load the covid table and expose its timestamp as 'day_date',
# the key it shares with the training frame.
data_covid = pd.read_csv('covid_tatarstan.csv', encoding='1251', sep = ';')
covid_columns = ['Заражений за день']
covid_day = pd.to_datetime(data_covid['DateTime'])
data_covid = data_covid.drop(columns = ['DateTime']).assign(day_date=covid_day)

# Attach the covid row with the same day to every training row.
data = data.merge(data_covid, how='left', left_on=['day_date'], right_on=['day_date'])

In [203]:
# Replace every remaining NaN in the merged frame with 0 (rows the left merges
# could not match get 0 for their weather / covid columns).
# NOTE(review): this also zero-fills weather readings such as 'temp' and 'p';
# confirm that 0 is an acceptable stand-in for them.
data = data.fillna(0)

# Preprocessing

In [117]:
#time_columns = ['date','year','day_year','day','hour','zone','day_name','Перевозка плановая']
#train_df = data[time_columns+weather_columns+covid_columns].query('zone == @stations[0]')

In [118]:
#train_df.isnull().sum()

date                  0
year                  0
day_year              0
day                   0
hour                  0
zone                  0
day_name              0
Перевозка плановая    0
temp                  0
w_spid                0
p                     0
малооблачно           0
облачно               0
пасмурно              0
ясно                  0
Заражений за день     0
dtype: int64

In [334]:
#day_name_unique = data['day_name'].unique()
def preprocessing(train_df, column_name, lag_length = 100):
  """Build the feature table for one target column.

  Adds cyclic time encodings, day-off / salary-day flags, trailing moving
  averages, first differences and lags, then drops every row that contains
  a NaN (the warm-up rows of the rolling / shift operations).

  Args:
    train_df: frame with 'hour', 'day', 'day_year', 'day_name',
      'Заражений за день', 'temp', 'w_spid' and ``column_name`` columns.
      It is NOT modified; the features are built on a copy.
    column_name: name of the target column.
    lag_length: number of lags created for every averaged / raw feature.

  Returns:
    A new DataFrame with the original columns plus the engineered features.
  """
  moving_length = [6,12,24,48]
  moving_column_names = []

  # Work on a private copy. The helpers below add columns in place, which
  # otherwise leaks engineered columns into the caller's frame and triggers
  # SettingWithCopyWarning when the caller passes a slice.
  train_df = train_df.copy()

  train_df = encode_time(train_df,'hour',24)
  train_df = encode_time(train_df,'day',30)
  train_df = encode_time(train_df,'day_year',365)

  # One-hot encoding of 'year' / 'day_name' is deliberately skipped:
  # pd.get_dummies cannot be fitted on the training data and re-applied later.
  train_df['day_off'] = train_df['day_name'].apply(day_off)
  train_df['salary_day'] = train_df['day'].apply(salary_day)

  # All features for which moving averages and lags are computed.
  # (gaussian_average / exp_average can be added here in the same way.)
  for clmn in [column_name,'Заражений за день','temp','w_spid']:
    for i in moving_length:
      train_df = moving_average(train_df, clmn, i)
      moving_column_names.append(clmn+'_ma_'+str(i))
    moving_column_names.append(clmn)

  # First differences, shifted one row so a row never uses its own value.
  for diff_feature in moving_column_names:
    train_df[diff_feature+'_diff'] = train_df[diff_feature].diff().shift(1)

  # Lags of every raw and averaged feature.
  for lag_feature in moving_column_names:
    train_df = add_lag(train_df,lag_feature,lag_length)

  # Short lags of the calendar flags.
  additional_lag_columns = ['salary_day','day_off']
  for lag_feature in additional_lag_columns:
    train_df = add_lag(train_df,lag_feature,3)

  # Drop the warm-up rows whose rolling / lagged features are still NaN.
  train_df = train_df.dropna()

  return train_df

# Linear regression (Ridge) with time-series cross-validation

In [121]:
import sklearn
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [13]:
# Calendar / identifier columns plus the 'Перевозка плановая' target.
# NOTE: this list is reassigned without the target column in the cross-validation cell below.
time_columns = ['date','year','day_year','day','hour','zone','day_name','Перевозка плановая']

In [None]:
# Evaluation setup: targets, stations and lag depth to cross-validate.
columns_to_test = ['Перевозка плановая','Вызов специальной бригады','Перевозка экстренная']
stations_to_test = stations[0:3]
time_columns = ['date','year','day_year','day','hour','zone','day_name']
lag_length = 10

# One entry per (target, station): MAE averaged over all CV folds.
final_mae = np.array([])

for column_name in tqdm(columns_to_test):
  for station in stations_to_test:
    train_df = data[time_columns+weather_columns+covid_columns+[column_name]].query('zone == @station')
    train_df = preprocessing(train_df,column_name, lag_length)

    # Fresh 0..n-1 index so the positional indices from TimeSeriesSplit line up.
    train_df = train_df.reset_index(drop=True)
    X = train_df.drop(columns = time_columns + [column_name])
    Y = train_df[column_name]

    # NOTE: despite the "rmse" in their names, these variables hold MAE values.
    lr_rmse_list = np.array([])
    tscv = TimeSeriesSplit()
    for train_index, test_index in tscv.split(X):
      # TimeSeriesSplit yields positions, so select with .iloc.
      X_train, X_test = X.iloc[train_index], X.iloc[test_index]
      y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

      # Train and evaluate on this fold.
      lr_model = Ridge(alpha=0.7)
      lr_model.fit(X_train,y_train)

      predicted = lr_model.predict(X_test).round()
      lr_rmse = mean_absolute_error(y_test,predicted)
      lr_rmse_list = np.append(lr_rmse_list,lr_rmse)

    # Fix: aggregate over ALL folds. The original appended lr_rmse.mean(),
    # i.e. only the last fold's score.
    final_mae = np.append(final_mae,lr_rmse_list.mean())
    print('\n',pd.Series(predicted).value_counts())
    print(lr_rmse_list)
    print('MAE ', column_name,' ', station,' - ', lr_rmse_list.mean())
    # Baseline: always predicting 0, measured on the last fold only.
    print('All 0', mean_absolute_error(y_test,np.zeros(len(y_test))),'\n')

print('*'*10+'\n')
print('MAE: ', final_mae.mean(),' lag: ', lag_length)

# Predict 1 step forward

In [190]:
from datetime import timedelta

In [474]:
# Tail of the time series: the last `cut` rows of one station, used as the seed history for forecasting.
# NOTE(review): `column_name` and `station` are not set in this cell; they are whatever the
# cross-validation loop left behind (its last target column and station) - confirm this is intended.
cut = 100
cut_df = data[time_columns+weather_columns+covid_columns+[column_name]].query('zone == @station').iloc[-cut:]


def _first_value_or_zero(rows, name):
  """Return the first non-missing value of ``rows[name]``, or 0 when there is none."""
  if len(rows) == 0:
    return 0
  value = rows[name].iloc[0]
  return 0 if pd.isna(value) else value


def predict_1_step(cut_df, column_name, station, lag_length, model):
  """Extend ``cut_df`` by one hourly row whose target is predicted by ``model``.

  The new row is stamped one hour after the last row of ``cut_df``. Its weather
  and covid values are looked up in the module-level ``data_weather`` /
  ``data_covid`` frames, and its target is the rounded model prediction.

  Args:
    cut_df: history frame holding the calendar columns, ``weather_columns``,
      ``covid_columns`` and ``column_name``. It is not modified.
    column_name: name of the target column.
    station: value stored in the new row's 'zone' column.
    lag_length: lag depth passed to ``preprocessing``; must match the one
      the model was trained with.
    model: fitted estimator exposing ``predict``.

  Returns:
    A new DataFrame with the columns of ``cut_df`` and one extra row at the
    end. The index is renumbered 0..n-1.

  Raises:
    ValueError: if ``preprocessing`` yields no features for the new row
      (too little history, or a NaN in the new row).
  """
  columns_to_save = cut_df.columns
  date = cut_df['date'].iloc[-1] + timedelta(minutes=60)

  weather_rows = data_weather.loc[data_weather['date'] == date, weather_columns]
  # normalize() truncates to midnight. The previous round(freq='d') rounded
  # to the NEAREST day, so hours from noon on looked up the next day's row.
  covid_rows = data_covid.loc[data_covid['day_date'] == date.normalize(), covid_columns]

  # Build the row by column name instead of through a mixed-type numpy array.
  new_row = {
    'date': date,
    'year': date.year,
    'day_year': date.dayofyear,
    'day': date.day,
    'hour': date.hour,
    'zone': station,
    'day_name': date.day_name(),
  }
  # Missing weather / covid rows fall back to 0, as fillna(0) does for the training data.
  for name in weather_columns:
    new_row[name] = _first_value_or_zero(weather_rows, name)
  for name in covid_columns:
    new_row[name] = _first_value_or_zero(covid_rows, name)
  # Placeholder only: every target-derived feature is shifted by at least one
  # row, so it never enters the new row's features. It must not be NaN,
  # otherwise preprocessing would drop the row.
  new_row[column_name] = -1

  extended = pd.concat(
    [cut_df[columns_to_save], pd.DataFrame([new_row], columns=columns_to_save)],
    ignore_index=True,
  )
  new_label = extended.index[-1]

  # preprocessing adds columns in place, so hand it a copy.
  temp = preprocessing(extended.copy(), column_name, lag_length)
  if temp.empty or temp.index[-1] != new_label:
    raise ValueError(
      'preprocessing produced no feature row for %s; check the history length and missing values' % (date,)
    )

  x = temp.drop(columns = ['date','year','day_year','day','hour','zone','day_name', column_name]).iloc[-1].to_numpy(dtype=float)
  y_pred = model.predict(x.reshape(1,-1)).round()

  # Write the prediction into the new row's target cell only. The previous
  # replace(-1, y) used an undefined name and overwrote every -1 in the frame.
  extended.loc[new_label, column_name] = np.ravel(y_pred)[0]

  return extended[columns_to_save]

In [475]:
# Roll the one-step forecast forward 90*24 times (each step adds one hour, i.e. 90 days in total),
# feeding every prediction back in as history for the next step.
# NOTE(review): `lr_model` is the model left behind by the last cross-validation fold - confirm this is intended.
for i in tqdm(range(90*24)):
  cut_df = predict_1_step(cut_df, column_name, station, lag_length, lr_model)

100%|██████████████████████████████████████████████████████████████████████████████| 2160/2160 [07:23<00:00,  4.87it/s]


In [477]:
# Sanity check: total number of missing values in the extended frame.
cut_df.isna().sum().sum()

0