In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import datetime

plt.rcParams["figure.figsize"] = (20,10)

In [16]:
data = pd.read_csv('icl_train.csv')

In [3]:
def show_data_by_dates(data,column_name,
                       left = datetime.date(2019, 1, 1),
                       right = datetime.date(2019, 2, 1)):
  group = data[[column_name,'day_date']].groupby(by = ['day_date']).sum()
  x = group.index
  y = group[column_name]
  fig, ax = plt.subplots()
  ax.plot_date(x, y,'b-')
  fig.autofmt_xdate()
  ax.set_xlim([left, right])

In [4]:
def plot_timeseries(df, station, start_date, end_date):
    fig, ax = plt.subplots(figsize=(20,8))
    st_df = df[df.zone == station][['date', 'total']]
    st_df = st_df.loc[(st_df.date >= pd.to_datetime(start_date)) & (st_df.date <= pd.to_datetime(end_date))]
    ax.plot(st_df.date, st_df.total, '.--')
    ax.set_title(station)
    plt.show();

In [17]:
#change data
data['total'] = data.iloc[:, 2:].sum(axis=1)
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].apply(lambda x: x.year)
data['day'] = data['date'].apply(lambda x: x.day)
data['hour'] = data['date'].apply(lambda x: x.hour)
data['day_date'] = data['date'].apply(lambda x: x.date())
data['day_date'] = pd.to_datetime(data['day_date'])
data['day_name'] = data['date'].apply(lambda x: x.day_name())
data['day_year'] = data['date'].apply(lambda x: x.dayofyear)                                  


In [6]:
stations = data['zone'].unique()

In [None]:
#plot_timeseries(data, stations[0], '20190105', '20190107')

In [18]:
def add_lag(df,column_name,lag_length):
  for lag in range(1,lag_length+1):
    df[column_name+'_'+str(lag)] = df[column_name].shift(lag)
  return df

#amount - количество дней/часов в году..
def encode_time(df, column_name, amount):
  df[column_name+'_s'] = df[column_name].apply(lambda x: np.sin((2*np.pi)/amount*x))
  df[column_name+'_c'] = df[column_name].apply(lambda x: np.cos((2*np.pi)/amount*x))
  return df

def moving_average(df, column_name, moving_length):
  df[column_name+'_ma_'+str(moving_length)] = df[column_name].rolling(window=moving_length).mean().shift(1)
  return df

def gaussian_average(df, column_name, moving_length):
  df[column_name+'_ga_'+str(moving_length)] = df[column_name].rolling(window=5, win_type='gaussian').mean(std=0.5).shift(1)
  return df

def exp_average(df, column_name):
  df[column_name+'_ea_'] = df[column_name].ewm(com=0.5).mean().shift(1)
  return df

def day_off(day):
  if day in ['Sunday','Saturday']:
    return 1
  else:
    return 0

def salary_day(day):
  if ((day>24) or (day<6)):
    return 1
  else:
    return 0

In [None]:
data

In [106]:
column_name = 'Перевозка плановая'
station = stations[0]
train_df = data[['date','year','day_year','day','hour','zone','day_name',column_name]].query('zone == @station')

# Preprocessing

In [107]:
moving_length = [6,12,24,48]
moving_column_names = []

train_df = encode_time(train_df,'hour',24)
train_df = encode_time(train_df,'day',30)
train_df = encode_time(train_df,'day_year',365)

train_df = pd.concat([train_df,pd.get_dummies(train_df['year'])],axis=1)
train_df = pd.concat([train_df,pd.get_dummies(train_df['day_name'])],axis=1)
train_df['day_off'] = train_df['day_name'].apply(day_off)
train_df['salary_day'] = train_df['day'].apply(salary_day)

for i in moving_length:
  train_df = moving_average(train_df, column_name, i)
  moving_column_names.append(column_name+'_ma_'+str(i))
for i in moving_length:
  train_df = gaussian_average(train_df, column_name, i)
  moving_column_names.append(column_name+'_ga_'+str(i))
train_df = exp_average(train_df, column_name)
moving_column_names.append(column_name+'_ea_')
moving_column_names.append(column_name)

In [108]:
#Дифференциал для таргета + average
for diff_feature in moving_column_names:
  train_df[diff_feature+'_diff'] = train_df[diff_feature].diff().shift(1)

#Лаги для таргета + average
for lag_feature in moving_column_names:
  train_df = add_lag(train_df,lag_feature,100)

additional_lag_columns = ['salary_day','day_off']
for lag_feature in additional_lag_columns:
  train_df = add_lag(train_df,lag_feature,3)




In [109]:
train_df = train_df.dropna()
train_df = train_df.drop(columns = ['date','year','day_year','day','hour','zone','day_name'])

In [110]:
#Проверка
dts = []
for column in train_df.columns:
  dts.append(train_df[column].dtype)
pd.Series(dts).value_counts()

float64    1031
uint8         9
int64         3
dtype: int64

In [123]:
def preprocessing(train_df, column_name):
  moving_length = [6,12,24,48]
  moving_column_names = []

  train_df = encode_time(train_df,'hour',24)
  train_df = encode_time(train_df,'day',30)
  train_df = encode_time(train_df,'day_year',365)

  train_df = pd.concat([train_df,pd.get_dummies(train_df['year'])],axis=1)
  train_df = pd.concat([train_df,pd.get_dummies(train_df['day_name'])],axis=1)
  train_df['day_off'] = train_df['day_name'].apply(day_off)
  train_df['salary_day'] = train_df['day'].apply(salary_day)

  for i in moving_length:
    train_df = moving_average(train_df, column_name, i)
    moving_column_names.append(column_name+'_ma_'+str(i))
  for i in moving_length:
    train_df = gaussian_average(train_df, column_name, i)
    moving_column_names.append(column_name+'_ga_'+str(i))
  train_df = exp_average(train_df, column_name)
  moving_column_names.append(column_name+'_ea_')
  moving_column_names.append(column_name)

  #Дифференциал для таргета + average
  for diff_feature in moving_column_names:
    train_df[diff_feature+'_diff'] = train_df[diff_feature].diff().shift(1)

  #Лаги для таргета + average
  for lag_feature in moving_column_names:
    train_df = add_lag(train_df,lag_feature,100)

  additional_lag_columns = ['salary_day','day_off']
  for lag_feature in additional_lag_columns:
    train_df = add_lag(train_df,lag_feature,3)

  train_df = train_df.dropna()
  train_df = train_df.drop(columns = ['date','year','day_year','day','hour','zone','day_name'])

  return train_df

# LinReg

In [118]:
import sklearn
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [112]:
train_df = train_df.reset_index()
train_df = train_df.drop(columns = ['index'])
X = train_df.drop(columns = [column_name])
Y = train_df[column_name]

In [None]:
data

In [141]:
columns_to_test = ['Перевозка плановая']#,'Вызов специальной бригады','Перевозка экстренная']
stations_to_test = stations[0:3]

for column_name in tqdm(columns_to_test):
  for station in stations_to_test:
    train_df = data[['date','year','day_year','day','hour','zone','day_name',column_name]].query('zone == @station')
    train_df = preprocessing(train_df,column_name)

    train_df = train_df.reset_index()
    train_df = train_df.drop(columns = ['index'])
    X = train_df.drop(columns = [column_name])
    Y = train_df[column_name]

    lr_rmse_list = np.array([])
    tscv = TimeSeriesSplit()
    for train_index, test_index in tscv.split(X):
      X_train, X_test = X.loc[train_index], X.loc[test_index]
      y_train, y_test = Y.loc[train_index], Y.loc[test_index]
      
      # ТУТ УЧИМ И ТЕСТИМ
      #lr_model = LinearRegression()
      lr_model = Ridge(alpha=1.0)
      lr_model.fit(X_train,y_train)
      
      predicted = lr_model.predict(X_test).round()
      lr_rmse = mean_absolute_error(y_test,predicted)
      lr_rmse_list = np.append(lr_rmse_list,lr_rmse)
      
      #print('MAE = %.2f' % lr_rmse)
    print('\n',pd.Series(predicted).value_counts())
    print(lr_rmse_list)
    print('MAE ', column_name,' ', station,' - ', lr_rmse_list.mean())
    print('All 0', mean_absolute_error(y_test,np.zeros(len(y_test))),'\n')

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


 1.0    973
3.0    766
2.0    698
4.0    279
0.0    171
5.0     11
dtype: int64
[0.64182195 0.63940649 0.6252588  1.45169082 1.20324362]
MAE  Перевозка плановая   П/станция 1  -  0.9122843340234645
All 0 2.067287784679089 


 1.0    1724
2.0     775
0.0     383
3.0      16
dtype: int64
[0.27881297 0.30745342 0.37543133 0.7605245  1.08454106]
MAE  Перевозка плановая   П/станция 2  -  0.561352657004831
All 0 1.279848171152519 



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.31s/it]


 1.0    1830
0.0     974
2.0      94
dtype: int64
[0.45134576 0.48619738 0.5010352  0.63871636 0.71635611]
MAE  Перевозка плановая   П/станция 3  -  0.5587301587301587
All 0 0.8026224982746721 




