In [126]:
import seaborn as sns
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [96]:
df = pd.read_csv('data/train_bikes.csv')

df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Предобработка данных:

In [149]:
def get_supervised_array(df_origin):
    df = df_origin.copy()
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M')
    df['year'] = df.loc[:, 'datetime'].dt.year
    df['month'] = df.loc[:, 'datetime'].dt.month
    df['hour'] = df.loc[:, 'datetime'].dt.hour
    stroka = df.to_string(
        columns=['year','month','hour'], 
        col_space=0,
        header=False,
        index=False,
    )
    return np.array(re.sub(' +','-',stroka).split('\n'))

In [137]:
def get_date_features(df_origin):
    df = df_origin.copy()
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M')
    df['year'] = df.loc[:, 'datetime'].dt.year
    df['month'] = df.loc[:, 'datetime'].dt.month
    df['day'] = df.loc[:, 'datetime'].dt.day
    df['hour'] = df.loc[:, 'datetime'].dt.hour
    df['weekday'] = df.loc[:, 'datetime'].dt.weekday
    
    return df

In [100]:
def class_hour(df):
    df_tmp = df.copy()
    df_tmp['class_hour'] = df_tmp['hour'].map(
    lambda x:
        "night" if x == 23 or x <= 4 else 
        "morning" if x <= 8 and x > 4 else
        "rush_morning" if x <= 9 and x > 8 else
        "day" if x <= 16 and x > 9 else
        "rush_day" if x <= 19 and x > 16 else
        "evening" 

    )
    df_tmp = pd.get_dummies(df_tmp, columns=['class_hour'])
    return df_tmp

In [101]:
def class_weather(df):
    df_tmp = df.copy()
    df_tmp['weather'].replace([1,2,3,4], ['sunny', 'cloudy', 'mainly', 'bad'], inplace=True)
    df_tmp = pd.get_dummies(df_tmp, columns=['weather'])
    return df_tmp

In [102]:
def prepare_features(df, drop_cols=[], target_feature=''):
    df_tmp = df.copy()
    targets = ["casual", "registered", "count"]
# оставляем только один из целевых признаков (casual, registred или count)
    if target_feature in targets:
        y_target = df_tmp[target_feature]
        drop_cols.extend(targets)
# выделяем признаки для "времени дня", "месяца" и дня недели      
    df_clean = get_date_features(df)
# создаём новую переменную (утро, день, вечер, ночь).
    df_clean = class_hour(df_clean)
# представляем категориальные признаки weather виде вектора
    df_clean = class_weather(df_clean)
# Удаляем ненужные признаки (сильно скоррелированные и просто бесполковые, типа datetime, day)
    df_clean = df_clean.drop(drop_cols, axis=1)
    df_clean.head()
    
    return df_clean, y_target

# Getting hands dirty

In [150]:
y_label = 'count'
drop_cols = [
    "day",
    "season",
    "atemp",
    "humidity",
    "datetime",
    "hour",
]
year_month_hour = get_supervised_array(df)
year_month_hour[:5]    

array(['2011-1-0', '2011-1-1', '2011-1-2', '2011-1-3', '2011-1-4'],
      dtype='<U10')

In [138]:
df_clean, y_target = prepare_features(df, drop_cols,y_label)
df_clean.head()

Unnamed: 0,holiday,workingday,temp,windspeed,year,month,weekday,class_hour_day,class_hour_evening,class_hour_morning,class_hour_night,class_hour_rush_day,class_hour_rush_morning,weather_bad,weather_cloudy,weather_mainly,weather_sunny
0,0,0,9.84,0.0,2011,1,5,0,0,0,1,0,0,0,0,0,1
1,0,0,9.02,0.0,2011,1,5,0,0,0,1,0,0,0,0,0,1
2,0,0,9.02,0.0,2011,1,5,0,0,0,1,0,0,0,0,0,1
3,0,0,9.84,0.0,2011,1,5,0,0,0,1,0,0,0,0,0,1
4,0,0,9.84,0.0,2011,1,5,0,0,0,1,0,0,0,0,0,1


In [139]:
y_target.head()

0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

## Baseline modelling

In [140]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold

In [141]:
# alpha - это общая сила регуляризации
# l1_ration - доля, которая приходтися на Lasso регуляризацию. Соответственно остальное идет на Rigde
regressor = ElasticNet(alpha=1.0, l1_ratio=0.5)

In [251]:
# преобразовываем признаки в вектора
onehotencoder = OneHotEncoder(categorical_features=[5,6])

In [252]:
# нормализовываем признаки
standart = StandardScaler(with_mean=False)

In [253]:
# строим модель
model = make_pipeline(
    onehotencoder, 
    standart, 
    regressor)

In [254]:
def rlmse_score(y_test, y_hat):
    # Your code here
    if len(y_hat) != len(y_test):
        print("Error!")
        return -1
    N = len(y_hat)
    summ = 0
    for idx in range(0, N):
        summ += (np.log10(y_test[idx] + 1) - np.log10(y_hat[idx] + 1))**2
    rlmse = np.sqrt(summ/N)
    return rlmse

# Эту функцию трогать не надо
def rlmse_scorer(estimator, X, y):
    y_hat = estimator.predict(X)
    
    return rlmse_score(y, y_hat)

In [255]:
rlmse_score(
    y_test=[0.75, 0.86, -0.1],
    y_hat=[0.70, 0.81, -0.4]
)

0.10215457232396205

In [256]:
from sklearn.model_selection import validation_curve

In [257]:
# Это не обычная кросс-валидация а стратифицированная. №
# С ее помощью мы добъемся того, что в обучающем и валидационном фолде будет одна и та же пропорция годов-месяцев-часов
# Тут то и понадобится нам year_month_hour
# Подробности http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
n_folds = 5
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=123)

# Переберем 20 значений в интервале от 10^-3 до 10^5
alpha_range = np.logspace(-3, 5, 20)

param_name = "elasticnet__alpha"
# пишем не просто alpha, потому что у нас pipeline и sklearn должен как-то понимать параметр какого этапа мы перебираем
# cv=cv.split(df_clean, year_month_hour)
# model.fit(df_clean, y_target)
# regressor.get_params().keys()
y = y_target.as_matrix()
train_scores, valid_scores = validation_curve(model, df_clean, y, param_name=param_name, param_range=alpha_range,
                                               cv=cv.split(df_clean, year_month_hour), scoring=rlmse_scorer)

# На выходе мы получим 2 матрицы размера 20 на n_folds
# Потому что у нас было 5 фолдов и 20 возможных значений гиперпараметра
# А матриц 2, потому что в каждом фолде есть обучающая и валидационная часть

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ 

In [258]:
train_scores

array([[       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [0.61255784, 0.61509441, 0.61447332, 0.61585648, 0.61275259],
       [0.65233438, 0.65512875, 0.65401082, 0.65579393, 0.65304881],
       [0.67143021, 0.67429596, 0.6728541 , 0.67477624, 0.67237406],
       [0.6785347 , 0.6814673 , 0.67992121, 0.68192433, 0.67950808],
       [0.67974334, 0.6827429 , 0.68111333, 0.68316628, 0.68072907],
       [0.67974334, 0.6827429 , 0.

In [259]:
valid_scores

array([[       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan,        nan],
       [0.61717473, 0.61550832, 0.61129647, 0.60924085, 0.61982484],
       [0.6575558 , 0.65630639, 0.65126681, 0.64893454, 0.65749037],
       [0.67682499, 0.67567598, 0.67031039, 0.66786279, 0.67576711],
       [0.68398813, 0.68276716, 0.677445  , 0.67491421, 0.68291659],
       [0.6852085 , 0.68401454, 0.67865153, 0.67613449, 0.68416093],
       [0.6852085 , 0.68401454, 0.

In [261]:
# стандартное отклонение
np.std(train_scores, axis=1)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.00129585,
       0.00127682, 0.0012333 , 0.00125569, 0.00127679, 0.00127679,
       0.00127679, 0.00127679, 0.00127679, 0.00127679, 0.00127679])

In [262]:
# стандартное отклонение
np.std(valid_scores, axis=1)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.00385711,
       0.00354379, 0.00353999, 0.00356755, 0.00357685, 0.00357685,
       0.00357685, 0.00357685, 0.00357685, 0.00357685, 0.00357685])

In [263]:
# среднее значение 
np.mean(train_scores, axis=1)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.61414693,
       0.65406334, 0.67314611, 0.68027112, 0.68149898, 0.68149898,
       0.68149898, 0.68149898, 0.68149898, 0.68149898, 0.68149898])

In [265]:
# среднее значение 
np.mean(valid_scores, axis=1)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.61460904,
       0.65431078, 0.67328825, 0.68040622, 0.681634  , 0.681634  ,
       0.681634  , 0.681634  , 0.681634  , 0.681634  , 0.681634  ])