In [406]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from time import time
from sklearn.metrics import mean_absolute_error,mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PowerTransformer

In [407]:
data = pd.read_csv('data/london_merged.csv')
data.head(1)
np.random.seed(0)


In [408]:
data.corr(method='pearson')
data['new']= data['t1'] * data['t2']
data.corr(method='pearson')

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,new
cnt,1.0,0.388798,0.369035,-0.462901,0.116295,-0.166633,-0.051698,-0.096499,-0.11618,0.406547
t1,0.388798,1.0,0.988344,-0.447781,0.145471,-0.097114,-0.042233,-0.005342,-0.285851,0.965622
t2,0.369035,0.988344,1.0,-0.403495,0.088409,-0.098385,-0.040051,-0.00851,-0.2859,0.946437
hum,-0.462901,-0.447781,-0.403495,1.0,-0.287789,0.33475,0.032068,0.028098,0.290381,-0.465079
wind_speed,0.116295,0.145471,0.088409,-0.287789,1.0,0.124803,-0.002606,0.011479,0.010305,0.092746
weather_code,-0.166633,-0.097114,-0.098385,0.33475,0.124803,1.0,0.012939,0.042362,0.098976,-0.123936
is_holiday,-0.051698,-0.042233,-0.040051,0.032068,-0.002606,0.012939,1.0,-0.094898,-0.032488,-0.047011
is_weekend,-0.096499,-0.005342,-0.00851,0.028098,0.011479,0.042362,-0.094898,1.0,0.001067,-0.007548
season,-0.11618,-0.285851,-0.2859,0.290381,0.010305,0.098976,-0.032488,0.001067,1.0,-0.245166
new,0.406547,0.965622,0.946437,-0.465079,0.092746,-0.123936,-0.047011,-0.007548,-0.245166,1.0


In [409]:

data['months'] = data['timestamp'].apply(lambda x: int(x.split('-')[1]))
data['hours'] = data['timestamp'].apply(lambda x: int(x.split()[1][:2]))
data['years'] = data['timestamp'].apply(lambda x: int(x[:4]))
data.drop('timestamp',axis=1,inplace=True)


In [410]:
data.corr(method='spearman')

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,new,months,hours,years
cnt,1.0,0.392384,0.386731,-0.504613,0.181434,-0.119776,-0.051904,-0.067534,-0.107223,0.388526,0.068387,0.468937,0.004637
t1,0.392384,1.0,0.995809,-0.380248,0.159299,-0.059792,-0.04137,-0.004992,-0.279427,0.997986,0.345724,0.154931,-0.047602
t2,0.386731,0.995809,1.0,-0.360955,0.107333,-0.066359,-0.041606,-0.004383,-0.280571,0.999457,0.357718,0.148514,-0.04391
hum,-0.504613,-0.380248,-0.360955,1.0,-0.319598,0.365801,0.032024,0.028362,0.272071,-0.366404,0.11804,-0.294712,0.073413
wind_speed,0.181434,0.159299,0.107333,-0.319598,1.0,0.160982,-0.008737,0.003751,-0.009141,0.122413,-0.078165,0.150897,-0.092412
weather_code,-0.119776,-0.059792,-0.066359,0.365801,0.160982,1.0,0.005594,0.049169,0.116525,-0.064383,0.009558,-0.034016,-0.009785
is_holiday,-0.051904,-0.04137,-0.041606,0.032024,-0.008737,0.005594,1.0,-0.094898,-0.032908,-0.041633,-0.011535,-0.000288,0.026368
is_weekend,-0.067534,-0.004992,-0.004383,0.028362,0.003751,0.049169,-0.094898,1.0,0.00106,-0.005069,-0.005373,0.001803,0.002649
season,-0.107223,-0.279427,-0.280571,0.272071,-0.009141,0.116525,-0.032908,0.00106,1.0,-0.280595,0.208029,4e-06,0.019117
new,0.388526,0.997986,0.999457,-0.366404,0.122413,-0.064383,-0.041633,-0.005069,-0.280595,1.0,0.35485,0.150291,-0.044942


In [411]:
def augmentation(data):
    synt_data = data.copy()
    for month in synt_data['months'].unique():

        t1_std = synt_data[synt_data['months']==month]['t1'].std()
        t2_std = synt_data[synt_data['months']==month]['t2'].std()
        hum_std = synt_data[synt_data['months']==month]['hum'].std()
        wind_std = synt_data[synt_data['months']==month]['wind_speed'].std()

        for i in synt_data[synt_data['months']==month].index:
            if np.random.randint(2) == 1:
                synt_data['t1'].values[i] += t1_std/8
            else:
                synt_data['t1'].values[i] -= t1_std/8
            
            if np.random.randint(2) == 1:
                synt_data['t2'].values[i] += t2_std/8
            else:
                synt_data['t2'].values[i] -= t2_std/8

            if np.random.randint(2) == 1:
                synt_data['hum'].values[i] += hum_std/8
            else:
                synt_data['hum'].values[i] -= hum_std/8

            if np.random.randint(2) == 1:
                synt_data['wind_speed'].values[i] += wind_std/8
            else:
                synt_data['wind_speed'].values[i] -= wind_std/8
    return synt_data
        

In [412]:
# def augmentation(data):
#     synt_data = data.copy()
#     for season in synt_data['season'].unique():

#         t1_std = synt_data[synt_data['season']==season]['t1'].std()
#         t2_std = synt_data[synt_data['season']==season]['t2'].std()
#         hum_std = synt_data[synt_data['season']==season]['hum'].std()
#         wind_std = synt_data[synt_data['season']==season]['wind_speed'].std()

#         for i in synt_data[synt_data['season']==season].index:
#             if np.random.randint(2) == 1:
#                 synt_data['t1'].values[i] += t1_std/10
#             else:
#                 synt_data['t1'].values[i] -= t1_std/10
            
#             if np.random.randint(2) == 1:
#                 synt_data['t2'].values[i] += t2_std/10
#             else:
#                 synt_data['t2'].values[i] -= t2_std/10

#             if np.random.randint(2) == 1:
#                 synt_data['hum'].values[i] += hum_std/10
#             else:
#                 synt_data['hum'].values[i] -= hum_std/10

#             if np.random.randint(2) == 1:
#                 synt_data['wind_speed'].values[i] += wind_std/10
#             else:
#                 synt_data['wind_speed'].values[i] -= wind_std/10
#     return synt_data
        

In [413]:
augmented = augmentation(data)


In [414]:
x,y = data.drop('cnt',axis=1),data['cnt']

In [415]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=0)
extra_sample = augmented.sample(augmented.shape[0]//3)
x_train = pd.concat((x_train,extra_sample.drop('cnt',axis=1)),axis=0)
y_train = pd.concat((y_train,extra_sample['cnt']),axis=0)
transformer = PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_test = transformer.transform(y_test.values.reshape(-1,1))


In [416]:
x_train

Unnamed: 0,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,new,months,hours,years
4493,26.000000,25.000000,25.000000,18.000000,1.0,0.0,0.0,1.0,650.00,7,16,2015
14407,16.500000,16.500000,66.000000,5.000000,1.0,0.0,0.0,1.0,272.25,8,2,2016
7862,11.000000,11.000000,72.000000,27.000000,7.0,0.0,1.0,2.0,121.00,11,6,2015
2627,11.500000,11.500000,69.500000,17.000000,1.0,0.0,0.0,0.0,132.25,4,19,2015
1764,9.000000,6.000000,66.000000,18.000000,2.0,0.0,0.0,0.0,54.00,3,14,2015
...,...,...,...,...,...,...,...,...,...,...,...,...
11029,10.076808,8.457725,48.529195,23.922700,1.0,0.0,1.0,0.0,94.50,4,10,2016
7647,9.973347,8.117360,70.283474,17.096995,2.0,0.0,0.0,2.0,71.25,11,7,2015
3182,13.593837,13.536417,51.446218,15.112036,1.0,0.0,1.0,0.0,196.00,5,22,2015
11044,10.423192,7.457725,83.970805,16.922700,2.0,0.0,0.0,0.0,80.00,4,1,2016


In [417]:
num_cols = ['t1','t2','hum','wind_speed','new']
cat_cols = ['weather_code','is_holiday','is_weekend','season','months','years','hours']
num_imputer = SimpleImputer(strategy='constant',fill_value=-999)
cat_imputer =SimpleImputer(strategy='constant',fill_value='missing value')
cat_encoder = OrdinalEncoder(handle_unknown='ignore')
num_pipe = Pipeline([('num_imputer',num_imputer)])
cat_pipe = Pipeline([('cat_imputer',cat_imputer),('cat_encoder',cat_encoder)])

prep_pro = ColumnTransformer([('numerical',num_pipe,num_cols),('categorical',cat_pipe,cat_cols)],remainder='drop')


In [418]:
trees = {
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Extra Trees': ExtraTreesRegressor(),
        'Ada Boost': AdaBoostRegressor(),
        'XGB Regressor': XGBRegressor(),
        'LGBM Regressor': LGBMRegressor(),
        'CatBoost Regressor': CatBoostRegressor(verbose=False),
        'Sklearn Gradient Boost' : GradientBoostingRegressor()
}

pipes = {name: make_pipeline(prep_pro,model) for name,model in trees.items()}
results=[]
for name,model in trees.items():
        start = time()
        model.fit(x_train,y_train)
        tot_time = time() - start

        pred = model.predict(x_test)

        results.append({
                'name': name,
                'MSE':  mean_squared_error(y_test,pred),
                'MAE': mean_absolute_error(y_test,pred),
                'fitting time': tot_time
        })



  model.fit(x_train,y_train)
  model.fit(x_train,y_train)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [419]:
dfff= pd.DataFrame(results)
dfff.sort_values(by='MSE')

Unnamed: 0,name,MSE,MAE,fitting time
6,CatBoost Regressor,0.026682,0.114985,2.824519
2,Extra Trees,0.027432,0.105648,3.436988
4,XGB Regressor,0.029014,0.120842,0.580146
1,Random Forest,0.030483,0.116254,6.069911
5,LGBM Regressor,0.031957,0.124807,0.125084
0,Decision Tree,0.060076,0.148063,0.096593
7,Sklearn Gradient Boost,0.078116,0.201859,1.656924
3,Ada Boost,0.217791,0.379985,0.838897


In [420]:
dfff= pd.DataFrame(results)
dfff.sort_values(by='MSE')

Unnamed: 0,name,MSE,MAE,fitting time
6,CatBoost Regressor,0.026682,0.114985,2.824519
2,Extra Trees,0.027432,0.105648,3.436988
4,XGB Regressor,0.029014,0.120842,0.580146
1,Random Forest,0.030483,0.116254,6.069911
5,LGBM Regressor,0.031957,0.124807,0.125084
0,Decision Tree,0.060076,0.148063,0.096593
7,Sklearn Gradient Boost,0.078116,0.201859,1.656924
3,Ada Boost,0.217791,0.379985,0.838897
