In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import missingno as msno
from datetime import *
import warnings
import calendar
from scipy.stats import kurtosis, skew
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.svm import SVR

In [3]:
#Some default functions to ease the data exploration
def int_min(param):
    return np.iinfo(param).min

def int_max(param):
    return np.iinfo(param).max

def flo_min(param):
    return np.finfo(param).min

def flo_max(param):
    return np.finfo(param).max

def decrease_mem(in_table, verbose = True):
    data_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    initial_mem_val = in_table.memory_usage().sum() / 2**20 #Calculate memory usage of complete column 
                                                            #and dividng it by 2^20 to convert it into Mb    
    for c in in_table.columns:
        column_type = in_table[c].dtypes
        if column_type in data_types:
            min_val = in_table[c].min()
            max_val = in_table[c].max()
            if str(column_type)[:3] == 'int':
                if min_val > int_min(np.int8) and max_val < int_max(np.int8): #calling another fn int_min & int_max
                    in_table[c] = in_table[c].astype(np.int8)
                elif min_val > int_min(np.int16) and max_val < int_max(np.int16):
                    in_table[c] = in_table[c].astype(np.int16)
                elif min_val > int_min(np.int32) and max_val < int_max(np.int32):
                    in_table[c] = in_table[c].astype(np.int32)
                elif min_val > int_min(np.int64) and max_val < int_max(np.int64):
                    in_table[c] = in_table[c].astype(np.int64)  
            elif min_val > flo_min(np.float16) and max_val < flo_max(np.float16):
                    in_table[c] = in_table[c].astype(np.float16)
            elif min_val > flo_min(np.float32) and max_val < flo_max(np.float32):
                    in_table[c] = in_table[c].astype(np.float32)
            else: in_table[c] = in_table[c].astype(np.float64)
                
    mem_aftr_ops = in_table.memory_usage().sum() / 2**20
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.
          format(mem_aftr_ops, ((initial_mem_val - mem_aftr_ops) / initial_mem_val)* 100))
    return in_table

def EDA(df_table):
    summ = pd.DataFrame(df_table.dtypes, columns=['Data Type'])
    summ = summ.reset_index()
    summ['Name'] = summ['index']
    summ = summ[['Name', 'Data Type']]
    summ['Missing Values'] = df_table.isnull().sum().values
    summ['Unique'] = df_table.nunique().values
    summ['First Value'] = df_table.loc[0].values
    summ['Second Value'] = df_table.loc[1].values
    summ['Third Value'] = df_table.loc[2].values
    summ['Minimum Value'] = df_table.min().values
    summ['Maximum Value'] = df_table.max().values
    #summary['Uniques'] = df.nunique().values
    
    for name in summ['Name'].value_counts().index:
        summ.loc[summ['Name'] == name, 'Entropy'] = round(stats.entropy(df_table[name].value_counts(normalize=True), 
                                                                        base=2),2)
    return summ

def plot_stack(column_1, column_2):
    plot_stck=pd.crosstab(index=column_1, columns=column_2)
    plot_stck.plot(kind='bar', figsize=(8,8), stacked=True)
    return

In [4]:
#reading the dataset 'mod_data' for model training

mod_data = pd.read_csv(r"D:\OneDrive - Queen's University\ECE\Statistical Learning\Midterm\mod_data.csv")
mod_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,month,weekday,day,year,peak_time,weekoff_count,best_condition,not_fav,sine_hr,cos_hr
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.4,81,6.986063,3,...,1,Saturday,1,2011,0,0,0,0,0.0,1.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.63,80,6.845595,8,...,1,Saturday,1,2011,0,0,0,0,0.258819,0.965926
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.63,80,6.845595,5,...,1,Saturday,1,2011,0,0,0,0,0.5,0.866025
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.4,75,6.672821,3,...,1,Saturday,1,2011,0,0,0,0,0.707107,0.707107
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.4,75,6.672821,0,...,1,Saturday,1,2011,0,0,0,0,0.866025,0.5


# Feature Selection

### I have tried to get an brief idea of what would be the best features for training the model. For that I have used sklearn feature selection library.

In [304]:
# Performing feature selection using Sklearn's feature selection library. Based on this, the best features are 
# ['season', 'temp', 'humidity', 'month', 'year', 'peak_time', 'weekoff_count', 'best_condition', 'sine_hr', 'cos_hr']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
from dataframe_column_identifier import DataFrameColumnIdentifier
dfci = DataFrameColumnIdentifier()

fs_cols = ['season', 'holiday', 'workingday', 'weather', 'temp',
           'humidity', 'windspeed', 'month', 'year', 'peak_time',
           'weekoff_count', 'best_condition', 'not_fav', 'sine_hr', 'cos_hr']

fs_label = np.array(mod_data['count'])

for i in range(1, 11):
    kbest = SelectKBest(score_func = mutual_info_regression, k = i)
    X_clf_new = kbest.fit_transform(mod_data[fs_cols], fs_label)
    kbest_get_support_output = kbest.get_support()

    print(dfci.select_columns_KBest(one_data[fs_cols], kbest_get_support_output, verbose=1))

1 - Feature selected: cos_hr
['cos_hr']
1 - Feature selected: sine_hr
2 - Feature selected: cos_hr
['sine_hr', 'cos_hr']
1 - Feature selected: peak_time
2 - Feature selected: sine_hr
3 - Feature selected: cos_hr
['peak_time', 'sine_hr', 'cos_hr']
1 - Feature selected: temp
2 - Feature selected: peak_time
3 - Feature selected: sine_hr
4 - Feature selected: cos_hr
['temp', 'peak_time', 'sine_hr', 'cos_hr']
1 - Feature selected: temp
2 - Feature selected: humidity
3 - Feature selected: peak_time
4 - Feature selected: sine_hr
5 - Feature selected: cos_hr
['temp', 'humidity', 'peak_time', 'sine_hr', 'cos_hr']
1 - Feature selected: temp
2 - Feature selected: humidity
3 - Feature selected: month
4 - Feature selected: peak_time
5 - Feature selected: sine_hr
6 - Feature selected: cos_hr
['temp', 'humidity', 'month', 'peak_time', 'sine_hr', 'cos_hr']
1 - Feature selected: temp
2 - Feature selected: humidity
3 - Feature selected: month
4 - Feature selected: peak_time
5 - Feature selected: best_co

## Modelling

For modelling, I have first trained the models on the complete training dataset (without considering only historical data). The reason for doing so was to get a clear picture of the accuracy of the models. Along with it, it was easier for performing hyperparameter tuning for different models. After selecting the best parameters for different models, I trained my models on the historical data only (as per the rules of kaggle competition). Then I tried to fine tune my hyperparameters using the historical data. However, as anticipated, it was computationally very expensive and it gave me memory error every time. So, I used the earlier best parameters only! 

In [5]:
for col in ['casual', 'registered', 'count']:
    mod_data['%s_log' % col] = np.log(mod_data[col] + 1) #transforming dependent variables with log 
    # 1 has been added to avoid infinity values at zero

In [164]:
mod_data.columns #checking columns of mod_data

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'date', 'hour', 'month', 'weekday', 'year', 'peak_time',
       'weekoff_count', 'best_condition', 'not_fav', 'sine_hr', 'cos_hr',
       'casual_log', 'registered_log', 'count_log', 'day'],
      dtype='object')

In [6]:
#basic functions to prepare data and predict models

# error calculation function 
def get_rmsle(y_pred, y_actual):
    diff = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

# function for getting the mod_data dataframe
def get_data():
    data = mod_data[mod_data['count'] != 0].copy()
    return data

# splitting the data into train and validation set. # here we have used the cuttoff day as 15 instead of randomly splitting 
# the data. so, training data would be from day 1 to day 15, while the validation data would be day 16 onwards.
def custom_train_test_split(data, cutoff_day=15):
    train = data[data['day'] <= cutoff_day]
    test = data[data['day'] > cutoff_day]
    
    return train, test


# we would be predicting casual and registered values seperately, hence, preparing the labels for both.
def prep_data(data, input_cols):
    X = data[input_cols]
    y_r = np.array(data['registered_log'])
    y_c = np.array(data['casual_log'])
    
    return X, y_r, y_c


# function to prepare and fit model and calculate error!
def predict(input_cols, model_params={}):
    
    data = get_data()
    train, test = custom_train_test_split(data)
    X_train, y_train_r, y_train_c = prep_data(train, input_cols)
    X_test, y_test_r, y_test_c = prep_data(test, input_cols)
    model_params.update({
        'n_jobs': -1,
        'random_state': 123,
    })
    model = RandomForestRegressor(**model_params)
    model_r = model.fit(X_train, y_train_r)
    y_pred_r = np.exp(model_r.predict(X_test)) - 1
    model_c = model.fit(X_train, y_train_c)
    y_pred_c = np.exp(model_c.predict(X_test)) - 1
    y_pred_comb = np.round(y_pred_r + y_pred_c)
    y_pred_comb[y_pred_comb < 0] = 0
    y_test_comb = np.exp(y_test_r) + np.exp(y_test_c) - 2
    score = get_rmsle(y_pred_comb, y_test_comb)
    return (y_pred_comb, y_test_comb, score)

# function to prepare and fit model and calculate error!
def predict_on_validation_set(model, input_cols):
    data = get_data()

    train, test = custom_train_test_split(data)

    X_train, y_train_r, y_train_c = prep_data(train, input_cols)
    X_test, y_test_r, y_test_c = prep_data(test, input_cols)

    model_r = model.fit(X_train, y_train_r)
    y_pred_r = np.exp(model_r.predict(X_test)) - 1

    model_c = model.fit(X_train, y_train_c)
    y_pred_c = np.exp(model_c.predict(X_test)) - 1

    y_pred_comb = np.round(y_pred_r + y_pred_c)
    y_pred_comb[y_pred_comb < 0] = 0

    y_test_comb = np.exp(y_test_r) + np.exp(y_test_c) - 2

    score = get_rmsle(y_pred_comb, y_test_comb)
    return (y_pred_comb, y_test_comb, score)

df_test = mod_data[mod_data['count'] == 0].copy()

# predict on test set & transform output back from log scale
def predict_on_test_set(model, x_cols):
    # prepare training set
    df_train = mod_data[mod_data['count'] != 0].copy()
    X_train = df_train[x_cols]
    y_train_cas = np.array(df_train['casual_log'])
    y_train_reg = np.array(df_train['registered_log'])

    # prepare test set
    X_test = df_test[x_cols]

    casual_model = model.fit(X_train, y_train_cas)
    y_pred_cas = casual_model.predict(X_test)
    y_pred_cas = np.exp(y_pred_cas) - 1
    registered_model = model.fit(X_train, y_train_reg)
    y_pred_reg = registered_model.predict(X_test)
    y_pred_reg = np.exp(y_pred_reg) - 1
    # add casual & registered predictions together
    return y_pred_cas + y_pred_reg

#### Here I tried to perform the forward approach to determine best features

In [168]:
base_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'holiday', 'workingday', 'season', 'sine_hr', 'cos_hr']

try_cols = ['month', 'year', 'peak_time', 'weekoff_count', 'best_condition', 'not_fav']

for i in range(0, len(try_cols) + 1):
    new_cols = try_cols[:i]
    all_cols = base_cols + new_cols
    print('cols: base_cols + {}\nrmse: {}\n'.format(
        new_cols, 
        predict(all_cols)
    ))
    
# best_parameters selected = base_cols + ['month', 'year', 'peak_time', 'weekoff_count']



cols: base_cols + []
rmse: 0.46763734753166936





cols: base_cols + ['month']
rmse: 0.4585166613826167





cols: base_cols + ['month', 'year']
rmse: 0.3516819428964412





cols: base_cols + ['month', 'year', 'peak_time']
rmse: 0.3588887050658396





cols: base_cols + ['month', 'year', 'peak_time', 'weekoff_count']
rmse: 0.35514102551051635





cols: base_cols + ['month', 'year', 'peak_time', 'weekoff_count', 'best_condition']
rmse: 0.3567468954563939





cols: base_cols + ['month', 'year', 'peak_time', 'weekoff_count', 'best_condition', 'not_fav']
rmse: 0.35595848478952036



# Grid Search for hyperparameter tuning

In [171]:
#First, I tried to fit a randomforestRegressor with default parameters. Got rmsle score of 0.46657.
params = {'n_estimators': 1000, 'max_depth': 15, 'random_state': 0, 'min_samples_split' : 5, 'n_jobs': -1} # random parameters
rf_model = RandomForestRegressor(**params)
rf_cols = [
    'weather', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'not_fav',
    'hour', 'peak_time'
    ]

(rf_p, rf_t, rf_score) = predict_on_validation_set(rf_model, rf_cols)
print(rf_score)

0.46657138733119374


### Random Forest Parameter tuning

In [189]:
x_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed',
          'holiday', 'workingday', 'season', 'hour', 'year', 'best_condition', 'not_fav']

rf_model = RandomForestRegressor()
# random forest param grid
n_estimators = [500, 1000, 1500]
min_samples_splits = [6, 8, 10, 12, 14]

best_score, best_params = np.inf, None

# loop through param grid & find top performer
for ne in n_estimators:
    for mss in min_samples_splits:
        params = {'n_estimators': ne, 'min_samples_split': mss}
        (rf_p, rf_t, score) = predict(x_cols, params)
        print('trees: {}, mss: {}, rmse: {}'.format(ne, mss, score))
        
        if score < best_score:
            best_params = params
            best_score = score
            
print('best params: {}, rmse: {}'.format(best_params, best_score))

trees: 500, mss: 6, rmse: 0.3624808530696919
trees: 500, mss: 8, rmse: 0.3607852450093484
trees: 500, mss: 10, rmse: 0.35980365347162535
trees: 500, mss: 12, rmse: 0.3588912529302141
trees: 500, mss: 14, rmse: 0.3589586589712074
trees: 1000, mss: 6, rmse: 0.36250783382492097
trees: 1000, mss: 8, rmse: 0.3609776060495743
trees: 1000, mss: 10, rmse: 0.3596985492605452
trees: 1000, mss: 12, rmse: 0.3583742993569411
trees: 1000, mss: 14, rmse: 0.35841353790238156
trees: 1500, mss: 6, rmse: 0.36178199926268795
trees: 1500, mss: 8, rmse: 0.36040295293113433
trees: 1500, mss: 10, rmse: 0.3595663963215969
trees: 1500, mss: 12, rmse: 0.35845279369155203
trees: 1500, mss: 14, rmse: 0.3583228852913084
best params: {'n_estimators': 1500, 'min_samples_split': 14, 'n_jobs': -1, 'random_state': 123}, rmse: 0.3583228852913084


In [190]:
# training with the best parameters. Hyperparameter tuning paid! the error has reduced significantly!

params = {'n_estimators': 1500, 'max_depth': 15, 'random_state': 123, 'min_samples_split' : 14, 'n_jobs': -1}
rf_model = RandomForestRegressor(**params)
rf_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed',
           'holiday', 'workingday', 'season', 'hour', 'year',
           'best_condition', 'not_fav']

(rf_p, rf_t, rf_score) = predict_on_validation_set(rf_model, rf_cols)
print(rf_score)

0.3581244629806633


### Gradient Boosting Parameter Tuning

#### Training Gradient Boosting Regressor with default parameters. Combining with the output of Random Forest Regressor!

In [174]:
# Training with default parameters!
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7, 'loss': 'ls'}
gbm_model = GradientBoostingRegressor(**params)
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition']

(gbm_p, gbm_t, gbm_score) = predict_on_validation_set(gbm_model, gbm_cols)
print(gbm_score)

0.3311940527956884


#### By combining the predictions of random forest and gradient boosting (20-80%), 
## I got Kaggle Score of 0.38024!

### However, as mentioned earlier, I have used the complete training dataset for training the model. :(

In [175]:
rf_pred = predict_on_test_set(rf_model, rf_cols)
gbm_pred = predict_on_test_set(gbm_model, gbm_cols)
y_pred = np.round(.2*rf_pred + .8*gbm_pred)
# output predictions for submission
df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit4.csv', index=False) # kaggle score 0.38024!

#### Let's see how does the GradientBoostingRegressor do with default parameters on kaggle (without combining it with randomforest)

In [181]:
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7, 'loss': 'ls'}
gbm_model = GradientBoostingRegressor(**params)
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition', 'not_fav']

(gbm_p, gbm_t, gbm_score) = predict(gbm_model, gbm_cols)
print(gbm_score)

0.3339104996093259


## I got a score of 0.38889 on Kaggle with default parameters of GBM.
#### This implies that we have done good feature engineering. Let's fine tune the model.

In [182]:
# rf_pred = predict_on_test_set(rf_model, rf_cols)
gbm_pred = predict_on_test_set(gbm_model, gbm_cols)
y_pred = np.round(gbm_pred)
# output predictions for submission
df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit5.csv', index=False) # kaggle score 0.38889

### Parameter Tuning for GBM

In [194]:
def predict_gbm(input_cols, model_params={}):
    
    data = get_data()
    
    train, test = custom_train_test_split(data)
    
    X_train, y_train_r, y_train_c = prep_data(train, input_cols)
    X_test, y_test_r, y_test_c = prep_data(test, input_cols)

    model_params.update({'random_state': 123})
    model = GradientBoostingRegressor(**model_params)
    
    model_r = model.fit(X_train, y_train_r)
    y_pred_r = np.exp(model_r.predict(X_test)) - 1
    
    model_c = model.fit(X_train, y_train_c)
    y_pred_c = np.exp(model_c.predict(X_test)) - 1

    y_pred_comb = np.round(y_pred_r + y_pred_c)
    y_pred_comb[y_pred_comb < 0] = 0
    
    y_test_comb = np.exp(y_test_r) + np.exp(y_test_c) - 2

    score = get_rmsle(y_pred_comb, y_test_comb)
    return (y_pred_comb, y_test_comb, score)

In [195]:
#Performing hyperparameter tuning of gradient boosting regression. 
# best parameters came out to be: best params: 
# {'n_estimators': 500, 'min_samples_leaf': 8, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 123}

x_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition', 'not_fav']

rf_model = GradientBoostingRegressor()
# gradient boosting param grid
n_estimators = [150, 200, 500, 1000, 1500, 1800]
min_samples_leaf = [6, 8, 10, 12, 14]
learning_rate = [0.1, 0.01]
subsample = [0.5, 0.6, 0.7, 0.8]

best_score, best_params = np.inf, None

# loop through param grid & find top performer
for ne in n_estimators:
    for mss in min_samples_leaf:
        for lr in learning_rate:
            for sub in subsample:
                params = {'n_estimators': ne, 'min_samples_leaf': mss, 'learning_rate': lr, 'subsample': sub}
                (rf_p, rf_t, score) = predict_gbm(x_cols, params)
                print('trees: {}, mss: {}, rmse: {}'.format(ne, mss, score))

                if score < best_score:
                    best_params = params
                    best_score = score
            
print('best params: {}, rmse: {}'.format(best_params, best_score))

trees: 150, mss: 6, rmse: 0.35330101166320665
trees: 150, mss: 6, rmse: 0.3470252420547264
trees: 150, mss: 6, rmse: 0.34556942136931673
trees: 150, mss: 6, rmse: 0.3486512911656853
trees: 150, mss: 6, rmse: 0.6936687676390332
trees: 150, mss: 6, rmse: 0.6931471005362088
trees: 150, mss: 6, rmse: 0.694288700457273
trees: 150, mss: 6, rmse: 0.693244272421617
trees: 150, mss: 8, rmse: 0.34574854766037133
trees: 150, mss: 8, rmse: 0.34748139354025953
trees: 150, mss: 8, rmse: 0.3450228583608803
trees: 150, mss: 8, rmse: 0.3493410563176933
trees: 150, mss: 8, rmse: 0.6937250867338061
trees: 150, mss: 8, rmse: 0.6931471005362088
trees: 150, mss: 8, rmse: 0.694288700457273
trees: 150, mss: 8, rmse: 0.693244272421617
trees: 150, mss: 10, rmse: 0.3498323250528359
trees: 150, mss: 10, rmse: 0.3437897406420964
trees: 150, mss: 10, rmse: 0.34581040933420765
trees: 150, mss: 10, rmse: 0.350775381671304
trees: 150, mss: 10, rmse: 0.6937250867338061
trees: 150, mss: 10, rmse: 0.6931471005362088
tree

trees: 1500, mss: 10, rmse: 0.3442548578602096
trees: 1500, mss: 10, rmse: 0.34495127377298407
trees: 1500, mss: 10, rmse: 0.3478434168821772
trees: 1500, mss: 10, rmse: 0.3471131965649173
trees: 1500, mss: 10, rmse: 0.3463264558382087
trees: 1500, mss: 10, rmse: 0.34738653712080186
trees: 1500, mss: 12, rmse: 0.34665215489970364
trees: 1500, mss: 12, rmse: 0.34205207584521585
trees: 1500, mss: 12, rmse: 0.3436729617025161
trees: 1500, mss: 12, rmse: 0.34403579871232454
trees: 1500, mss: 12, rmse: 0.34799255967848547
trees: 1500, mss: 12, rmse: 0.3480944532754327
trees: 1500, mss: 12, rmse: 0.34634510325102336
trees: 1500, mss: 12, rmse: 0.34754606773186214
trees: 1500, mss: 14, rmse: 0.34731198485956055
trees: 1500, mss: 14, rmse: 0.3449036007212849
trees: 1500, mss: 14, rmse: 0.34389089995555394
trees: 1500, mss: 14, rmse: 0.34708107010117784
trees: 1500, mss: 14, rmse: 0.35021031162976257
trees: 1500, mss: 14, rmse: 0.3491088481384823
trees: 1500, mss: 14, rmse: 0.3465882015324493
t

In [6]:
# Defining functions for training the model on each months (still not using historic data, just using each month for
# both years as training data). By this, I would be able to understand if my algorithm is accurately predicting the count - 
# for each month!

def custom_train_test_split1(data, month, cutoff_day=15):
    train = data[(data['day'] <= cutoff_day) & (data['month'] == month)]
    test = data[(data['day'] > cutoff_day) & (data['month'] == month)]
    
    return train, test
    
def predict_1(model, input_cols):
    data = get_data()
    
    for month in range(1, 13):
    
        train, test = custom_train_test_split1(data, month)

        X_train, y_train_r, y_train_c = prep_data(train, input_cols)
        X_test, y_test_r, y_test_c = prep_data(test, input_cols)

        model_r = model.fit(X_train, y_train_r)
        y_pred_r = np.exp(model_r.predict(X_test)) - 1

        model_c = model.fit(X_train, y_train_c)
        y_pred_c = np.exp(model_c.predict(X_test)) - 1

        y_pred_comb = np.round(y_pred_r + y_pred_c)
        y_pred_comb[y_pred_comb < 0] = 0

        y_test_comb = np.exp(y_test_r) + np.exp(y_test_c) - 2

        score = get_rmsle(y_pred_comb, y_test_comb)
        print("month {}, score {}".format(month, score))
    return (y_pred_comb, y_test_comb, score)

In [11]:
# Based on the output, it can be seen that the algorithm's rmsle score for month 1-5 is low; however, it gets improved after that.

best_params = {'n_estimators': 500, 'min_samples_leaf': 8, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 123}

gbm_model = GradientBoostingRegressor(**best_params)
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition', 'not_fav']

(gbm_p, gbm_t, gbm_score) = predict_1(gbm_model, gbm_cols)
print(gbm_score)

month 1, score 0.5156243744826036
month 2, score 0.41161759063326836
month 3, score 0.40848867269659617
month 4, score 0.573548815741194
month 5, score 0.3608484579611672
month 6, score 0.3166987362456103
month 7, score 0.283281206225008
month 8, score 0.3481298011184439
month 9, score 0.4254745493446438
month 10, score 0.39954383474426486
month 11, score 0.3712948590399787
month 12, score 0.393385934982505
0.393385934982505


# Final Code: Defining function to predict on historical data (as per kaggle rule) i.e., only using information which was available prior to the time of prediction.

In [241]:
def predict_on_test_set_1(model, x_cols):
    # prepare training set
    df_ = mod_data[mod_data['count'] != 0].copy()
    df_t = mod_data[mod_data['count'] == 0].copy()
    years = [2011, 2012]
    df_batch = pd.DataFrame()
    df_result = pd.DataFrame()
    result = np.empty((0,1), float)
    y_result = []
    for yr in years:
        for i in range(1, 13):
            df_train = df_[(df_['month'] == i) & (df_['year'] == yr)]
            df_batch = df_batch.append(df_train)
            
            X_train = df_batch[x_cols]
            y_train_cas = np.array(df_batch['casual_log'])
            y_train_reg = np.array(df_batch['registered_log'])

            # prepare test set
            df_test = df_t[(df_t['month'] == i) & (df_t['year'] == yr)]
            X_test = df_test[x_cols]

            casual_model = model.fit(X_train, y_train_cas)
            y_pred_cas = casual_model.predict(X_test)
            y_pred_cas = np.exp(y_pred_cas) - 1
            registered_model = model.fit(X_train, y_train_reg)
            y_pred_reg = registered_model.predict(X_test)
            y_pred_reg = np.exp(y_pred_reg) - 1
            y_count = y_pred_cas + y_pred_reg
            print(y_count.size)
            y_result = np.concatenate([y_result, y_count])
    return y_result

### Predicting on the test dataset using Gradient Boosting Regressor!
## I got Kaggle Score of 0.42099!

In [264]:
best_params = {'n_estimators': 500, 'min_samples_leaf': 8, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 123}

gbm_model = GradientBoostingRegressor(**best_params)
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition', 'not_fav']

gbm_pred = predict_on_test_set_1(gbm_model, gbm_cols)

y_pred = np.round(gbm_pred)

# output predictions for submission

df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit6_1.csv', index=False) # kaggle score 0.42099 (best params)

# Please ignore the outputs below!

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


### Predicting on the test dataset using Gradient Boosting Regressor; but dropping one last column 'not_fav'! By doing so, it improved the score. 
## I got Kaggle Score of 0.41872!

In [266]:
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday', 'workingday', 'season',
    'hour', 'year', 'best_condition']
best_params = {'n_estimators': 500, 'min_samples_leaf': 8, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 123}
gbm_model = GradientBoostingRegressor(**best_params)

gbm_pred = predict_on_test_set_1(gbm_model, gbm_cols)

y_pred = np.round(gbm_pred)

# output predictions for submission

df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit7_1.csv', index=False) # kaggle score 0.41872 (best_params)

# Please ignore the outputs below!

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


In [245]:
#Training RandomForest with best_parameters and best_columns. RandomForest does not perform as good as GradientBoosting but \n
# by combining there outputs I got better score at Kaggle! (show in next cell)

params = {'n_estimators': 1500, 'max_depth': 15, 'random_state': 123, 'min_samples_split' : 14, 'n_jobs': -1}
rf_model = RandomForestRegressor(**params)
rf_cols = [
    'weather', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'not_fav',
    'hour', 'peak_time'
    ]

rnd_pred = predict_on_test_set_1(rf_model, rf_cols)

rf_y_pred = np.round(rnd_pred)

# output predictions for submission

df_test['count'] = rf_y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit9.csv', index=False) # kaggle score 0.48258

# Please ignore the outputs below!

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


# Combining Predictions of GradientBoosting and RandomForest Improves the overall Kaggle Score. I got Kaggle Score of 0.41490 (Best)! 

In [246]:
# Combining Randomforest and GradientBoosting predictions by 20-80%. We got the score of 0.41490
y_pred = np.round(.2*rnd_pred + .8*gbm_pred)
# output predictions for submission
df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit10.csv', index=False) # kaggle score (BEST) 0.41490

# Support Vector Regression (SVR)

In [25]:
# Predicting with default parameters of SVR
svr_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'year', 'peak_time',
            'weekoff_count', 'best_condition', 'not_fav', 'season_is_1',
            'season_is_2', 'season_is_3', 'season_is_4', 'holiday_is_0',
            'holiday_is_1', 'workingday_is_0', 'workingday_is_1', 'hour_is_0',
            'hour_is_1', 'hour_is_2', 'hour_is_3', 'hour_is_4', 'hour_is_5',
            'hour_is_6', 'hour_is_7', 'hour_is_8', 'hour_is_9', 'hour_is_10',
            'hour_is_11', 'hour_is_12', 'hour_is_13', 'hour_is_14', 'hour_is_15',
            'hour_is_16', 'hour_is_17', 'hour_is_18', 'hour_is_19', 'hour_is_20',
            'hour_is_21', 'hour_is_22', 'hour_is_23']

clf_svr = SVR(C=1.0, epsilon=0.2)

df_test_svr = pd.read_csv(r"D:\OneDrive - Queen's University\ECE\Statistical Learning\Midterm\test.csv")
mod_svr = predict_on_test_set_2(one_data, clf_svr, svr_cols)


svr_y_pred = np.round(mod_svr)

# output predictions for submission

df_test_svr['count'] = svr_y_pred
final_df = df_test_svr[['datetime', 'count']].copy()
final_df.to_csv('submit15.csv', index=False) # kaggle score 1.10676 



257
203




284




264




288




264




288




275




264




288




263




285




288




237




288




264




288




264




288




288




264




252




263




286


In [22]:
# Tuning for Parameter C.

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
svr_regressor = SVR(verbose=True)

svr_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed',
            'holiday', 'workingday', 'season',
            'hour', 'year', 'best_condition', 'not_fav']

Xtrain_svr = mod_data[svr_cols]
ytrain_svr = mod_data['count_log']

regre_svr = SVR(verbose=True)
C = [float(x) for x in np.linspace(start = 0.5, stop = 2, num = 5)]
kernelstring = ['rbf']
gamma = ['auto']
epsilon = [0.2]

# create random grid
random_grid = {'C': C, 'kernel': kernelstring, 'gamma': gamma, 'epsilon': epsilon}

# Random search of parameters
svr_random = RandomizedSearchCV(estimator = svr_regressor, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=123, n_jobs = -1)

# Fit the model
svr_random.fit(Xtrain_svr, np.array(ytrain_svr))

# print results
print(svr_random.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  23 out of  25 | elapsed:  1.5min remaining:    7.7s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.8min finished


[LibSVM]{'kernel': 'rbf', 'gamma': 'auto', 'epsilon': 0.2, 'C': 0.5}


In [24]:
# Predicting with tuned parameters of SVR
svr_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed',
            'holiday', 'workingday', 'season',
            'hour', 'year', 'best_condition', 'not_fav']

clf_svr = SVR(kernel= 'rbf', gamma = 'auto', epsilon = 0.2, C = 0.5)

df_test_svr = pd.read_csv(r"D:\OneDrive - Queen's University\ECE\Statistical Learning\Midterm\test.csv")
mod_svr = predict_on_test_set_2(mod_data, clf_svr, svr_cols)


svr_y_pred = np.round(mod_svr)

# output predictions for submission

df_test_svr['count'] = svr_y_pred
final_df = df_test_svr[['datetime', 'count']].copy()
final_df.to_csv('submit14.csv', index=False) # kaggle score 0.98756

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


## Training with one hot encoding the categorical values

In [11]:
one_data = mod_data.copy()
one_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,year,peak_time,weekoff_count,best_condition,not_fav,sine_hr,cos_hr,casual_log,registered_log,count_log
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.40,81,6.986063,3,...,2011,0,0,0,0,0.000000,1.000000,1.386294,2.639057,2.833213
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.63,80,6.845595,8,...,2011,0,0,0,0,0.258819,0.965926,2.197225,3.496508,3.713572
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.63,80,6.845595,5,...,2011,0,0,0,0,0.500000,0.866025,1.791759,3.332205,3.496508
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.40,75,6.672821,3,...,2011,0,0,0,0,0.707107,0.707107,1.386294,2.397895,2.639057
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.40,75,6.672821,0,...,2011,0,0,0,0,0.866025,0.500000,0.000000,0.693147,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.000000,0,...,2012,1,0,0,1,-0.965926,0.258819,0.000000,0.000000,0.000000
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.000000,0,...,2012,0,0,0,1,-0.866025,0.500000,0.000000,0.000000,0.000000
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.000000,0,...,2012,0,0,0,1,-0.707107,0.707107,0.000000,0.000000,0.000000
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.63,56,9.000000,0,...,2012,0,0,0,0,-0.500000,0.866025,0.000000,0.000000,0.000000


In [12]:
one_data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'date', 'hour', 'month', 'weekday', 'day', 'year', 'peak_time',
       'weekoff_count', 'best_condition', 'not_fav', 'sine_hr', 'cos_hr',
       'casual_log', 'registered_log', 'count_log'],
      dtype='object')

In [13]:
def dummy_encode(data, column):
    df = pd.get_dummies(data, columns=column, prefix=["season_is", "holiday_is", "workingday_is", "hour_is"])
    return df

one_data_1 = dummy_encode(one_data[['season', 'holiday', 'workingday', 'hour']], ['season', 'holiday', 'workingday', 'hour'])

In [14]:
one_data = one_data.join(one_data_1)

In [15]:
one_data.head(10)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,hour_is_14,hour_is_15,hour_is_16,hour_is_17,hour_is_18,hour_is_19,hour_is_20,hour_is_21,hour_is_22,hour_is_23
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.4,81,6.986063,3,...,0,0,0,0,0,0,0,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.63,80,6.845595,8,...,0,0,0,0,0,0,0,0,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.63,80,6.845595,5,...,0,0,0,0,0,0,0,0,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.4,75,6.672821,3,...,0,0,0,0,0,0,0,0,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.4,75,6.672821,0,...,0,0,0,0,0,0,0,0,0,0
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.003906,0,...,0,0,0,0,0,0,0,0,0,0
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.63,80,6.845595,2,...,0,0,0,0,0,0,0,0,0,0
7,2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,6.762674,1,...,0,0,0,0,0,0,0,0,0,0
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.4,75,6.672821,1,...,0,0,0,0,0,0,0,0,0,0
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.42,76,9.359136,8,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def predict_on_test_set_2(data_, model, x_cols):
    # prepare training set
    df_ = data_[data_['count'] != 0].copy()
    df_t = data_[data_['count'] == 0].copy()
    years = [2011, 2012]
    df_batch = pd.DataFrame()
    df_result = pd.DataFrame()
    result = np.empty((0,1), float)
    y_result = []
    for yr in years:
        for i in range(1, 13):
            df_train = df_[(df_['month'] == i) & (df_['year'] == yr)]
            df_batch = df_batch.append(df_train)
            
            X_train = df_batch[x_cols]
            y_train_cas = np.array(df_batch['casual_log'])
            y_train_reg = np.array(df_batch['registered_log'])

            # prepare test set
            df_test = df_t[(df_t['month'] == i) & (df_t['year'] == yr)]
            X_test = df_test[x_cols]

            casual_model = model.fit(X_train, y_train_cas)
            y_pred_cas = casual_model.predict(X_test)
            y_pred_cas = np.exp(y_pred_cas) - 1
            registered_model = model.fit(X_train, y_train_reg)
            y_pred_reg = registered_model.predict(X_test)
            y_pred_reg = np.exp(y_pred_reg) - 1
            y_count = y_pred_cas + y_pred_reg
            print(y_count.size)
            y_result = np.concatenate([y_result, y_count])
    return y_result

In [260]:
one_data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'date', 'hour', 'month', 'weekday', 'year', 'peak_time',
       'weekoff_count', 'best_condition', 'not_fav', 'sine_hr', 'cos_hr',
       'casual_log', 'registered_log', 'count_log', 'day', 'season_is_1',
       'season_is_2', 'season_is_3', 'season_is_4', 'holiday_is_0',
       'holiday_is_1', 'workingday_is_0', 'workingday_is_1', 'hour_is_0',
       'hour_is_1', 'hour_is_2', 'hour_is_3', 'hour_is_4', 'hour_is_5',
       'hour_is_6', 'hour_is_7', 'hour_is_8', 'hour_is_9', 'hour_is_10',
       'hour_is_11', 'hour_is_12', 'hour_is_13', 'hour_is_14', 'hour_is_15',
       'hour_is_16', 'hour_is_17', 'hour_is_18', 'hour_is_19', 'hour_is_20',
       'hour_is_21', 'hour_is_22', 'hour_is_23'],
      dtype='object')

In [261]:
params = {'n_estimators': 1500, 'max_depth': 15, 'random_state': 123, 'min_samples_split' : 14, 'n_jobs': -1}
rf_model = RandomForestRegressor(**params)

rf_cols = ['season_is_1', 'season_is_2', 'season_is_3', 'season_is_4','holiday_is_0',
           'holiday_is_1', 'workingday_is_0', 'workingday_is_1', 'weather', 'temp',
           'atemp', 'humidity', 'windspeed', 'peak_time', 'not_fav', 'sine_hr', 'cos_hr']

rnd_pred = predict_on_test_set_2(one_data, rf_model, rf_cols)

rf_y_pred = np.round(rnd_pred)

# output predictions for submission

df_test['count'] = rf_y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit11.csv', index=False)

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


In [265]:
gbm_cols = [
    'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    'holiday_is_0', 'holiday_is_1', 'workingday_is_0', 
    'workingday_is_1', 'season_is_1', 'season_is_2', 'season_is_3', 'season_is_4',
    'sine_hr', 'cos_hr', 'year', 'best_condition']

best_params = {'n_estimators': 500, 'min_samples_leaf': 8, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 123}

gbm_model = GradientBoostingRegressor(**best_params)

gbm_pred = predict_on_test_set_2(one_data, gbm_model, gbm_cols)

y_pred = np.round(gbm_pred)

# output predictions for submission

df_test['count'] = y_pred
final_df = df_test[['datetime', 'count']].copy()
final_df.to_csv('submit12.csv', index=False) # kaggle score (one hot encoding) 0.42257

257
203
284
264
288
264
288
275
264
288
263
285
288
237
288
264
288
264
288
288
264
252
263
286


# Using Cross Validation with Batch ## doesn't work! memory error

In [269]:
from sklearn.model_selection import RandomizedSearchCV
rand_clas_1 = RandomForestRegressor(verbose=True, random_state=123)

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 10)]

# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(10, 50, num = 10)]
max_depth.append(None)

# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rand_clas_1, param_distributions = random_grid, n_iter = 50, cv = 10, verbose=1, random_state=42, n_jobs = -1)

# Fit the model
df_ = one_data[one_data['count'] != 0].copy()
years = [2011, 2012]
df_batch = pd.DataFrame()
best_params_cv = []
cnt = 1

for yr in years:
    for i in range(1, 13):
        #prepare train set with batches
        df_train = df_[(df_['month'] == i) & (df_['year'] == yr)]
        df_batch = df_batch.append(df_train)
        X_train = df_batch[x_cols]
        y_train = np.array(df_batch['count_log'])
        print(cnt)
        cnt = cnt + 1
        count_model = rfc_random.fit(X_train, y_train)
        print(count_model.best_params_)
        best_params_cv.append(count_model.best_params_)
print(best_params_cv)

1
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 2000, 'max_features': 'auto', 'max_depth': None}
2
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1111 out of 1111 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1111, 'max_features': 'auto', 'max_depth': None}
3
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 10.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1777 out of 1777 | elapsed:    6.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1777, 'max_features': 'auto', 'max_depth': 36}
4
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1222 out of 1222 | elapsed:    7.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1222, 'max_features': 'auto', 'max_depth': 41}
5
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 15.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1666 out of 1666 | elapsed:   10.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1666, 'max_features': 'auto', 'max_depth': 27}
6
Fitting 10 folds for each of 50 candidates, totalling 500 fits




MemoryError: could not allocate 229376 bytes

In [277]:
# fine tuned 
#still doesn't work. Need more computational power to perform grid search cv with batches.
rand_clas_1 = RandomForestRegressor(verbose=True, random_state=123)
# params = {'n_estimators': 1500, 'max_depth': 15, 'random_state': 123, 'min_samples_split' : 14, 'n_jobs': -1}
# number of trees in random forest
n_estimators = [1000, 1200, 1500, 1800]
# number of features at every split
min_samples_split = [12, 14, 16, 18]
# max depth
max_depth = [10, 12, 15, 18, 20, 25]
max_depth.append(None)
# create random grid

random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rand_clas_1, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=123, n_jobs = -1)

# Fit the model
df_ = one_data[one_data['count'] != 0].copy()
years = [2011, 2012]
df_batch = pd.DataFrame()
best_params_cv = []
cnt = 1

for yr in years:
    for i in range(1, 13):
        #prepare train set with batches
        df_train = df_[(df_['month'] == i) & (df_['year'] == yr)]
        df_batch = df_batch.append(df_train)
        X_train = df_batch[x_cols]
        y_train = np.array(df_batch['count_log'])
        print(cnt)
        cnt = cnt + 1
        count_model = rfc_random.fit(X_train, y_train)
        print(count_model.best_params_)
        best_params_cv.append(count_model.best_params_)
        
print(best_params_cv)

1
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': 18}
2
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  2.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    4.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': None}
3
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  3.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 18}
4
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  4.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed:    7.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1500, 'max_features': 'auto', 'max_depth': None}
5
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  5.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1200, 'max_features': 'auto', 'max_depth': 25}
6
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  6.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   14.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': None}
7
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  7.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    9.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 20}
8
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  8.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   17.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': 15}
9
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  9.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   10.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 15}
10
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 10.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   14.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 25}
11
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 11.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   27.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': 18}
12
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 12.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   33.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': 20}
13
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 13.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:   33.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1800, 'max_features': 'auto', 'max_depth': 20}
14
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 14.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   19.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 18}
15
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.7min


MemoryError: could not allocate 458752 bytes

# Grid Search for SVR #Memory ERROR!

In [None]:
from sklearn.model_selection import RandomizedSearchCV

svr_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'year', 'peak_time',
            'weekoff_count', 'best_condition', 'not_fav', 'season_is_1',
            'season_is_2', 'season_is_3', 'season_is_4', 'holiday_is_0',
            'holiday_is_1', 'workingday_is_0', 'workingday_is_1', 'hour_is_0',
            'hour_is_1', 'hour_is_2', 'hour_is_3', 'hour_is_4', 'hour_is_5',
            'hour_is_6', 'hour_is_7', 'hour_is_8', 'hour_is_9', 'hour_is_10',
            'hour_is_11', 'hour_is_12', 'hour_is_13', 'hour_is_14', 'hour_is_15',
            'hour_is_16', 'hour_is_17', 'hour_is_18', 'hour_is_19', 'hour_is_20',
            'hour_is_21', 'hour_is_22', 'hour_is_23']

regre_svr = SVR(verbose=True)
C = [float(x) for x in np.linspace(start = 0.5, stop = 2, num = 5)]
degree = [3, 4, 5, 6]
kernelstring = ['rbf', 'poly']
gamma = ['auto']
epsilon = [0.2, 0.3, 0.4]

# create random grid
random_grid = {'C': C, 'degree': degree, 'kernel': kernelstring, 'gamma': gamma, 'epsilon': epsilon}

# Random search of parameters
svr_random = RandomizedSearchCV(estimator = regre_svr, param_distributions = random_grid, cv = 10, verbose=1, random_state=123, n_jobs = -1)
# Fit the model
df_ = one_data[one_data['count'] != 0].copy()
years = [2011, 2012]
df_batch = pd.DataFrame()
best_params_cv = []
cnt = 1

for yr in years:
    for i in range(1, 13):
        #prepare train set with batches
        df_train = df_[(df_['month'] == i) & (df_['year'] == yr)]
        df_batch = df_batch.append(df_train)
        X_train = df_batch[svr_cols]
        y_train = np.array(df_batch['count_log'])
        print(cnt)
        cnt = cnt + 1
        count_model = svr_random.fit(X_train, y_train)
        print(count_model.best_params_)
        best_params_cv.append(count_model.best_params_)
print(best_params_cv)

1
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


# So, I can conclude that though individual models did not do that well on the training datasets, but the best kaggle score was achieved by combining the predictions of the models. 
# The best Kaggle score I could achieve was - 0.41490, by combining the outputs of GradientBoostingRegressor (80%) and RandomForestRegressor (20%)