In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#FE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
#from sklearn.impute import SimpleImputer #no missing vals
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

#regressors
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Cross validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Scoring metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import plot_roc_curve, auc, roc_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

#https://www.kaggle.com/competitions/bike-sharing-demand/overview

In [2]:
#pip install xgboost
df= pd.read_csv('bike_time_features.csv', index_col=0, parse_dates=True)
df_reg = df.drop(['count', 'casual'], axis=1)
df_cas = df.drop(['count', 'registered'], axis = 1)

### Train, test, split for different data combos

Based on the EDA in the other notebooks, this will be done comparing regressors fit on the entire data, and regressors fit on the casual and registered data separately (with overall avg performance as comparison to entire dataset)

In [3]:
cn_train, cn_test = train_test_split(df, test_size=0.3, random_state=12)
rg_train, rg_test = train_test_split(df_reg, test_size=0.3, random_state=12)
cs_train, cs_test = train_test_split(df_cas, test_size=0.3, random_state=12)

In [4]:
Xtrain_cn = cn_train.drop(columns=['count', 'registered', 'casual'])
Xtrain_rg = rg_train.drop(columns=['registered'])
Xtrain_cs = cs_train.drop(columns=['casual'])

Xtest_cn = cn_test.drop(columns=['count', 'registered', 'casual'])
Xtest_rg = rg_test.drop(columns=['registered'])
Xtest_cs = cs_test.drop(columns=['casual'])

In [5]:
ytrain_cn = cn_train['count']
ytrain_rg = rg_train['registered']
ytrain_cs = cs_train['casual']

ytest_cn = cn_test['count']
ytest_rg = rg_test['registered']
ytest_cs = cs_test['casual']

In [6]:
#df.info() #for FE menu below

##### Regressor pipeline placeholders (for visualising whats in function) 

pipeline_LR = Pipeline([
    ('trans',transformer),
    ('model',LinearRegression())
])

pipeline__RF = Pipeline([
    ('trans',transformer),
    ('model',RandomForestRegressor(max_depth = 10, random_state=0)) #func for testing on diff depths later..see below
])

pipeline_PR = Pipeline([
    ('trans',transformer),
    ('model',PoissonRegressor(max_iter=200)) #func for testing iterations later
])

# Scoring Metrics
aside from accuracy that is...

RMSE: how erroneous is the model performing, depicts absolute error, if high model is quite erroneous...


RMSLE: what is the relative error of this model, log transformed values, doesnt penalize error term because of outliers as much, larger penalty for underestimation of y than overestimation (good for business cases, underestimation undesirable) 

# Function for overview // efficiency! 

Function for different pipeline models (V0 not efficient for using results in a table for comparison / later use). 

In [7]:
def Regressor_testing_V0(Xtrain, ytrain, Xtest,ytest, Transformer, Model, model_name_str):
    
    '''
    Function to easily implement different models and transformers for quick testing
    Different transformers, datasets and models can be input to quickly get an overview of results
    Transformers should be defined outside of func (poss with other func?)
    '''
    ytrainlog = np.log1p(ytrain)
    ytestlog = np.log1p(ytest)
    
    pipeline = Pipeline([ ('trans', Transformer), ('model', Model) ])
    pipeline.fit(Xtrain, ytrainlog)
    ypred = pipeline.predict(Xtest)
    
    score_train = "R2 train score of" + model_name_str + 'is ' + str(pipeline.score(Xtrain, ytrainlog))
    score_test = "R2 test score of" + model_name_str + 'is ' + str(pipeline.score(Xtest, ytestlog))
    
    RMSE = "RMSE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_error(ytest, ypred))) 
    RMSLE = "RMSLE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_log_error(ytest, ypred))) 
    
    
    return (score_train, score_test, RMSE, RMSLE)                        

In [8]:
def Regressor_testing(Xtrain, ytrain, Xtest,ytest, Transformer, Model, model_name_str):
    
    '''
    Function to easily implement different models and transformers for quick testing
    Different transformers, datasets and models can be input to quickly get an overview of results
    Transformers should be defined outside of func (poss with other func?)
    '''
    
    model_output= []
    
    ytrainlog = np.log1p(ytrain)
    ytestlog = np.log1p(ytest)
    
    pipeline = Pipeline([ ('trans', Transformer), ('model', Model) ])
    pipeline.fit(Xtrain, ytrainlog)
    ypred = pipeline.predict(Xtest)
    
    score_train = "R2 train score of" + model_name_str + 'is ' + str(pipeline.score(Xtrain, ytrainlog)) + ' AND the'
    score_train_mo =pipeline.score(Xtrain, ytrainlog)
    model_output.append(score_train_mo)
    
    score_test = "R2 test score of" + model_name_str + 'is ' + str(pipeline.score(Xtest, ytestlog)) + ' AND the'
    score_test_mo = pipeline.score(Xtrain, ytrainlog)
    model_output.append(score_test_mo)
    
    RMSE = "RMSE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_error(ytest, ypred)))  + ' AND the'
    RMSE_mo = np.sqrt(mean_squared_error(ytest, ypred))
    model_output.append(RMSE_mo)
    #note: need to figure out how to implement new line in the string formatting above..
    
    
    RMSLE = "RMSLE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_log_error(ytest, ypred))) 
    RMSLE_mo = np.sqrt(mean_squared_log_error(ytest, ypred))
    model_output.append(RMSLE_mo)
    
    model_output = list(model_output)
    
    
    print(score_train,score_test, RMSE, RMSLE)
    
    return (model_output)  #use this to put into a func which puts it in a table?

In [11]:
def regressor_output_table(df_output, model_output, model_name_str):
    
    '''
    A function taking the defined regressors being tested, 
    which formats all of them into a dataframe 
    for readability and quick comparison
    '''
    
    df_model = pd.DataFrame([model_output], index=[model_name_str], columns=['R2_train','R2_test','RMSE','RMSLE'])
    df = pd.concat([df_output, df_model])
    
    return(df)

## Different pipelines to be used in transformers

In [12]:
pipeline_scaler = Pipeline([
            ('Scaler', StandardScaler())
])

pipeline_ohe = Pipeline([
            ('ohe', OneHotEncoder(handle_unknown="ignore"))
])

#cols to keep the same and use in model
#('pass', 'passthrough', [])

#custom func
#('', FunctionTransformer(), ['relevant_col'])

# Baseline Models (lin reg and entire dataset) 

### Models using basically all features with different engineering combos

#### Model 1: most simple,two trials (one with temp, one with atemp), and all relevant cat feats, NO polynomials

In [13]:
#FE
cat_feat1 = ['hour', 'weekofyear', 'weather'] #OHE 

num_feat1a = ['atemp', 'humidity', 'windspeed' ]

num_feat1b = ['temp', 'humidity', 'windspeed' ]

In [14]:
#transformer
T1 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1a),
    ('ohc',pipeline_ohe,cat_feat1)])

In [15]:
#model a output
LinRegM1 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1, LinearRegression(), ' LinRegM1 ')

R2 train score of LinRegM1 is 0.8062386884181659 AND the R2 test score of LinRegM1 is 0.8044214096982655 AND the RMSE of LinRegM1 is 262.3054225839597 AND the RMSLE of LinRegM1 is 3.1489362688705955


In [16]:
df = pd.DataFrame([LinRegM1], index=['LinRegM1'], columns=['R2_train','R2_test','RMSE','RMSLE'])

In [17]:
df #first output has to form input df for func, so can continually concatenate on all the others

Unnamed: 0,R2_train,R2_test,RMSE,RMSLE
LinRegM1,0.806239,0.806239,262.305423,3.148936


In [18]:
#model b output
T1b = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1b),
    ('ohc',pipeline_ohe,cat_feat1)])

In [19]:
#model a output
LinRegM1b = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1b, LinearRegression(), ' LinRegM1b ')

df = regressor_output_table(df, LinRegM1b, 'LinRegM1b')

R2 train score of LinRegM1b is 0.8063357052293839 AND the R2 test score of LinRegM1b is 0.8048731773556155 AND the RMSE of LinRegM1b is 262.3049695348988 AND the RMSLE of LinRegM1b is 3.1489682458412047


In [20]:
df

Unnamed: 0,R2_train,R2_test,RMSE,RMSLE
LinRegM1,0.806239,0.806239,262.305423,3.148936
LinRegM1b,0.806336,0.806336,262.30497,3.148968


In [30]:
#with holiday and working day (forgot pass through statement above)
T1_b = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1b),
    ('ohc',pipeline_ohe,cat_feat1),
    ('pass','passthrough',['holiday','workingday']) #already OHE..try adding in cat in next anyways? 
])

In [21]:
LinRegM1b = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1b, LinearRegression(), ' LinRegM1b ') #same in anycase

R2 train score of LinRegM1b is 0.8063357052293839 AND the R2 test score of LinRegM1b is 0.8048731773556155 AND the RMSE of LinRegM1b is 262.3049695348988 AND the RMSLE of LinRegM1b is 3.1489682458412047


Test score slightly higher for b, RMSE and RMSLE v close, so use temp from now on probs 

#### Model 2: same as above with polynomial features (all numeric)

In [23]:
cat_feat2 = ['hour', 'weekofyear', 'weather', 'holiday', 'workingday'] 

num_feat2 = ['temp', 'humidity', 'windspeed' ]

poly_feat2 = ['temp', 'humidity', 'windspeed' ]

In [24]:
T2 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat2),
    ('ohc',pipeline_ohe,cat_feat2),
    ('poly',PolynomialFeatures(degree=5,interaction_only=True,include_bias=False),poly_feat2)
])

In [25]:
LinRegM2 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, LinearRegression(), ' LinRegM2 ')

df = regressor_output_table(df, LinRegM2, 'LinRegM2')

R2 train score of LinRegM2 is 0.8076964880392206 AND the R2 test score of LinRegM2 is 0.8063047310573082 AND the RMSE of LinRegM2 is 262.3042491284425 AND the RMSLE of LinRegM2 is 3.148391018028354


R scores better, RMSE and RMSLE same

#### Model 3: Try with poisson with and without poly

In [28]:
PoisRegPoly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, PoissonRegressor(max_iter=190), ' PoisRegPoly ')

df = regressor_output_table(df, PoisRegPoly, 'PoisRegPoly')

R2 train score of PoisRegPoly is 0.0 AND the R2 test score of PoisRegPoly is -0.00022014838016892746 AND the RMSE of PoisRegPoly is 262.95066268109133 AND the RMSLE of PoisRegPoly is 3.2151097899407


  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [32]:
PoisReg_M1 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, PoissonRegressor(max_iter=200), ' PoisReg_M1 ')

df = regressor_output_table(df, PoisReg_M1, 'PoisReg_M1')

R2 train score of PoisReg_M1 is 0.3319010982709264 AND the R2 test score of PoisReg_M1 is 0.33269777688214763 AND the RMSE of PoisReg_M1 is 262.6932727213375 AND the RMSLE of PoisReg_M1 is 3.191024610752198


##### Obs: much higher accuracy without poly features

#### Model 4: trying with Random Forest, and various different depths 

Without poly features

In [34]:
RF_10 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, RandomForestRegressor(max_depth =10, random_state=0), ' RF_10 ')

R2 train score of RF_10 is 0.8438910211668131 AND the R2 test score of RF_10 is 0.8230839525270042 AND the RMSE of RF_10 is 262.38321150578867 AND the RMSLE of RF_10 is 3.144320723963199


In [35]:
df = regressor_output_table(df, RF_10, 'RF_10')

In [36]:
RF_15 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, RandomForestRegressor(max_depth =15, random_state=0), ' RF_15 ')

R2 train score of RF_15 is 0.9249187652893911 AND the R2 test score of RF_15 is 0.8767466406978608 AND the RMSE of RF_15 is 262.2825342555777 AND the RMSLE of RF_15 is 3.1396731323297433


In [38]:
df = regressor_output_table(df, RF_15, 'RF_15')

In [37]:
RF_20 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, RandomForestRegressor(max_depth =20, random_state=0), ' RF_20 ')

R2 train score of RF_20 is 0.9611796728490452 AND the R2 test score of RF_20 is 0.8971854885556041 AND the RMSE of RF_20 is 262.24246914193884 AND the RMSLE of RF_20 is 3.138988786303532


In [39]:
df = regressor_output_table(df, RF_15, 'RF_15')

##### Obs: RFR highest accuracy so far, but RMSE and RMSLE only slightly better than lin reg models, and PR (also lingreg still pretty accurate)

With poly features

In [41]:
RF_10_poly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, RandomForestRegressor(max_depth =10, random_state=0), ' RF_10_poly ')

R2 train score of RF_10_poly is 0.8455451101074689 AND the R2 test score of RF_10_poly is 0.8216115668544954 AND the RMSE of RF_10_poly is 262.38351427396253 AND the RMSLE of RF_10_poly is 3.144383637058642


In [44]:
df = regressor_output_table(df, RF_10_poly, 'RF_10_poly')

In [42]:
RF_15_poly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, RandomForestRegressor(max_depth =15, random_state=0), ' RF_15_poly ')

R2 train score of RF_15_poly is 0.9262049458439154 AND the R2 test score of RF_15_poly is 0.8749134293998453 AND the RMSE of RF_15_poly is 262.28442341430383 AND the RMSLE of RF_15_poly is 3.1399228944270465


In [45]:
df = regressor_output_table(df, RF_15_poly, 'RF_15_poly')

In [43]:
RF_20_poly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, RandomForestRegressor(max_depth =20, random_state=0), ' RF_20_poly ')

R2 train score of RF_20_poly is 0.9617129047889337 AND the R2 test score of RF_20_poly is 0.8950142034420682 AND the RMSE of RF_20_poly is 262.245764733932 AND the RMSLE of RF_20_poly is 3.1393051095879034


In [46]:
df = regressor_output_table(df, RF_20_poly, 'RF_20_poly')

##### slightly more accurate than without poly, but ! RMSLE and RMSE slightly worse?

#### Model 5: various tries with XG Boost

In [None]:
XG_5 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10), ' XG_5') 

In [None]:
XG_10 = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1_b, xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 10), ' XG_10') 

First time using XG boost, need to look into it more and figure out why neg....

#### Model(s) 6 : trying with windspeed in a custom bin (just to trial custom pipeline, going to do better testing later!)
windspeed seems to have two distinct areas

In [50]:
#custom func: bin wind into two (32 divide) to make more predictive (based on dist) -> see EDA for custom bin
def custom_bin(df):
    
    for a in df:
        df.loc[ (df['windspeed']> 0) & (df['windspeed'] <= 25), 'windspeed'] = 0
        df.loc[ (df['windspeed']> 25), 'windspeed'] = 1
    
        return df[['windspeed']]
#for in pipe: ('cust_bin', FunctionTransformer(custom_bin), ['windspeed'])

without poly

In [51]:
T3 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat2),
    ('ohc',pipeline_ohe,cat_feat2),
    ('cust_bin', FunctionTransformer(custom_bin), ['windspeed'])
])

T4 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat2),
    ('ohc',pipeline_ohe,cat_feat2),
    ('cust_bin', FunctionTransformer(custom_bin), ['windspeed']),
    ('poly',PolynomialFeatures(degree=5,interaction_only=True,include_bias=False),poly_feat2)
])

In [52]:
RF_CB = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T3, RandomForestRegressor(max_depth =20, random_state=0), ' RF_CB ')
#less well performing

R2 train score of RF_CB is 0.9611310718743683 AND the R2 test score of RF_CB is 0.8975681725685324 AND the RMSE of RF_CB is 262.24218723046147 AND the RMSLE of RF_CB is 3.1388684380565053


In [53]:
df = regressor_output_table(df, RF_CB, 'RF_CB')

In [54]:
PoisReg_CB = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T3, PoissonRegressor(max_iter=200), ' PoisReg_CB ')
#performs marginally better

R2 train score of PoisReg_CB is 0.3320414630854772 AND the R2 test score of PoisReg_CB is 0.3326859157816441 AND the RMSE of PoisReg_CB is 262.69341876821693 AND the RMSLE of PoisReg_CB is 3.19102356410071


In [55]:
df = regressor_output_table(df, PoisReg_CB, 'PoisReg_CB')

In [56]:
LinReg_CB = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T3, LinearRegression(), ' LinReg_CB ')
#same

R2 train score of LinReg_CB is 0.8073192434464606 AND the R2 test score of LinReg_CB is 0.8059762869044551 AND the RMSE of LinReg_CB is 262.3051402370977 AND the RMSLE of LinReg_CB is 3.1487053210350733


In [58]:
df = regressor_output_table(df, LinReg_CB, 'LinReg_CB')

In [57]:
RF_CB_poly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T4, RandomForestRegressor(max_depth =20, random_state=0), ' RF_CB_poly ')


R2 train score of RF_CB_poly is 0.9617571117684969 AND the R2 test score of RF_CB_poly is 0.8951331767834021 AND the RMSE of RF_CB_poly is 262.24595340443693 AND the RMSLE of RF_CB_poly is 3.1393224460026388


In [59]:
df = regressor_output_table(df, RF_CB_poly, 'RF_CB_poly')

Better than without poly and more accurate, but still not as good as RF above without custom bin

In [61]:
LinReg_CB_poly  = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T4, LinearRegression(), ' LinReg_CB_poly ')
df = regressor_output_table(df, LinReg_CB_poly, 'LinReg_CB_poly')

R2 train score of LinReg_CB_poly is 0.8077485545350103 AND the R2 test score of LinReg_CB_poly is 0.8061637556774409 AND the RMSE of LinReg_CB_poly is 262.30433552666636 AND the RMSLE of LinReg_CB_poly is 3.1484266795865885


In [62]:
PoisReg_CB_poly = Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T4, PoissonRegressor(max_iter=200), ' PoisReg_CB_poly ')
df = regressor_output_table(df, PoisReg_CB_poly, 'PoisReg_CB_poly')

R2 train score of PoisReg_CB_poly is 0.0 AND the R2 test score of PoisReg_CB_poly is -0.00022014838016892746 AND the RMSE of PoisReg_CB_poly is 262.95066268109133 AND the RMSLE of PoisReg_CB_poly is 3.2151097899407


  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


# An output table with all results! (to csv to compare with split dataset)

In [63]:
df

Unnamed: 0,R2_train,R2_test,RMSE,RMSLE
LinRegM1,0.806239,0.806239,262.305423,3.148936
LinRegM1b,0.806336,0.806336,262.30497,3.148968
LinRegM2,0.807696,0.807696,262.304249,3.148391
PoisRegPoly,0.0,0.0,262.950663,3.21511
PoisReg_M1,0.331901,0.331901,262.693273,3.191025
RF_10,0.843891,0.843891,262.383212,3.144321
RF_15,0.924919,0.924919,262.282534,3.139673
RF_15,0.924919,0.924919,262.282534,3.139673
RF_10_poly,0.845545,0.845545,262.383514,3.144384
RF_15_poly,0.926205,0.926205,262.284423,3.139923


In [64]:
df.to_csv('data_all_regressors.csv')