In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#FE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
#from sklearn.impute import SimpleImputer #no missing vals
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

#regressors
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Cross validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Scoring metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import plot_roc_curve, auc, roc_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

#https://www.kaggle.com/competitions/bike-sharing-demand/overview

In [21]:
# pip install xgboost
# Load the bike data; the first CSV column is used as the index and parsed as dates.
df = pd.read_csv('bike_time_features.csv', parse_dates=True, index_col=0)
# One frame per user-type target: each drops the total count and the other user type's count.
df_reg = df.drop(columns=['count', 'casual'])
df_cas = df.drop(columns=['count', 'registered'])

### Train, test, split for different data combos

Based on the EDA in the other notebooks, regressors fit on the entire data (total count) are compared with regressors fit on the casual and registered data separately (using their overall average performance as the comparison to the entire dataset).

In [22]:
# 70/30 split of the full, registered-only and casual-only frames.
# The same seed is used for all three; since the frames share their rows,
# each one should end up partitioned on the same observations.
(cn_train, cn_test), (rg_train, rg_test), (cs_train, cs_test) = (
    train_test_split(frame, test_size=0.3, random_state=12)
    for frame in (df, df_reg, df_cas)
)

In [23]:
# Feature matrices: remove every target column still present in each split.
# Train and test parts of the same dataset are built together so they cannot drift apart.
Xtrain_cn, Xtest_cn = (
    part.drop(columns=['count', 'registered', 'casual'])
    for part in (cn_train, cn_test)
)
Xtrain_rg, Xtest_rg = (
    part.drop(columns=['registered'])
    for part in (rg_train, rg_test)
)
Xtrain_cs, Xtest_cs = (
    part.drop(columns=['casual'])
    for part in (cs_train, cs_test)
)

In [24]:
# Targets on the original scale, one train/test pair per dataset.
ytrain_cn, ytest_cn = cn_train['count'], cn_test['count']            # total rentals
ytrain_rg, ytest_rg = rg_train['registered'], rg_test['registered']  # registered users
ytrain_cs, ytest_cs = cs_train['casual'], cs_test['casual']          # casual users

Log of y data sets (for RMSLE)

In [25]:
# log1p-transformed copies of every target (log(1 + y), so zero counts stay finite).
ytrain_lcn, ytest_lcn = map(np.log1p, (ytrain_cn, ytest_cn))
ytrain_lrg, ytest_lrg = map(np.log1p, (ytrain_rg, ytest_rg))
ytrain_lcs, ytest_lcs = map(np.log1p, (ytrain_cs, ytest_cs))

In [27]:
# Print the column names, non-null counts and dtypes of the full frame.
df.info() # reference for choosing columns in the feature-engineering menu below

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      10886 non-null  int64  
 1   holiday     10886 non-null  int64  
 2   workingday  10886 non-null  int64  
 3   weather     10886 non-null  int64  
 4   temp        10886 non-null  float64
 5   atemp       10886 non-null  float64
 6   humidity    10886 non-null  int64  
 7   windspeed   10886 non-null  float64
 8   casual      10886 non-null  int64  
 9   registered  10886 non-null  int64  
 10  count       10886 non-null  int64  
 11  hour        10886 non-null  int64  
 12  dayofyear   10886 non-null  int64  
 13  weekofyear  10886 non-null  int64  
 14  weekday     10886 non-null  int64  
dtypes: float64(3), int64(12)
memory usage: 1.3 MB


# Feature Engineering and combinations menu !! 

Define numeric and categorical features for the pipeline (baseline models: start off simple, expected to perform subpar).

In [30]:
# Numeric feature sets: one per temperature column, plus one with both.
num_t = ["temp", "humidity", "windspeed"]
num_at = ["atemp", "humidity", "windspeed"]
# temp and atemp together: the two are very close, so this set risks
# overfitting / multicollinearity.
num_all = num_t + ["atemp"]

# Every column that is a candidate for categorical encoding.
cat_feat_all = [
    "weather",
    "dayofyear",
    "weekofyear",
    "weekday",
    "workingday",
    "holiday",
    "season",
]

Avoiding multicollinearity: use only one of week of year / day of year.
Note: humidity is slightly correlated with season and weather (interaction features?).

##### Placeholder pipeline for testing / copy paste

transformer = ColumnTransformer([
    ('scaler',pipeline_scaler,numeric_features),
    ('ohc',pipeline_ohe,categorical_features),
    ('poly',PolynomialFeatures(degree= ,interaction_only=True,include_bias=False),numeric_features),
    ('pass','passthrough',[])
])

# Model Pipelines (inc. regressors)
##### Placeholders

pipeline_LR = Pipeline([
    ('trans',transformer),
    ('model',LinearRegression())
])

pipeline__RF = Pipeline([
    ('trans',transformer),
    ('model',RandomForestRegressor(max_depth = 10, random_state=0)) #func for testing on diff depths later..see below
])

pipeline_PR = Pipeline([
    ('trans',transformer),
    ('model',PoissonRegressor(max_iter=200)) #func for testing iterations later
])

# Scoring Metrics
aside from accuracy that is...

RMSE: measures how erroneous the model's predictions are in absolute terms; a high value means the model is quite erroneous.


RMSLE: measures the relative error of the model on log-transformed values; it does not penalize errors caused by outliers as heavily, and it gives a larger penalty for underestimating y than for overestimating it (good for business cases where underestimation is undesirable).

In [35]:
def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def RMSLE(y_true, y_pred):
    """Return the root mean squared logarithmic error between ``y_true`` and ``y_pred``."""
    squared_log_error = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(squared_log_error)

#better just in func below...

# Function for overview // efficiency! 

Function for different pipeline models 

In [38]:
def Regressor_testing(Xtrain, ytrainlog, Xtest,ytestlog, ytest, Transformer, Model, Model_name_str):
    '''
    Fit a transformer + regressor pipeline on a log-transformed target and report its scores.

    Different transformers, datasets and models can be passed in to quickly get an
    overview of results. Transformers should be defined outside of this function.

    Parameters
    ----------
    Xtrain, Xtest : feature sets used for fitting and for evaluation.
    ytrainlog, ytestlog : train/test targets on the log scale. The predictions are
        inverted with ``expm1``, so these are assumed to be ``np.log1p`` of the raw target.
    ytest : test target on the original scale, used for RMSE and RMSLE.
    Transformer : preprocessing step placed first in the pipeline.
    Model : regressor placed last in the pipeline.
    Model_name_str : label inserted into the returned messages.

    Returns
    -------
    tuple of str
        ``(score_train, score_test, RMSE, RMSLE)`` messages, in that order.
    '''

    pipeline = Pipeline([('trans', Transformer), ('model', Model)])
    pipeline.fit(Xtrain, ytrainlog)

    # The model is fit on the log-scale target, so its predictions are on that scale too.
    # Invert them before comparing against the raw ytest, and floor at 0 because
    # the squared-log-error metric is undefined for negative values.
    ypred_log = pipeline.predict(Xtest)
    ypred = np.clip(np.expm1(ypred_log), 0, None)

    # pipeline.score is evaluated on the same (log) scale the model was trained on.
    r2_train = pipeline.score(Xtrain, ytrainlog)
    r2_test = pipeline.score(Xtest, ytestlog)

    # Error metrics on the original scale.
    rmse_value = np.sqrt(mean_squared_error(ytest, ypred))
    rmsle_value = np.sqrt(mean_squared_log_error(ytest, ypred))

    score_train = f"R2 train score of {Model_name_str} is {r2_train}"
    score_test = f"R2 test score of {Model_name_str} is {r2_test}"
    rmse_msg = f"RMSE of {Model_name_str} is {rmse_value}"
    rmsle_msg = f"RMSLE of {Model_name_str} is {rmsle_value}"

    return (score_train, score_test, rmse_msg, rmsle_msg)

# Baseline Models using function

# Regularisation

# Grid Search 

# Cross Validation