In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#FE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
#from sklearn.impute import SimpleImputer #no missing vals
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

#regressors
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Cross validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Scoring metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import plot_roc_curve, auc, roc_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

#https://www.kaggle.com/competitions/bike-sharing-demand/overview

In [2]:
#pip install xgboost
df= pd.read_csv('bike_time_features.csv', index_col=0, parse_dates=True)
df_reg = df.drop(['count', 'casual'], axis=1)
df_cas = df.drop(['count', 'registered'], axis = 1)

### Train, test, split for different data combos

Based on the EDA in the other notebooks, this will be done comparing regressors fit on the entire data, and regressors fit on the casual and registered data separately (with overall avg performance as comparison to entire dataset)

In [3]:
cn_train, cn_test = train_test_split(df, test_size=0.3, random_state=12)
rg_train, rg_test = train_test_split(df_reg, test_size=0.3, random_state=12)
cs_train, cs_test = train_test_split(df_cas, test_size=0.3, random_state=12)

In [4]:
Xtrain_cn = cn_train.drop(columns=['count', 'registered', 'casual'])
Xtrain_rg = rg_train.drop(columns=['registered'])
Xtrain_cs = cs_train.drop(columns=['casual'])

Xtest_cn = cn_test.drop(columns=['count', 'registered', 'casual'])
Xtest_rg = rg_test.drop(columns=['registered'])
Xtest_cs = cs_test.drop(columns=['casual'])

In [5]:
ytrain_cn = cn_train['count']
ytrain_rg = rg_train['registered']
ytrain_cs = cs_train['casual']

ytest_cn = cn_test['count']
ytest_rg = rg_test['registered']
ytest_cs = cs_test['casual']

In [83]:
#df.info() #for FE menu below

##### Regressor pipeline placeholders (for visualising whats in function) 

pipeline_LR = Pipeline([
    ('trans',transformer),
    ('model',LinearRegression())
])

pipeline__RF = Pipeline([
    ('trans',transformer),
    ('model',RandomForestRegressor(max_depth = 10, random_state=0)) #func for testing on diff depths later..see below
])

pipeline_PR = Pipeline([
    ('trans',transformer),
    ('model',PoissonRegressor(max_iter=200)) #func for testing iterations later
])

# Scoring Metrics
aside from accuracy that is...

RMSE: how erroneous is the model performing, depicts absolute error, if high model is quite erroneous...


RMSLE: what is the relative error of this model, log transformed values, doesnt penalize error term because of outliers as much, larger penalty for underestimation of y than overestimation (good for business cases, underestimation undesirable) 

# Function for overview // efficiency! 

Function for different pipeline models 

In [65]:
def Regressor_testing(Xtrain, ytrain, Xtest,ytest, Transformer, Model, model_name_str):
    
    '''
    Function to easily implement different models and transformers for quick testing
    Different transformers, datasets and models can be input to quickly get an overview of results
    Transformers should be defined outside of func (poss with other func?)
    '''
    ytrainlog = np.log1p(ytrain)
    ytestlog = np.log1p(ytest)
    
    pipeline = Pipeline([ ('trans', Transformer), ('model', Model) ])
    pipeline.fit(Xtrain, ytrainlog)
    ypred = pipeline.predict(Xtest)
    
    score_train = "R2 train score of" + model_name_str + 'is ' + str(pipeline.score(Xtrain, ytrainlog))
    score_test = "R2 test score of" + model_name_str + 'is ' + str(pipeline.score(Xtest, ytestlog))
    
    RMSE = "RMSE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_error(ytest, ypred))) 
    RMSLE = "RMSLE of"+ model_name_str + 'is ' +  str(np.sqrt(mean_squared_log_error(ytest, ypred))) 
    
    
    return (score_train, score_test, RMSE, RMSLE)                        

## Different pipelines to be used in transformers

In [66]:
pipeline_scaler = Pipeline([
            ('Scaler', StandardScaler())
])

pipeline_ohe = Pipeline([
            ('ohe', OneHotEncoder(handle_unknown="ignore"))
])

#cols to keep the same and use in model
#('pass', 'passthrough', [])

#custom func
#('', FunctionTransformer(), ['relevant_col'])

('pass', 'passthrough', [])

# Baseline Models (lin reg and entire dataset) 

### Models using basically all features with different engineering combos

#### Model 1: most simple,two trials (one with temp, one with atemp), and all relevant cat feats, NO polynomials

In [68]:
#FE
cat_feat1 = ['hour', 'weekofyear', 'weather'] #OHE 

num_feat1a = ['atemp', 'humidity', 'windspeed' ]

num_feat1b = ['temp', 'humidity', 'windspeed' ]

In [69]:
#transformer
T1 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1a),
    ('ohc',pipeline_ohe,cat_feat1)])

In [70]:
#model a output
Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1, LinearRegression(), ' LinRegM1 ')

('R2 train score of LinRegM1 is 0.8062386884181659',
 'R2 test score of LinRegM1 is 0.8044214096982655',
 'RMSE of LinRegM1 is 262.3054225839597',
 'RMSLE of LinRegM1 is 3.1489362688705955')

In [71]:
#model b output
T1b = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1b),
    ('ohc',pipeline_ohe,cat_feat1)])

In [72]:
#model a output
Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1b, LinearRegression(), ' LinRegM1b ')

('R2 train score of LinRegM1b is 0.8063357052293839',
 'R2 test score of LinRegM1b is 0.8048731773556155',
 'RMSE of LinRegM1b is 262.3049695348988',
 'RMSLE of LinRegM1b is 3.1489682458412047')

In [75]:
#with holiday and working day (forgot pass through statement above)
T1_b = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat1b),
    ('ohc',pipeline_ohe,cat_feat1),
    ('pass','passthrough',['holiday','workingday']) #already OHE..try adding in cat in next anyways? 
])

In [76]:
Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T1b, LinearRegression(), ' LinRegM1b ') #same in anycase

('R2 train score of LinRegM1b is 0.8063357052293839',
 'R2 test score of LinRegM1b is 0.8048731773556155',
 'RMSE of LinRegM1b is 262.3049695348988',
 'RMSLE of LinRegM1b is 3.1489682458412047')

Test score slightly higher for b, RMSE and RMSLE v close, so use temp from now on probs 

#### Model 2: same as above with polynomial features (all numeric)

In [78]:
cat_feat2 = ['hour', 'weekofyear', 'weather', 'holiday', 'workingday'] 

num_feat2 = ['temp', 'humidity', 'windspeed' ]

poly_feat2 = ['temp', 'humidity', 'windspeed' ]

In [81]:
T2 = ColumnTransformer([
    ('scaler',pipeline_scaler,num_feat2),
    ('ohc',pipeline_ohe,cat_feat2),
    ('poly',PolynomialFeatures(degree=5,interaction_only=True,include_bias=False),poly_feat2)
])

In [82]:
Regressor_testing(Xtrain_cn, ytrain_cn, Xtest_cn, ytest_cn, T2, LinearRegression(), ' LinRegM2 ')

('R2 train score of LinRegM2 is 0.8076964880392206',
 'R2 test score of LinRegM2 is 0.8063047310573082',
 'RMSE of LinRegM2 is 262.3042491284425',
 'RMSLE of LinRegM2 is 3.148391018028354')

R scores better, RMSE and RMSLE same

#### Model 3: same as above with some interaction features

Add into func: regularisation, grid search and cross val