In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#FE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
#from sklearn.impute import SimpleImputer #no missing vals
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

#regressors
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Cross validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Scoring metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import plot_roc_curve, auc, roc_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error

#https://www.kaggle.com/competitions/bike-sharing-demand/overview

In [2]:
# Load the prepared bike data; the first CSV column becomes the index and
# parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(
    'bike_time_features.csv',
    index_col=0,
    parse_dates=True,
)

# One frame per target: each drops the combined 'count' column and the
# other user type's column, leaving a single target column per frame.
df_reg = df.drop(columns=['count', 'casual'])
df_cas = df.drop(columns=['count', 'registered'])

In [3]:
# 70/30 train/test split for each target frame. Both calls use the same
# test_size and random_state on frames derived from the same `df`.
rg_train, rg_test = train_test_split(
    df_reg,
    test_size=0.3,
    random_state=12,
)
cs_train, cs_test = train_test_split(
    df_cas,
    test_size=0.3,
    random_state=12,
)

In [4]:
# Feature matrices: every column except the target of the respective frame.
Xtrain_rg, Xtest_rg = (frame.drop('registered', axis=1) for frame in (rg_train, rg_test))
Xtrain_cs, Xtest_cs = (frame.drop('casual', axis=1) for frame in (cs_train, cs_test))

# Target vectors: the single target column of each frame.
ytrain_rg, ytest_rg = rg_train['registered'], rg_test['registered']
ytrain_cs, ytest_cs = cs_train['casual'], cs_test['casual']

The data is now train/test split (TTS), separately for the `registered` and `casual` targets. The analyses below are run on this split data, since it looked promising compared to the data as a whole (the casual trends differed in a visually significant way), and the results are compared to the analyses I previously did on the entire data set.

# Regressor Testing function 
Originally written in another file. It makes testing efficient: only the column-transformer pipeline has to be adjusted between runs.

In [10]:
def Regressor_testing(Xtrain, ytrain, Xtest,ytest, Transformer, Model, model_name_str):
    
    '''
    Fit a transformer + regressor pipeline on a log1p-transformed target
    and report its scores, for quick comparison of different setups.

    Different transformers, datasets and models can be passed in to get a
    quick overview of results. Transformers should be defined outside of
    this function.

    Parameters
    ----------
    Xtrain, Xtest : feature data passed to the pipeline for fit / predict.
    ytrain, ytest : targets on their original scale; they must be valid
        input for np.log1p (and, for RMSLE, non-negative).
    Transformer : first pipeline step ('trans').
    Model : final pipeline step ('model'); it is fit on log1p(ytrain).
    model_name_str : label used in the printed summary.

    Returns
    -------
    list
        [score_train, score_test, RMSE, RMSLE], the same order as the
        columns used by regressor_output_table. The two scores come from
        pipeline.score (R2 for standard sklearn regressors) and are
        measured in log1p space; RMSE and RMSLE are measured on the
        original target scale.
    '''
    
    # The model is trained (and scored) against the log1p-transformed target.
    ytrainlog = np.log1p(ytrain)
    ytestlog = np.log1p(ytest)
    
    pipeline = Pipeline([ ('trans', Transformer), ('model', Model) ])
    pipeline.fit(Xtrain, ytrainlog)
    
    # Predictions come back in log1p space, so invert the transform before
    # comparing them with the untransformed ytest. expm1 of a negative
    # prediction lands in (-1, 0); clip at 0 because
    # mean_squared_log_error rejects negative predictions.
    ypred_log = pipeline.predict(Xtest)
    ypred = np.clip(np.expm1(ypred_log), 0, None)
    
    # Each metric is computed once and reused for both the message and the
    # returned list.
    score_train_mo = pipeline.score(Xtrain, ytrainlog)
    score_test_mo = pipeline.score(Xtest, ytestlog)  # held-out data, not the training data
    RMSE_mo = np.sqrt(mean_squared_error(ytest, ypred))
    RMSLE_mo = np.sqrt(mean_squared_log_error(ytest, ypred))
    
    score_train = f"R2 train score of {model_name_str} is {score_train_mo}"
    score_test = f"R2 test score of {model_name_str} is {score_test_mo}"
    RMSE = f"RMSE of {model_name_str} is {RMSE_mo}"
    RMSLE = f"RMSLE of {model_name_str} is {RMSLE_mo}"
    
    print(score_train, score_test, RMSE, RMSLE)
    
    model_output = [score_train_mo, score_test_mo, RMSE_mo, RMSLE_mo]
    
    return (model_output)  # can be passed on to regressor_output_table

In [1]:
def regressor_output_table(df_output, model_output, model_name_str):
    
    '''
    Append one model's metrics as a new row to a results table.

    model_output is wrapped into a one-row DataFrame indexed by
    model_name_str, with the columns R2_train, R2_test, RMSE and RMSLE,
    and concatenated below df_output. The combined frame is returned;
    df_output itself is not modified.
    '''
    
    metric_columns = ['R2_train', 'R2_test', 'RMSE', 'RMSLE']
    new_row = pd.DataFrame(
        data=[model_output],
        columns=metric_columns,
        index=[model_name_str],
    )
    
    return pd.concat([df_output, new_row])

### Pipeline Placeholders 
(can copy and paste for efficiency)

In [6]:
# Single-step pipelines, intended as reusable building blocks
# (e.g. as entries of a ColumnTransformer).
pipeline_scaler = Pipeline(steps=[('Scaler', StandardScaler())])

# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
pipeline_ohe = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown="ignore"))])

# Template: columns to keep unchanged and use in the model
#('pass', 'passthrough', [])

# Template: custom function applied to selected columns
#('', FunctionTransformer(), ['relevant_col'])