__Author__ - Vrisha Parekh

__Email__ - parekh.vrisha@gmail.com


__LinkedIn__ - https://bit.ly/VrishaParekh_LinkedIn

In [10]:
#importing libraries
import pandas as pd
import numpy as np
import scipy as sp
from numpy import mean
from numpy import std
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [11]:
#Load the data
def load_csv(file):
    #load csv file
    return pd.read_csv(file)


#Dropping the duplicates
def clean_df(df):
    data=df.drop_duplicates()
    return data


#Dropping columns
def drop_columns(df,columns):
#Dropping the instant column as we already have the index and dteday because our model would not process datetime feature.
#Dropping casual and registered as they have high influence on the response variable(cnt=casual+registered) 
    data=df.drop(columns,inplace=True,axis=1)
    return data


#Remove outliers based on the function
def remove_outlier(df):
    #Remove outliers from humidity column
    df=df[(df['hum']> 0.20 )]
    
    #Remove outliers from windspeed column
    df=df[(df['windspeed']<0.44)]
    return df
    
    
#Splitting the data into train and test sets
def split(df):
    
    X,y= df.loc[:,df.columns!='cnt'],df['cnt']
    
    X_train,X_test,y_train,y_test=sklearn_train_test_split(X,y,test_size=0.25,random_state=42)
    
    #Scaling the data
    scaler=StandardScaler()
    scaler.fit(X_train)
    split.transformed_X_train=scaler.transform(X_train)
    split.transformed_X_test= scaler.transform(X_test)
    return X_train,X_test,y_train,y_test


#Aplying baseline model
def model(model):
    #Linear regression model
    model= model
    fitted_model=model.fit(X_train,y_train)
    y_pred=fitted_model.predict(X_test)
    r2 = format(r2_score(y_test, y_pred),'.3f')
    rmse = format(np.sqrt(mean_squared_error(y_test, y_pred)),'.3f')
    mae = format(mean_absolute_error(y_test, y_pred),'.3f')

    result=pd.DataFrame({'Model':['Baseline_model'],'R-squared':[r2],'RMSE':[rmse],'MAE':[mae]})
    return result


#Checking for multicollinearity
def multicollinearity():
    X= df.loc[:,df.columns!='cnt']
    X_vif= X
    print(pd.Series([variance_inflation_factor(X_vif.values,i) for i in range (X_vif.shape[1])],index=X_vif.columns))
    model(LinearRegression())


    
#Fitting with cross validation
def fit_model(model):
    model=LinearRegression()
    fitted_model=model.fit(X_train,y_train)
    y_pred=fitted_model.predict(X_test)
    cv=KFold(n_splits=5,shuffle=True,random_state=1)
    score=cross_val_score(model, X_train,y_train, scoring='r2', cv=cv, n_jobs=-1)
    scores_2 =cross_val_score(model,X_train,y_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    print('R2 with cross val: %.3f (%.3f)' % (mean(score)*100, std(score)*100))
    print('MSE with cross val: %.3f (%.3f)' % (mean(scores_2)*100, std(scores_2)*100))
    r2 = format(r2_score(y_test, y_pred),'.3f')
    rmse = format(np.sqrt(mean_squared_error(y_test, y_pred)),'.3f')
    mae = format(mean_absolute_error(y_test, y_pred),'.3f')
    result_2 = pd.DataFrame({'Model':['Improvised_model'],'R-squared':[r2],'RMSE':[rmse],'MAE':[mae]})
    result_new = result.append(result_2)
    return result_new

    
#Checking the p-values of the dataset 
def OLS_model():
    
    X,y= df.loc[:,df.columns!='cnt'],df['cnt']

    lm=sm.OLS(y,X).fit()
    return lm.summary()

In [12]:

#Load the data
df=load_csv('day.csv')

#Dropping the duplicates
df=clean_df(df)

#Dropping columns
drop_columns(df,['instant','dteday','casual','registered'])

#Remove outliers based on the function
remove_outlier(df)


#Splitting the data into train and test sets
X_train,X_test,y_train,y_test=split(df)

#Aplying baseline model
result=model(LinearRegression())

#Checking for multicollinearity
multicollinearity()

#dropping the columns with high multicollinearity
drop_columns(df,['atemp','hum','season','weathersit'])

#Fitting with cross validation
result_new=fit_model(LinearRegression())
result_new=fit_model(RandomForestRegressor(n_estimators = 1000,random_state=1233))

#Checking the p-values of the dataset 
OLS_model()

season         21.247200
yr              1.954891
mnth           15.239929
holiday         1.099342
weekday         3.101564
workingday      3.276291
weathersit     13.171543
temp          493.399107
atemp         558.624052
hum            28.048426
windspeed       5.297042
dtype: float64
R2 with cross val: 76.608 (2.834)
MSE with cross val: -85080203.803 (10060330.512)
R2 with cross val: 76.608 (2.834)
MSE with cross val: -85080203.803 (10060330.512)


0,1,2,3
Dep. Variable:,cnt,R-squared (uncentered):,0.957
Model:,OLS,Adj. R-squared (uncentered):,0.957
Method:,Least Squares,F-statistic:,2301.0
Date:,"Thu, 01 Oct 2020",Prob (F-statistic):,0.0
Time:,17:13:55,Log-Likelihood:,-6098.9
No. Observations:,731,AIC:,12210.0
Df Residuals:,724,BIC:,12240.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
yr,2118.6846,74.412,28.472,0.000,1972.596,2264.773
mnth,85.0426,10.600,8.023,0.000,64.231,105.854
holiday,-488.9276,232.804,-2.100,0.036,-945.978,-31.877
weekday,73.9147,18.175,4.067,0.000,38.232,109.598
workingday,121.8623,80.430,1.515,0.130,-36.041,279.766
temp,6056.8932,183.435,33.019,0.000,5696.766,6417.021
windspeed,-2189.8336,388.302,-5.640,0.000,-2952.165,-1427.502

0,1,2,3
Omnibus:,97.683,Durbin-Watson:,0.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,163.957
Skew:,-0.853,Prob(JB):,2.5e-36
Kurtosis:,4.573,Cond. No.,83.1
