Project : Bike Count

In [None]:
#Import libraries
import os
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats

from sklearn.cross_validation import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression


In [None]:
os.chdir("E:\study\Data")

In [None]:
#read the data from the file
df=pd.read_csv("day2.csv")

In [None]:
train,test=train_test_split(df,test_size=0.2)

In [None]:
#Data Summary
df.shape
test.shape
train.shape
df.head(10)
df.dtypes


In [None]:
#converting to catagorical for data exploration
df["weekday"] = df.dteday.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,"%Y-%m-%d").weekday()])
df["mnth"] = df.dteday.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,"%Y-%m-%d").month])
df["season"] = df.season.map({1: "Spring", 2 : "Summer", 3 : "Fall", 4 :"Winter" })
df["weathersit"] = df.weathersit.map({1: " Clear + Few clouds + Partly cloudy + Partly cloudy",\
                                        2 : " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ", \
                                        3 : " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds", \
                                        4 :" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog " })

In [None]:
categoryVariableList = ["weekday","mnth","season","weathersit","holiday","workingday"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

In [None]:
df  = df.drop(["dteday"],axis=1)

In [None]:
#outlier analysis
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(20, 10)
sn.boxplot(data=df,y="cnt",orient="v",ax=axes[0][0])
sn.boxplot(data=df,y="cnt",x="season",orient="v",ax=axes[0][1])

sn.boxplot(data=df,y="cnt",x="workingday",orient="v",ax=axes[1][0])
sn.boxplot(data=df,y="cnt",x="weathersit",orient="v",ax=axes[1][1])

axes[0][0].set(ylabel='cnt',title="Box Plot On Count")
axes[0][1].set(xlabel='Season', ylabel='Count',title="Box Plot On Count Across Season")

axes[1][0].set(xlabel='Working Day', ylabel='Count',title="Box Plot On Count Across Working Day")
axes[1][1].set(xlabel='weather', ylabel='Count',title="Box Plot On Count Across Weathers")

In [None]:
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(20, 10)
sn.boxplot(data=df,y="cnt",x="holiday",orient="v",ax=axes[0][0])
sn.boxplot(data=df,y="cnt",x="mnth",orient="v",ax=axes[0][1])

sn.boxplot(data=df,y="cnt",x="yr",orient="v",ax=axes[1][0])
sn.boxplot(data=df,y="cnt",x="weekday",orient="v",ax=axes[1][1])

axes[0][0].set(xlabel="holiday",ylabel='cnt',title="Box Plot On Year")
axes[0][1].set(xlabel='month', ylabel='Count',title="Box Plot On Count Across months")

axes[1][0].set(xlabel='yr', ylabel='Count',title="Box Plot On Count Across holiday")
axes[1][1].set(xlabel='weekday', ylabel='Count',title="Box plot on Count Across Temperature ")

In [None]:
#Corelation analysis
corrMatt = df[["temp","atemp","casual","registered","hum","windspeed","cnt"]].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
fig,ax= plt.subplots()
fig.set_size_inches(20,10)
sn.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True)

In [None]:
# Average count by month
fig,(ax1)= plt.subplots(nrows=1)
fig.set_size_inches(15,3)
sortOrder = ["January","February","March","April","May","June","July","August","September","October","November","December"]
hueOrder = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]

monthAggregated = pd.DataFrame(df.groupby("mnth")["cnt"].mean()).reset_index()
monthSorted = monthAggregated.sort_values(by="cnt",ascending=False)
sn.barplot(data=monthSorted,x="mnth",y="cnt",ax=ax1,order=sortOrder)
ax1.set(xlabel='Month', ylabel='Avearage Count',title="Average Count By Month")



In [None]:
# Average count by Season
fig,(ax1)= plt.subplots(nrows=1)
fig.set_size_inches(15,3)
sortOrder = ["Spring","Summer","Fall","Winter"]
hueOrder = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]

monthAggregated = pd.DataFrame(df.groupby("season")["cnt"].mean()).reset_index()
monthSorted = monthAggregated.sort_values(by="cnt",ascending=False)
sn.barplot(data=monthSorted,x="season",y="cnt",ax=ax1,order=sortOrder)
ax1.set(xlabel='season', ylabel='Avearage Count',title="Average Count By Season")

In [None]:
Model building #Model 1

In [128]:
df=pd.read_csv("day2.csv")

In [129]:
#Detect and replace with NA
# #Extract quartiles
q75, q25 = np.percentile(df['holiday'], [75 ,25])

# #Calculate IQR
iqr = q75 - q25

In [130]:
#Calculate inner and outer fence
minimum = q25 - (iqr*1.5)
maximum = q75 + (iqr*1.5)

In [131]:
# #Replace Outliners with NA
df.loc[df['holiday'] < minimum,:'holiday'] = np.nan
df.loc[df['holiday'] > maximum,:'holiday'] = np.nan

In [132]:
#compute NA with mean
df['holiday'] = df['holiday'].fillna(df['holiday'].mean())

In [133]:
df['season'] = df['season'].fillna(df['season'].mean())

In [134]:
df['yr'] = df['yr'].fillna(df['yr'].mean())

In [135]:
df['mnth'] = df['mnth'].fillna(df['mnth'].mean())

In [136]:
#missing value analysis
missing_val = pd.DataFrame(df.isnull().sum())
missing_val

Unnamed: 0,0
instant,21
dteday,21
season,0
yr,0
mnth,0
holiday,0
weekday,0
workingday,0
weathersit,0
temp,0


In [164]:
##MAPE 
def MAPE(y_true,y_pred):
    mape=np.mean(np.abs((y_true-y_pred)/y_true))*100
    return mape

In [159]:
df=df.drop("dteday",axis=1)
df=df.drop("instant",axis=1)

df=df.drop("casual",axis=1)
df=df.drop("registered",axis=1)

KeyError: "labels ['dteday'] not contained in axis"

In [None]:
df=df.drop("atemp",axis=1)

In [None]:
df.head()


In [None]:
#Model 1 (Linear Regression)

In [166]:
train,test=train_test_split(df,test_size=0.2)

In [167]:
import statsmodels.api as sm
lModel = LinearRegression()

In [168]:
model = sm.OLS(train.iloc[:,10],train.iloc[:,0:10]).fit()

In [169]:
model.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.966
Model:,OLS,Adj. R-squared:,0.965
Method:,Least Squares,F-statistic:,1804.0
Date:,"Fri, 08 Feb 2019",Prob (F-statistic):,0.0
Time:,05:40:28,Log-Likelihood:,-4806.6
No. Observations:,584,AIC:,9631.0
Df Residuals:,575,BIC:,9670.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
season,516.1801,66.841,7.723,0.000,384.899,647.462
yr,2163.3488,75.868,28.514,0.000,2014.336,2312.362
mnth,-37.3813,21.162,-1.766,0.078,-78.945,4.182
holiday,-7.833e-13,7.68e-14,-10.204,0.000,-9.34e-13,-6.33e-13
weekday,101.6911,18.661,5.449,0.000,65.039,138.343
workingday,282.8389,81.607,3.466,0.001,122.554,443.124
weathersit,-715.0508,89.341,-8.004,0.000,-890.524,-539.577
temp,5633.0511,220.007,25.604,0.000,5200.935,6065.167
hum,395.1004,291.772,1.354,0.176,-177.969,968.170

0,1,2,3
Omnibus:,77.391,Durbin-Watson:,1.942
Prob(Omnibus):,0.0,Jarque-Bera (JB):,160.301
Skew:,-0.753,Prob(JB):,1.55e-35
Kurtosis:,5.079,Cond. No.,7.88e+17


In [170]:
predictions_lr= model.predict(test.iloc[:,0:10])

In [171]:
print ("MAPE  For Linear Regression: ", MAPE (test.iloc[:,10],predictions_lr))

MAPE  For Linear Regression:  22.73079404918339


In [None]:
# MAPE Error= 21%

In [None]:
#model 2 - Random Forest

In [None]:
#RMSLE func
def rmsle(y, y_,convertExp=True):
    if convertExp:
        y = np.exp(y),
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

In [None]:
yLabels=df["cnt"]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_validate, y_train, y_validate = train_test_split( df, yLabels, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfModel = RandomForestRegressor(n_estimators=100) 



In [None]:
#fit the model on the train data
rfModel.fit(X = X_train,y = np.log1p(y_train))


In [None]:
preds = rfModel.predict(X= X_validate)

In [None]:
print ("RMSLE Value For Random Forest: ",rmsle(np.exp(np.log1p(y_validate)),np.exp(preds),False))

In [None]:
#RMSLE Value is 0.20%