In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <center> Rental Bike Sharing Dataset </center>

<center> <img src="https://storage.googleapis.com/gweb-uniblog-publish-prod/images/image1_hH9B4gs.max-1000x1000.jpg"> </center>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import scipy.stats as st
from scipy.stats import levene
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

* importing the required libraries

In [None]:
# Read the daily rental records once; keep the raw frame untouched and do all
# further work on a copy.
DATA_PATH = "/kaggle/input/rental-bike-sharing/day.csv"
main_df = pd.read_csv(DATA_PATH)
df = main_df.copy()
df.head()

* Reading dataset and making a copy of it.

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df.isna().any()

In [None]:
sns.heatmap(df.isnull())

* No null value is present

In [None]:
# Correlation is only defined for numeric data. Restrict to numeric columns
# explicitly: pandas >= 2.0 raises on non-numeric columns (e.g. `dteday`)
# instead of silently dropping them.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(14,10))
sns.heatmap(numeric_corr, annot=True)
plt.show()

* The correlation heatmap shows how strongly each pair of numeric variables is linearly related.
* Values close to +1 or -1 signify high correlation; values near 0 signify little linear relationship.
* When two features are correlated above a chosen threshold (multicollinearity), one of them is usually dropped from the dataset.

In [None]:
# Rank feature pairs by absolute correlation (numeric columns only, so the
# cell also runs on pandas >= 2.0).
corr = df.select_dtypes(include='number').corr()

# Keep only the strict upper triangle: every pair then appears exactly once and
# the trivial self-correlations on the diagonal are excluded. This replaces a
# hard-coded slice that depended on the number of columns and on tie ordering.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
c1 = corr.abs().where(upper_mask).stack().dropna()
c1.sort_values(ascending = False).head(6)

* Here we can see that ("temp", "atemp"), ("instant", "yr") and ("season", "mnth") are highly correlated.
* So we can drop one feature from each highly correlated pair.

In [None]:
df['yr'].nunique()

* Number of unique values in year are 2

In [None]:
df['season'].nunique()

In [None]:
df['mnth'].unique()

In [None]:
# Convert the label-like columns to pandas' categorical dtype in a single call.
cate_cols = ["dteday", "season", "yr", "mnth", "holiday", "weekday", "workingday", "weathersit"]
df = df.astype({col: 'category' for col in cate_cols})

* Changing Datatypes into categorical

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Replace the abbreviated column names with readable ones.
readable_names = {
    'dteday': 'date',
    'yr': 'year',
    'mnth': 'month',
    'weathersit': 'weather',
    'hum': 'humidity',
    'cnt': 'count',
}
df = df.rename(columns=readable_names)
df.head()

* Renaming Columns

#### Univariate Analysis

In [None]:
df.describe()

* In temp :- mean value is 0.49, Median value is 0.49, max is 0.86, min is 0.05 .
* Similarly we can check for all other.

In [None]:
# Histogram + KDE of `temp`, with dashed red guides at the first quartile,
# the mean and the third quartile.
sns.displot(x="temp", data=df, kde=True)
description = df['temp'].describe()
# One loop instead of three copy-pasted calls; it also keeps the cell from
# echoing the Line2D repr of the last axvline.
for stat in ("25%", "mean", "75%"):
    plt.axvline(description[stat], ls="--", color='r')

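* From this plot we can also see that the centre of the distribution (mean ≈ median) is ~0.49; the dashed lines mark the first quartile, the mean and the third quartile.
* All values lie between 0 and 1. At first glance the shape may look normally distributed, but it is actually bimodal.
* Two peaks in the data usually indicate that there are two different groups.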

In [None]:
# Pass the series as `x=`: recent seaborn versions treat the first positional
# argument as `data`, which changes the orientation of the plot.
sns.boxplot(x=df['temp'], showmeans=True, color='red')

In [None]:
# Left: density histogram with KDE overlay. Right: normal Q-Q plot.
fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(15,4))
# sns.distplot is deprecated (and removed in current seaborn); histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(df['temp'], kde=True, stat='density', ax=ax1, color='red')
ax1.set(title='temp distribution')
# line='s' draws the standardized reference line the sample is compared to.
qqplot(df['temp'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')

* Distplots, KDE plots, histograms and Q-Q plots are used to check normality.
* Within the IQR the points fall slightly above and below the reference line, which suggests that the data is bimodal.

In [None]:
print("Upper limit ",df['temp'].mean() + 3*df['temp'].std())

In [None]:
print("Lower limit ",df['temp'].mean() - 3*df['temp'].std())

In [None]:
# Rows outside mean ± 3·std. The limits are computed from the data instead of
# being pasted (rounded) from the output of earlier cells, so the filter stays
# correct if the data changes.
temp_mean, temp_std = df['temp'].mean(), df['temp'].std()
df[(df['temp'] > temp_mean + 3*temp_std) | (df['temp'] < temp_mean - 3*temp_std)]

* Data points that fall below mean − 3σ or above mean + 3σ are treated as outliers.
* No rows satisfy that condition here, so `temp` has no outliers.

In [None]:
df['temp'].skew()

##### Casual User's feature for univariate analysis

In [None]:
# Histogram + KDE of `casual`, with dashed red guides at the first quartile,
# the mean and the third quartile.
sns.displot(x="casual", data=df, kde=True)
description = df['casual'].describe()
# One loop instead of three copy-pasted calls; it also keeps the cell from
# echoing the Line2D repr of the last axvline.
for stat in ("25%", "mean", "75%"):
    plt.axvline(description[stat], ls="--", color='r')

In [None]:
print("Upper limit ",df['casual'].mean() + 3*df['casual'].std())

In [None]:
print("Lower limit ",df['casual'].mean() - 3*df['casual'].std())

In [None]:
# Rows outside mean ± 3·std, with the limits computed from the data rather than
# hard-coded from the rounded values printed by earlier cells.
casual_mean, casual_std = df['casual'].mean(), df['casual'].std()
df[(df['casual'] > casual_mean + 3*casual_std) | (df['casual'] < casual_mean - 3*casual_std)]

In [None]:
# Number of rows outside mean ± 3·std (limits derived from the data, not
# hard-coded).
casual_mean, casual_std = df['casual'].mean(), df['casual'].std()
is_outlier = (df['casual'] > casual_mean + 3*casual_std) | (df['casual'] < casual_mean - 3*casual_std)
int(is_outlier.sum())

* The 8 rows above lie more than 3 standard deviations from the mean (outside the 99.7% range of the empirical rule).
* So these are the outliers.

In [None]:
# Pass the series as `x=`: recent seaborn versions treat the first positional
# argument as `data`, which changes the orientation of the plot.
sns.boxplot(x=df['casual'], showmeans=True, color='red')

* Box Plot is using following IQR range
*  above Q3 + 1.5*IQR
*  below Q1 - 1.5*IQR 

* The green marker represents the mean value.
* The outliers all lie on the right side, which means the data is right-skewed.
* A log transformation can be applied to bring it closer to a normal distribution.

In [None]:
# Left: density histogram with KDE overlay. Right: normal Q-Q plot.
fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(15,4))
# sns.distplot is deprecated (and removed in current seaborn); histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(df['casual'], kde=True, stat='density', ax=ax1, color='red')
ax1.set(title='casual user distribution')
# line='s' draws the standardized reference line the sample is compared to.
qqplot(df['casual'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')

* Data is right skewed and it is not normally distributed.

#### **Using Shortcut for checking Normality and Skewness using pandas skew function.**

In [None]:
# Skewness of every numeric column, most right-skewed first. numeric_only=True
# is required because the frame now also holds categorical columns, which
# pandas >= 2.0 no longer drops silently.
skew_val = df.skew(numeric_only=True).sort_values(ascending=False)
skew_val

###### Information :- 
            
| Feature          | Information                         |
| -----------------| ------------------------------------|
| Casual           | Right skewed.                       |
| Windspeed        | Moderately right skew               |
| Instant          | Symmetric (skew ≈ 0); zero skew alone does not imply normality |
| Registered       | Approximately symmetric (skew near 0) |
| Count            | Approximately symmetric (skew near 0) |
| Temp             | slightly left skewed                |
| humidity         | Moderately left Skew                |
| atemp            | Left Skewed                         |

#### Bivariate Analysis

In [None]:
# Distribution of the daily rental count per season, one box per season.
fig = px.box(df, x="season", y="count", color="season", width=1000, height=600)

def custom_legend_name(new_names, figure=None):
    """Rename the traces of a plotly figure, in order.

    Parameters
    ----------
    new_names : sequence of str
        Replacement names; the i-th name is assigned to the i-th trace.
    figure : plotly figure, optional
        Figure whose traces are renamed. Defaults to the notebook-level
        ``fig`` so existing one-argument calls keep working.

    Raises
    ------
    IndexError
        If more names are supplied than the figure has traces.
    """
    target = fig if figure is None else figure
    for i, new_name in enumerate(new_names):
        target.data[i].name = new_name

custom_legend_name(['season 1','season 2','season 3','season 4' ], figure=fig)
fig.show()

* In season 3 (fall) we have highest number of count and least in season 1.

In [None]:
fig = px.bar(df, x='season', y='count', color="season")
fig.show()

In [None]:
# 2x2 grid of bar plots: `count` aggregated per level of four categorical
# features. Panels are filled row by row in the order listed below.
fig, ax = plt.subplots(2,2, figsize = (14,8))
panel_features = ['weather', 'season', 'holiday', 'workingday']
# One loop instead of four copy-pasted calls; it also keeps the cell from
# echoing the Axes repr of the last call.
for axis, feature in zip(ax.ravel(), panel_features):
    sns.barplot(x = feature, y = 'count', data = df, saturation=0.90, ax = axis)

###### Information :- 
            
| Barplot                         | Information                                                                   |
| ------------------------------| ------------------------------------------------------------------------------|
| weather vs Count       | When weather is 1 (Clear, Partly cloudy) then count is highest.               |
| season vs Count        | In season 3 (fall) we have the highest count and the lowest in season 1 (spring). |
| holiday vs Count       | In holiday (1) time we have least number of count.                            |
| workingday vs Count    | On working days (neither holiday nor weekend) we have the highest count.      |

In [None]:
fig = px.histogram (df, x = "temp",  facet_row = "season",  template = 'plotly_dark')
fig.show ()

* In season 1 (spring)  when temp ranges from 0.25 to 0.29 then we have highest number of count.
* In season 2 (summer)  when temp ranges from 0.6 to 0.64 then we have highest number of count.
* In season 3 (fall)  when temp ranges from 0.7 to 0.74 then we have highest number of count.
* In season 4 (winter)  when temp ranges from 0.3 to 0.34 then we have highest number of count.

In [None]:
fig = px.histogram (df, x = "temp",  facet_row = "workingday",  template = 'plotly_dark')
fig.show ()

* In workingday 1  when temp ranges from 0.7 to 0.74 then we have highest number of count.
* In workingday 0  when temp ranges from 0.65 to 0.69 and 0.34 to 0.39 then we have highest number of count.

In [None]:
fig =  px.pie (df, names = "workingday", hole = 0.4, template = "gridon")
fig.show ()

In [None]:
fig =  px.pie (df, names = "weather", hole = 0.4, template = "plotly_dark")
fig.show ()

In [None]:
fig =  px.pie (df, names = "season", hole = 0.4, template = "plotly_dark")
fig.show ()

* From weather 1 we have highest number of count.
* From weather 3 we have least number of count.

In [None]:
fig = px.scatter (df, x = "count", y = "temp", color = "season", template = "plotly_dark",  trendline="ols")
fig.show ()

In [None]:
fig = px.scatter (df, x = "count", y = "temp", color = "season", template = "plotly_dark",  trendline="lowess")
fig.show ()

#### Multivariate Analysis

In [None]:
# Pairwise regression plots of the weather variables, split by year.
# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted by
# current releases. The backslash continuation was redundant inside parentheses.
sns.pairplot(data=df, vars=['temp', 'humidity', 'windspeed'],
             hue='year', kind='reg', diag_kind='kde', markers=['*','.'], height=5, palette='husl')

In [None]:
# Pairwise regression plots of the weather variables, split by working day.
# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted by
# current releases. The backslash continuation was redundant inside parentheses.
sns.pairplot(data=df, vars=['temp', 'humidity', 'windspeed'],
             hue='workingday', kind='reg', diag_kind='kde', markers=['*','.'], height=4, palette='husl')

In [None]:
df.head()

In [None]:
# Remove the columns that are not used as model inputs or targets below.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
unused_cols = ['instant', 'atemp', 'date',  'count', 'year']
df = df.drop(columns=unused_cols)

In [None]:
# NOTE(review): leftover scratch cell with no code — safe to delete.

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# One-hot encode the categorical features. drop_first=True omits the first
# level of each feature, so k levels become k-1 indicator columns.
categorycols=['season', 'month', 'weekday', 'weather','workingday','holiday']
df = pd.get_dummies(
    df,
    columns=categorycols,
    drop_first=True,
)
df.head()

In [None]:
df.shape

#### Model Building

In [None]:
# Two-target regression: `casual` and `registered` are predicted jointly from
# all remaining columns.
target_cols = ['casual', 'registered']
X = df.drop(columns=target_cols)
y = df[target_cols]

X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from sklearn.ensemble import  RandomForestRegressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## RandomForest

In [None]:
model_rf = RandomForestRegressor(random_state=42)

In [None]:
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

In [None]:
# Hold-out metrics for the random forest. MSE is computed once and reused for
# the RMSE line.
test_mse_rf = mean_squared_error(y_test, pred_rf)
print('Testing R2 Score: ', r2_score(y_test, pred_rf)*100)
print('Testing RMSE: ', np.sqrt(test_mse_rf))
print('Testing MAE: ', mean_absolute_error(y_test, pred_rf))
print('Testing MSE: ', test_mse_rf)

In [None]:
pred_rf[:4]

In [None]:
pred_rf_trn = model_rf.predict(X_train)

In [None]:
# Training-set metrics for the random forest, for comparison with the hold-out
# numbers. MSE is computed once and reused for the RMSE line.
train_mse_rf = mean_squared_error(y_train, pred_rf_trn)
print('Training R2 Score: ', r2_score(y_train, pred_rf_trn)*100)
print('Training RMSE: ', np.sqrt(train_mse_rf))
print('Training MAE: ', mean_absolute_error(y_train, pred_rf_trn))
print('Training MSE: ', train_mse_rf)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space sampled by RandomizedSearchCV for the random forest.
parameters={'n_estimators':[int(x) for x in np.linspace(50,2000,200)],
            'max_depth':[int(x) for x in np.linspace(1,50,30)],
            # scikit-learn >= 1.0 criterion names; the old "mse"/"mae" aliases
            # were removed in 1.2 and make every sampled candidate fail.
            'criterion':["squared_error", "absolute_error"],
            # An integer min_samples_split must be >= 2; a value of 1 makes the
            # fit raise, which the search records as a NaN score.
            'min_samples_split': [int(x) for x in np.linspace(2,50,30)],
            'min_samples_leaf': [int(x) for x in np.linspace(1,50,30)]}

In [None]:
# Randomized hyper-parameter search over `parameters`: 30 sampled
# configurations (n_iter=30), each evaluated with 5-fold cross-validation
# (cv=5). random_state=2 fixes which configurations are sampled.
rfm=RandomizedSearchCV(model_rf,parameters,cv=5,n_iter=30,n_jobs=-1,verbose=5,random_state=2)
rfm.fit(X_train,y_train)
# Cross-validated score of the best configuration found.
rfm.best_score_

In [None]:
rfmod=rfm.best_estimator_

In [None]:
rfmod

In [None]:
# `best_estimator_` has already been refit on the full training set by the
# search (refit=True is the default), so calling fit() again only repeated the
# same training run. Evaluate it on the hold-out set directly.
ypred=rfmod.predict(X_test)
r2_score(y_test,ypred)

## GradientBoostingRegressor

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Wrap the gradient-boosting regressor in MultiOutputRegressor so it can be
# fitted on the two-column target `y_train`.
gb_base = GradientBoostingRegressor(random_state=2)
model_gb = MultiOutputRegressor(gb_base)

model_gb.fit(X_train, y_train)
pred_gb = model_gb.predict(X_test)

In [None]:
# Hold-out metrics for the gradient-boosting model. MSE is computed once and
# reused for the RMSE line.
test_mse_gb = mean_squared_error(y_test, pred_gb)
print('Testing R2 Score: ', r2_score(y_test, pred_gb)*100)
print('Testing RMSE: ', np.sqrt(test_mse_gb))
print('Testing MAE: ', mean_absolute_error(y_test, pred_gb))
print('Testing MSE: ', test_mse_gb)

In [None]:
pred_gb[:4]

In [None]:
pred_gb_trn = model_gb.predict(X_train)

In [None]:
# Training-set metrics for the gradient-boosting model, for comparison with the
# hold-out numbers. MSE is computed once and reused for the RMSE line.
train_mse_gb = mean_squared_error(y_train, pred_gb_trn)
print('Training R2 Score: ', r2_score(y_train, pred_gb_trn)*100)
print('Training RMSE: ', np.sqrt(train_mse_gb))
print('Training MAE: ', mean_absolute_error(y_train, pred_gb_trn))
print('Training MSE: ', train_mse_gb)

#### Thank you for reading this notebook.