# Importing Libraries


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,VotingRegressor
# Must be imported before HistGradientBoostingRegressor on scikit-learn versions
# where the estimator is still experimental.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import MaxAbsScaler,PowerTransformer,MinMaxScaler,RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

%matplotlib inline
plt.style.use('bmh')

In [None]:
# Load the bike-sharing-demand train and test CSVs (paths are relative to the notebook).
train_df=pd.read_csv('../input/bike-sharing-demand/train.csv')
test_df=pd.read_csv('../input/bike-sharing-demand/test.csv')
# Preview the first rows of the training frame.
display(train_df.head())

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Summary statistics of the training frame, transposed so each column becomes a row.
display(train_df.describe().T)


In [None]:
# Number of fully duplicated rows in the training frame.
train_df.duplicated().sum()


In [None]:
# Missing-value count per column: training frame first, then test frame.
print(train_df.isnull().sum())
print(test_df.isnull().sum())

In [None]:
# Pairwise scatter plots of the weather columns and the three rider counts.
# The trailing ';' suppresses the text repr of the returned grid object.
sns.pairplot(train_df,vars = ['temp','windspeed','humidity','atemp','casual','registered','count'],height=2.5);

In [None]:
# Correlation heatmap of the training columns (lower triangle plus diagonal).
# Only numeric/boolean columns are correlated: `datetime` is still a plain string
# column at this point, and DataFrame.corr() raises on non-numeric columns in
# recent pandas versions.
cor_mat = train_df.select_dtypes(include=['number', 'bool']).corr()
# sns.heatmap hides cells where the mask is True, so hide everything strictly
# above the diagonal. An explicit boolean mask is used because masking with the
# correlation values themselves would leave visible any upper-triangle cell whose
# correlation is exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cmap='viridis_r', cbar=True, ax=ax)
ax.set_title("Columns Correlations");

In [None]:
def add_day_month_year(df):
    """Parse the ``datetime`` column and add calendar feature columns in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column formatted as ``%Y-%m-%d %H:%M:%S``.
        Values that do not match the format are coerced to ``NaT``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``datetime`` converted to timestamps and
        the columns ``Year``, ``Month``, ``Day``, ``Hour``, ``Minute``,
        ``second``, ``weekday`` (Monday=0 ... Sunday=6) and the boolean
        ``weekEnd`` appended in that order.
    """
    df['datetime'] = pd.to_datetime(
        df['datetime'],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )
    stamp = df['datetime'].dt
    df['Year'] = stamp.year
    df['Month'] = stamp.month
    df['Day'] = stamp.day
    df['Hour'] = stamp.hour
    df['Minute'] = stamp.minute
    df['second'] = stamp.second
    df['weekday'] = stamp.dayofweek
    # Weekend means Saturday (5) or Sunday (6). The earlier expression
    # `weekday * weekday >= 5` squared the weekday first (`*` binds tighter
    # than `>=`), which also flagged Thursday (9) and Friday (16).
    df['weekEnd'] = df['weekday'] >= 5
    return df

In [None]:
# Add the calendar feature columns to both frames (train first, then test).
train_df, test_df = map(add_day_month_year, (train_df, test_df))

In [None]:
# Display the training frame, now including the added calendar columns.
train_df

In [None]:
# List the distinct values of every column of the training frame.
train_df.apply(lambda x: x.unique())

In [None]:
# Remove the 'second' and 'Minute' columns from both frames, in place.
train_df.drop(['second', 'Minute'], axis=1, inplace=True)
test_df.drop(['second', 'Minute'], axis=1, inplace=True)

In [None]:
# Move 'datetime' from a column to the index in both frames.
train_df, test_df = (frame.set_index('datetime') for frame in (train_df, test_df))
# Keep the test timestamps: they become the 'datetime' column of the submission.
test_df_ID = test_df.index
train_df.head()

In [None]:
# Frame for the casual-rider model: every training column except the other two count columns.
casual_df = train_df.drop(columns=['registered', 'count'])
casual_df.head()

In [None]:
# Frame for the registered-rider model: every training column except the other two count columns.
registered_df = train_df.drop(columns=['casual', 'count'])
registered_df.head()

In [None]:
# Correlation heatmap for the registered-rider frame (lower triangle plus diagonal).
cor_mat = registered_df.corr()
# sns.heatmap hides cells where the mask is True, so hide everything strictly
# above the diagonal. An explicit boolean mask is used because masking with the
# correlation values themselves would leave visible any upper-triangle cell whose
# correlation is exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cmap='viridis_r', cbar=True, ax=ax)
ax.set_title("Columns Correlations");

In [None]:
# Correlation heatmap for the casual-rider frame (lower triangle plus diagonal).
cor_mat = casual_df.corr()
# sns.heatmap hides cells where the mask is True, so hide everything strictly
# above the diagonal. An explicit boolean mask is used because masking with the
# correlation values themselves would leave visible any upper-triangle cell whose
# correlation is exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cmap='viridis_r', cbar=True, ax=ax)
ax.set_title("Columns Correlations");

In [None]:
# Histogram of the raw 'registered' target.
registered_df['registered'].hist()

In [None]:
# Histogram of the raw 'casual' target.
casual_df['casual'].hist()

In [None]:
# Histogram + KDE for every column of registered_df, laid out on a 7x4 grid.
# Values are read from train_df, which registered_df was derived from.
# The loop variables `i` and `col` remain defined after the cell finishes.
plt.figure(figsize=(25, 25))
for i, col in enumerate(registered_df.columns):
    sns.histplot(train_df[col], kde=True, bins=10, ax=plt.subplot(7, 4, i + 1))

In [None]:
# Histogram + KDE for every column of casual_df, laid out on a 7x4 grid.
# Values are read from train_df, which casual_df was derived from.
# The loop variables `i` and `col` remain defined after the cell finishes.
plt.figure(figsize=(25, 25))
for i, col in enumerate(casual_df.columns):
    sns.histplot(train_df[col], kde=True, bins=10, ax=plt.subplot(7, 4, i + 1))

In [None]:
# Bar chart of 'registered' rentals for each hour of the day.
sns.catplot(x="Hour",y="registered",data=registered_df,kind='bar',height=5,aspect=1.5)
plt.show()

In [None]:
# Boolean flag for the hours 8, 17 and 18 (treated as rush hours for registered riders).
registered_df['rushHours'] = registered_df['Hour'].isin([8,17,18])


In [None]:
# Bar chart of 'casual' rentals for each hour of the day.
sns.catplot(x="Hour",y="casual",data=casual_df,kind='bar',height=5,aspect=1.5)
plt.show()

In [None]:
# Handle skewness: replace the 'registered' target and 'windspeed' with log1p of themselves.
# Not idempotent: re-running this cell applies the transform a second time.
registered_df['registered']=np.log1p(registered_df['registered'])
registered_df['windspeed']=np.log1p(registered_df['windspeed'])


In [None]:
# Handle skewness: replace the 'casual' target and 'windspeed' with log1p of themselves.
# Not idempotent: re-running this cell applies the transform a second time.
casual_df['casual']=np.log1p(casual_df['casual'])
casual_df['windspeed']=np.log1p(casual_df['windspeed'])


In [None]:
# Distribution of the 'registered' target after the log1p transform.
# The plot gets its own figure and axes; the earlier `plt.subplot(7, 4, i+1)`
# depended on a loop index `i` left over from a previous cell, so the panel
# position (and whether the cell ran at all) depended on execution history.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(registered_df['registered'], kde=True, bins=10, ax=ax)
ax.set_title('registered (log1p)');

In [None]:
# Distribution of 'windspeed' in registered_df after the log1p transform.
# The plot gets its own figure and axes; the earlier `plt.subplot(7, 4, i+1)`
# depended on a loop index `i` left over from a previous cell, so the panel
# position (and whether the cell ran at all) depended on execution history.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(registered_df['windspeed'], kde=True, bins=10, ax=ax)
ax.set_title('windspeed (log1p), registered frame');

In [None]:
# Distribution of the 'casual' target after the log1p transform.
# The plot gets its own figure and axes; the earlier `plt.subplot(7, 4, i+1)`
# depended on a loop index `i` left over from a previous cell, so the panel
# position (and whether the cell ran at all) depended on execution history.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(casual_df['casual'], kde=True, bins=10, ax=ax)
ax.set_title('casual (log1p)');

In [None]:
# Distribution of 'windspeed' in casual_df after the log1p transform.
# This cell previously plotted casual_df['casual'] a second time; 'windspeed' is
# the other column that was log-transformed in casual_df, mirroring the pair of
# plots made for registered_df.
# The plot gets its own figure and axes instead of `plt.subplot(7, 4, i+1)`,
# which depended on a loop index `i` left over from a previous cell.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(casual_df['windspeed'], kde=True, bins=10, ax=ax)
ax.set_title('windspeed (log1p), casual frame');

In [None]:
# Encode the boolean flags as integers with a fixed mapping: False -> 0, True -> 1.
# pd.factorize numbers values by order of first appearance, so the code given to
# True depended on which value happened to occur first in the frame, and a frame
# with a different first row (e.g. the test set) could receive the opposite mapping.
# The test-set features must be encoded with this same fixed mapping before prediction.
# astype(int) also yields a plain 1-D column, so no reshape is needed.
registered_df['rushHours'] = registered_df['rushHours'].astype(int)
registered_df['weekEnd'] = registered_df['weekEnd'].astype(int)
casual_df['weekEnd'] = casual_df['weekEnd'].astype(int)


In [None]:
# Preview the registered-rider frame after encoding.
registered_df.head()

In [None]:
# Preview the casual-rider frame after encoding.
casual_df.head()

In [None]:
# Separate the registered target from its features, then hold out 20% for validation.
Y_registered = registered_df['registered']
# In-place drop: registered_df holds only feature columns afterwards ('atemp' is removed too).
registered_df.drop(['registered', 'atemp'], axis=1, inplace=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    registered_df,
    Y_registered,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [None]:
def get_best_model(X_train, X_valid, y_train, y_valid):
    """Fit a set of baseline regressors and compare their validation RMSLE.

    Every candidate is fitted on ``(X_train, y_train)`` with default
    hyper-parameters, its parameters are printed, and it is scored on
    ``(X_valid, y_valid)``. The scores are printed as a table and drawn as a
    horizontal bar chart. Nothing is returned.

    The score is ``sqrt(mean_squared_log_error(...))`` computed on the targets
    exactly as they are passed in. Predictions go through ``abs`` first because
    ``mean_squared_log_error`` rejects negative values.

    Parameters
    ----------
    X_train, X_valid : array-like
        Training and validation feature matrices.
    y_train, y_valid : array-like
        Training and validation targets (must be non-negative for the metric).
    """
    voting_members = [('et', ExtraTreesRegressor()), ('hgr', HistGradientBoostingRegressor())]
    # Name/model pairs kept together so a label can never drift from its estimator.
    candidates = [
        ('RandomForestRegressor', RandomForestRegressor()),
        ('AdaBoostRegressor', AdaBoostRegressor()),
        ('BaggingRegressor', BaggingRegressor()),
        ('SVR', SVR()),
        ('LinearRegression', LinearRegression()),
        ('DecisionTreeRegressor', DecisionTreeRegressor()),
        ('ExtraTreesRegressor', ExtraTreesRegressor()),
        (' HistGradientBoostingRegressor', HistGradientBoostingRegressor()),
        ('VotingRegressor', VotingRegressor(estimators=voting_members)),
    ]
    scores = []
    for name, model in candidates:
        model.fit(X_train, y_train)
        print("model_name : ", name)
        print(model.get_params())
        valid_pred = model.predict(X_valid)
        scores.append(np.sqrt(mean_squared_log_error(y_valid, np.abs(valid_pred))))

    rmsle_frame = pd.DataFrame({
        'Modelling Algo': [name for name, _ in candidates],
        'RMSLE': scores,
    })
    print(f'{rmsle_frame}\n______________________________________________________________________________________')
    # catplot(height=...) is the current seaborn API; factorplot(size=...) was
    # deprecated and later removed, and the other plots in this notebook already
    # use catplot.
    sns.catplot(y='Modelling Algo', x='RMSLE', data=rmsle_frame, kind='bar', height=5, aspect=2)
    plt.show()


In [None]:
# Compare the baseline regressors on the current train/validation split.
get_best_model(X_train, X_valid, y_train, y_valid)

In [None]:
def rmsle(y_pred,y_true):
    """Root mean squared logarithmic error for values given on the log1p scale.

    Both inputs are mapped back with ``expm1`` and the RMSLE of the results is
    returned: ``sqrt(mean((log1p(pred) - log1p(true)) ** 2))``. The metric is
    symmetric, so the argument order does not affect the result.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and true values on the log1p scale, compared position by
        position.

    Returns
    -------
    float
        The root mean squared logarithmic error.
    """
    # Convert to plain arrays so the comparison is positional and never
    # re-aligned on a pandas index.
    pred_counts = np.expm1(np.asarray(y_pred, dtype=float))
    true_counts = np.expm1(np.asarray(y_true, dtype=float))
    # log1p keeps precision for small values where log(x + 1) rounds x away.
    log_diff = np.log1p(pred_counts) - np.log1p(true_counts)
    return np.sqrt(np.mean(log_diff ** 2))

# Wrap rmsle as a scikit-learn scorer. greater_is_better=False makes the scorer
# return the negated error, so GridSearchCV (which maximises the score) selects
# the parameters with the lowest RMSLE.
# The scorer calls the function as (y_true, y_pred) while rmsle is declared as
# (y_pred, y_true); the metric is symmetric, so the result is unaffected.
# make_scorer is imported with the other libraries in the first cell.
myScorer = make_scorer(rmsle, greater_is_better=False)

In [None]:
# Grid-search a HistGradientBoostingRegressor for the registered target
# (single-point grid, 10-fold CV, scored with the custom RMSLE scorer).
param = {'max_iter': [100], 'max_depth': [10], 'max_leaf_nodes': [15]}
HistGradientAlgo_registered = HistGradientBoostingRegressor()

gridSearch_HistGradientAlgo_registered = GridSearchCV(
    estimator=HistGradientAlgo_registered,
    param_grid=param,
    scoring=myScorer,
    cv=10,
    verbose=3,
)
gridSearch_HistGradientAlgo_registered.fit(X_train, y_train)

best_HistGradientAlgo_registered = gridSearch_HistGradientAlgo_registered.best_estimator_
# Despite the name, this is the estimator's default score on the training split.
bestHistGradientAlgo_testScore_registered = best_HistGradientAlgo_registered.score(X_train, y_train)

In [None]:
# Validation RMSLE of the tuned registered model (predictions and targets are both on the log1p scale).
pred_registered = best_HistGradientAlgo_registered.predict(X_valid)
print(rmsle(pred_registered,y_valid))

In [None]:
# Separate the casual target from its features, then hold out 20% for validation.
# X_train / X_valid / y_train / y_valid are rebound here and now refer to the casual split.
Y_casual = casual_df['casual']
# In-place drop: casual_df holds only feature columns afterwards ('atemp' is removed too).
casual_df.drop(['casual', 'atemp'], axis=1, inplace=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    casual_df,
    Y_casual,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [None]:
# Compare the baseline regressors on the current train/validation split.
get_best_model(X_train, X_valid, y_train, y_valid)

In [None]:
# Grid-search a HistGradientBoostingRegressor for the casual target
# (3 x 5 x 1 parameter combinations, 5-fold CV, scored with the custom RMSLE scorer).
param = {
    'max_iter': list(range(115, 118)),
    'max_depth': list(range(13, 18)),
    'max_leaf_nodes': [25],
}
HistGradientAlgo_casual = HistGradientBoostingRegressor()

gridSearch_HistGradientAlgo_casual = GridSearchCV(
    estimator=HistGradientAlgo_casual,
    param_grid=param,
    scoring=myScorer,
    cv=5,
    verbose=3,
)
gridSearch_HistGradientAlgo_casual.fit(X_train, y_train)

best_HistGradientAlgo_casual = gridSearch_HistGradientAlgo_casual.best_estimator_
# Despite the name, this is the estimator's default score on the training split.
bestHistGradientAlgo_testScore_casual = best_HistGradientAlgo_casual.score(X_train, y_train)

In [None]:
# Validation RMSLE of the tuned casual model (predictions and targets are both on the log1p scale).
pred_casual = best_HistGradientAlgo_casual.predict(X_valid)
print(rmsle(pred_casual,y_valid))

In [None]:
# Build the test feature matrix in a new frame so test_df stays untouched and
# this cell can be re-run without compounding the log transform.
test_features = test_df.drop(columns=['atemp'])
test_features['windspeed'] = np.log1p(test_features['windspeed'])
# Boolean flags use a fixed False -> 0 / True -> 1 mapping. pd.factorize assigns
# codes by order of first appearance, which can give the test set a different
# mapping from the training frames; the training frames must use this same
# fixed mapping.
test_features['rushHours'] = test_features['Hour'].isin([8, 17, 18]).astype(int)
test_features['weekEnd'] = test_features['weekEnd'].astype(int)

# The models predict on the log1p scale: invert with expm1, clip at zero (a
# negative log-scale prediction would otherwise become a negative count), round.
# The casual model was trained without the 'rushHours' column.
casual_log_pred = best_HistGradientAlgo_casual.predict(test_features.drop(columns=['rushHours']))
registered_log_pred = best_HistGradientAlgo_registered.predict(test_features)
pred_casual = np.round(np.clip(np.expm1(casual_log_pred), 0, None)).astype(int)
pred_registered = np.round(np.clip(np.expm1(registered_log_pred), 0, None)).astype(int)


In [None]:
# Wrap the registered predictions in a one-column frame named 'count'.
pred_registered = pd.DataFrame({'count': pred_registered})
pred_registered

In [None]:
# Wrap the casual predictions in a one-column frame named 'count'.
pred_casual = pd.DataFrame({'count': pred_casual})
pred_casual

In [None]:
# Submission frame: one row per test timestamp; total count = registered + casual.
# test_df_ID is the index named 'datetime', so it becomes the 'datetime' column.
predictions = test_df_ID.to_frame(index=False)
predictions['count'] = pred_registered['count'].add(pred_casual['count'])

In [None]:
# Display the submission frame.
predictions

In [None]:
# Write the submission to 'submission.csv' in the working directory, without the row index.
predictions.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")