# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
%matplotlib inline

from sklearn.preprocessing import MaxAbsScaler,PowerTransformer,MinMaxScaler,RobustScaler

from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor

from scipy.stats import skew

from scipy import stats

# Loading Train and Test Data

In [None]:
import os

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# Read the competition's train and test tables.
train_df = pd.read_csv('../input/bike-sharing-demand/train.csv')
test_df = pd.read_csv('../input/bike-sharing-demand/test.csv')

# Keep the test timestamps aside; they are needed again for the submission file.
test_date = test_df['datetime']
display(train_df.head())

# EDA

In [None]:
# Summary statistics, transposed so that each column of the data is one row.
display(train_df.describe().T)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
train_df.info()
# Sanity check: rows whose rental count is negative (an empty frame means none).
train_df[train_df['count']<0]

In [None]:
# Duplicate check: number of rows that exactly repeat an earlier row.
train_df.duplicated().sum()


**⚡ No Duplicates**

In [None]:
# Missing values per column, for the train table first and then the test table.
for frame in (train_df, test_df):
    print(frame.isnull().sum())

**⚡ No Missing Data**

In [None]:
# Hypothesis to check: nothing is recorded for non-working days.
# Select the rows with workingday == 0 and look at how many 'count' values they hold.
z = train_df.loc[train_df['workingday'] == 0]
z['count'].shape

In [None]:
# Histogram of the target column; the label is right-skewed.
target_ax = sns.histplot(train_df['count'])
_ = target_ax.set(
    title="Visualizing Target column",
    xlabel="Rented bikes",
    ylabel="Frequency",
)

**⚡ The target is right-skewed :(**

In [None]:
# Distribution of every column, one histogram (with KDE) per panel of a 7x4 grid.
plt.figure(figsize=(25, 25))
for panel, col in enumerate(train_df.columns, start=1):
    plt.subplot(7, 4, panel)
    sns.histplot(train_df[col], kde=True, bins=10)

In [None]:
def drawFeatures_VS_y(df, target='count'):
    """Scatter-plot the mean of ``target`` for each distinct value of every feature.

    One figure is shown per feature column: the x axis holds the feature's
    distinct values and the y axis the mean of ``target`` over the rows that
    share that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'count'
        Name of the column whose mean is plotted. The default keeps the
        original single-argument call working.
    """
    for col in df.columns:
        # Grouping the target by itself can only produce the identity line and
        # asks pandas for the same column name twice, so the target is skipped,
        # the same way insights() skips it.
        if col == target:
            continue
        col_rental = df.groupby(col, as_index=False)[target].mean()
        sns.scatterplot(data=col_rental, x=col, y=target)
        plt.title(col)
        plt.show()

In [None]:
# Draw, for each column of the training frame, a scatter plot of the mean
# target value against that column's distinct values.
drawFeatures_VS_y(train_df)

In [None]:
# Mean of the label for every unique value of each column.
# Useful for spotting the most informative features and for deciding on
# feature engineering and encoding.
def insights(df):
    """Display, per feature, the mean 'count' of each distinct feature value.

    Every column except 'count' is grouped by its values; the resulting table
    is sorted by mean 'count' (highest first) and displayed transposed.
    """
    feature_cols = [name for name in df.columns if name != 'count']
    for col in feature_cols:
        mean_by_value = (
            df[[col, 'count']]
            .groupby([col], as_index=False)
            .mean()
            .sort_values(by='count', ascending=False)
        )
        display(mean_by_value.T)

In [None]:
# Show the per-value mean of the target for every column of the training frame.
insights(train_df)

In [None]:
# Calculate the skewness of each numeric feature.

def calc_skew(df):
    """Print the skewness of every numeric column of ``df``.

    Columns are chosen with ``select_dtypes(include='number')`` rather than by
    comparing dtypes against ``np.object``: that alias no longer exists in
    current NumPy releases, so the old comparison raised AttributeError.
    Only numeric dtypes are reported; text and boolean columns are skipped.

    Returns None; the report is written to standard output.
    """
    print("\nIF THE DATA IS HIGHLY SKEWED IF SKWENESS  > 1 OR < -1 \n")
    for col in df.select_dtypes(include='number').columns:
        print("the skewness of ",col,"is :",df[col].skew())

# Report the skewness of the training frame's columns.
calc_skew(train_df)

# Preprocessing And Feature Engineering

In [None]:
# Apply one transform to a chosen set of columns.
def transformation(df, columns, func):
    """Replace each listed column of ``df`` with ``func`` applied to it.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    columns : iterable of str
        Names of the columns to transform.
    func : callable
        Called once per column with that column as its only argument.
    """
    for name in columns:
        df[name] = df[name].pipe(func)
    return df

# Example call:
# transformation(test_df,['Temperature(°C)','Hour'],np.log1p)

In [None]:
# Add calendar features derived from the 'datetime' column.
def add_day_month_year(df):
    """Add Year, Month, weekday, weekofyear, dayofyear and Hour columns to ``df``.

    The 'datetime' column is parsed once and every feature is read from that
    parsed series. The frame is modified in place and the same object is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'datetime' column that ``pd.to_datetime`` can parse.
    """
    # The parsed series keeps df's own index, so each assignment below lines
    # up row for row whatever index the frame carries.
    stamps = pd.to_datetime(df['datetime'])
    df['Year'] = stamps.dt.year
    df['Month'] = stamps.dt.month
    df['weekday'] = stamps.dt.dayofweek
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement. It yields a nullable UInt32 column, so it is cast to a
    # plain integer dtype like the other features.
    df['weekofyear'] = stamps.dt.isocalendar().week.astype('int64')
    df['dayofyear'] = stamps.dt.dayofyear
    df['Hour'] = stamps.dt.hour
    return df


In [None]:
# Flag the hours in which bikes are rented the most (8, 17, 18, 19, 20, 21)
# and the early-morning hours (0-4).
def add_rush_hours(df):
    """Add boolean 'RushHour' and 'lowHour' columns based on the 'Hour' column.

    The frame is modified in place and the same object is returned.
    """
    peak_hours = [8, 17, 18, 19, 20, 21]
    quiet_hours = [0, 1, 2, 3, 4]
    hour = df['Hour']
    df['RushHour'] = hour.isin(peak_hours)
    df['lowHour'] = hour.isin(quiet_hours)
    return df

In [None]:
def add_day_or_night(df):
    """Add a boolean 'DayorNight' column: True when 'Hour' is from 7 to 20 inclusive.

    The frame is modified in place and the same object is returned.
    """
    # between() includes both end points, matching (Hour >= 7) & (Hour <= 20).
    df['DayorNight'] = df['Hour'].between(7, 20)
    return df

In [None]:
def label_encoding(df):
    """Replace every non-numeric column except 'datetime' with integer codes.

    Codes follow the sorted order of each column's distinct values
    (``sort=True``), so False/True always become 0/1. The previous
    order-of-first-appearance codes depended on which value happened to come
    first, which let a train frame and a test frame encoded by separate calls
    disagree on the meaning of a code.

    The frame is modified in place and the same object is returned. Missing
    values receive the code -1, which is how ``pd.factorize`` marks them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; numeric columns and 'datetime' are left untouched.
    """
    cat_features = df.select_dtypes(exclude=["number"])
    for col in cat_features.columns:
        if col == 'datetime':
            continue
        # factorize already returns a 1-D code array, which is what a single
        # column needs; no reshape to a column vector is required.
        codes, _uniques = pd.factorize(df[col], sort=True)
        df[col] = codes
    return df


In [None]:
def preprocessing(df):
    """Run the feature-engineering steps on ``df`` in order and return the result.

    Steps: calendar features, rush/low hour flags, day/night flag, then label
    encoding of the non-numeric columns. Each step receives the frame returned
    by the previous one.
    """
    return (
        df.pipe(add_day_month_year)
          .pipe(add_rush_hours)
          .pipe(add_day_or_night)
          .pipe(label_encoding)
    )

In [None]:
# Run the same preprocessing pipeline on the train table and then the test table.
train_data, test_data = (preprocessing(frame) for frame in (train_df, test_df))

In [None]:
# Preview the training frame after preprocessing.
train_data.head()

In [None]:
# List the columns of the preprocessed test frame.
test_data.columns

In [None]:
# Columns to log-transform with log1p.
cols=['count','registered','casual']

# transformation() overwrites columns of the frame it is given. Handing it a
# copy leaves train_data untouched, so running this cell again starts from the
# untransformed values instead of applying log1p a second time.
final_train_data=transformation(train_data.copy(),cols,np.log1p)


# Grid Search

### Splitting Data into Features and target

In [None]:
# Target: the 'count' column, kept as a one-column DataFrame.
# It must be taken before the drop below removes 'count' from the frame.
Y=final_train_data[['count']]


# Remove the target and the unused columns from the training features, in place.
# 'registered' and 'casual' are dropped only here; the test drop list below does
# not name them (presumably they are absent from the test file - confirm).
final_train_data.drop(columns=['datetime','count','atemp','Year','registered','casual'],inplace = True)
# Drop the matching non-feature columns from the test frame, in place.
# NOTE: both drops are in place, so re-running this cell raises a KeyError.
test_data.drop(columns=['datetime','atemp','Year'],inplace = True)


In [None]:
# 80/20 hold-out split of features and target with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    final_train_data,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Evaluation metric
def rmsle(y_pred, y_true):
    """Root mean squared log error for values supplied on the log1p scale.

    Both arguments are mapped back with ``expm1``; the error is then the
    root of the mean squared difference of ``log(value + 1)``.
    """
    restored_pred = np.expm1(y_pred)
    restored_true = np.expm1(y_true)
    log_gap = np.log(restored_pred + 1) - np.log(restored_true + 1)
    return np.sqrt(np.mean(log_gap ** 2))

from sklearn.metrics import make_scorer
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False declares it a
# loss, so the scorer reports the negated value and "higher is better" holds
# during model selection.
# scikit-learn calls the function as (y_true, y_pred) while rmsle names its
# parameters (y_pred, y_true); the squared difference is symmetric, so the
# result is the same either way.
myScorer = make_scorer(rmsle, greater_is_better=False)

In [None]:
# Print column names, dtypes and non-null counts of train_data.
train_data.info()

In [None]:
HistGradient = HistGradientBoostingRegressor()

# Search grid: one candidate value per hyper-parameter. Add more values to a
# list to widen the search.
param = {
    'max_iter':[115],
    'max_depth' : [11],
    'max_leaf_nodes':[15],
    'max_bins':[150]
        }
# 10-fold cross-validated search, scored with the notebook's RMSLE scorer.
gridSearch_HistGradient = GridSearchCV(HistGradient,param,scoring=myScorer,cv=10,verbose=3)
gridSearch_HistGradient.fit(X_train,y_train.values.ravel())

best_HistGradient = gridSearch_HistGradient.best_estimator_
# Score the refit model on the hold-out split. This value was previously
# computed on the rows the model was trained on, which does not measure
# generalisation despite the "testScore" name.
# .score() is the estimator's default metric (R^2 for scikit-learn regressors).
bestHistGradient_testScore=best_HistGradient.score(X_valid, y_valid.values.ravel())

In [None]:
# Hyper-parameter combination selected by the grid search.
gridSearch_HistGradient.best_params_

In [None]:
# Value returned by best_HistGradient.score() in the grid-search cell.
bestHistGradient_testScore

In [None]:
# Predict the hold-out split with the best estimator and print its RMSLE,
# computed by the notebook's rmsle helper.
pred=best_HistGradient.predict(X_valid)
print(rmsle(pred,y_valid.values.ravel()))

In [None]:
 r = permutation_importance(gridSearch_HistGradient, X_valid, y_valid.values.ravel(),
                            n_repeats=30)

for i in r.importances_mean.argsort()[::-1]:

    print(f"{X_train.columns[i]} "
           f"{r.importances_mean[i]:.3f} "
           f" +/- {r.importances_std[i]:.3f}")

In [None]:
# Horizontal bar chart of the mean permutation importance of each feature.
importance_fig, importance_ax = plt.subplots(figsize=(10, 7))
importance_ax.barh(X_train.columns, r.importances_mean)

In [None]:
# Preview the first 20 rows of the test features passed to the model.
test_data.head(20)

In [None]:
# Predict on the test features, undo the log1p scale with expm1, then cut off
# the fractional part to get whole numbers.
pred = np.expm1(best_HistGradient.predict(test_data))
pred = np.fix(pred).astype(int)

# Pair each prediction with the test timestamp saved when the data was loaded.
predictions = pd.DataFrame({'datetime': test_date, 'count': pred})

# 😄 Generating Submission File

In [None]:
# Write the predictions to submission.csv without the index column.
predictions.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Preview the first 10 rows of the submission frame.
predictions.head(10)