In [None]:
# Kaggle exposes the input data files under the read-only "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the full path of every file found there.

import os

input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the training and test CSV files into two DataFrames (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/bike-sharing-demand/train.csv',
                     '/kaggle/input/bike-sharing-demand/test.csv')
)

In [None]:
# Preview the first three rows of the training data.
train.head(3)

**A short description of the features**

* datetime - hourly date + timestamp
* season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday
* weather -
  
  1: Clear, Few clouds, Partly cloudy
  
  2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

  3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

  4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals

In [None]:
# Print the column summary of the training data (dtypes and non-null counts).
train.info()

In [None]:
# Summary statistics of train, transposed so that each column becomes one row.
train.describe().T

# Exploratory Data Analysis (EDA)

**Plots of the categorical variables**

In [None]:
# Count plots of the categorical variables, one panel per feature in a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 8)

# Name the column through x= (with data=train) instead of passing a Series
# positionally: the first positional argument of countplot means `x` in older
# seaborn releases and `data` in newer ones, while the keyword form is
# unambiguous in both.
# axes.flat walks the grid row by row: [0][0], [0][1], [1][0], [1][1].
for ax, feature in zip(axes.flat, ['season', 'holiday', 'workingday', 'weather']):
    sns.countplot(x=feature, data=train, ax=ax)

In [None]:
# Point plots of `count` against each categorical variable (2x2 grid).
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(16, 8)

# Unpack the grid into named panels: top row first, then bottom row.
(ax_season, ax_holiday), (ax_workingday, ax_weather) = axes

sns.pointplot(x='season', y='count', data=train, ax=ax_season)
sns.pointplot(x='holiday', y='count', data=train, ax=ax_holiday)
sns.pointplot(x='workingday', y='count', data=train, ax=ax_workingday)
sns.pointplot(x='weather', y='count', data=train, ax=ax_weather)

In [None]:
# Rows recorded under the worst weather category (weather == 4)
train.loc[train['weather'].eq(4)]

**Insight:**
*   season: Rental demand is high in summer and fall.
*   holiday/workingday: Rental counts are much higher on working days than on holidays.
*   weather: Rental counts are high in good weather.


---

**Plots of the continuous variables**

In [None]:
# Box plots of the continuous variables, one panel per variable in a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3)
fig.set_size_inches(18, 8)

continuous_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']

# Pass the values through x= explicitly: a Series given positionally is read as
# `x` by older seaborn releases and as `data` by newer ones, which changes the
# orientation of the box. The keyword form gives a horizontal box in both.
for ax, column in zip(axes.flat, continuous_columns):
    sns.boxplot(x=train[column], ax=ax)

# Only five variables are drawn, so hide the sixth (empty) panel of the grid.
axes[1][2].set_visible(False)

In [None]:
# Distribution plots of the continuous variables (2x3 grid; the last panel stays empty).
fig, axes = plt.subplots(2, 3)
fig.set_size_inches(18, 8)

top_row, bottom_row = axes

sns.distplot(train['temp'], ax=top_row[0])
sns.distplot(train['atemp'], ax=top_row[1])
sns.distplot(train['humidity'], ax=top_row[2])
sns.distplot(train['windspeed'], ax=bottom_row[0])
sns.distplot(train['count'], ax=bottom_row[1])

**Insight:**

*   count : There are a lot of outliers in count.
*   windspeed : Windspeed should be a continuous variable, but its distribution is not continuous. The missing values were probably filled in with 0, so I will use a random forest model to replace those zeros with reasonable values.

---


**Correlation matrix**

In [None]:
# Correlation matrix of the numeric columns, drawn as a lower-triangle heatmap.

# Restrict to numeric columns explicitly: at this point 'datetime' still holds
# the raw strings read from the CSV, and recent pandas versions raise instead of
# silently skipping non-numeric columns in DataFrame.corr().
corr = train.select_dtypes(include='number').corr()

# Boolean mask for sns.heatmap: True hides a cell. Hide the strict upper
# triangle so the diagonal and the lower triangle are displayed.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=corr,
            mask=mask,
            cmap='Oranges',
            square=True,
            annot=True,
            cbar=True,
            ax=ax)

# Feature Engineering 

**Outliers Detection**

In [None]:
# Remove outliers: keep only the rows whose `count` lies within three standard
# deviations of the mean `count`.
print('before delect train outlier: ', train.shape)

count_mean = train['count'].mean()
count_std = train['count'].std()

# .copy() makes the filtered frame independent of the original, so the column
# assignments performed on `train` afterwards do not write into a slice.
train = train[np.abs(train['count'] - count_mean) <= 3 * count_std].copy()

# This shape is measured after filtering, so label it "after".
print('after delect train outlier: ', train.shape)

**Feature Transformation of datetime**

In [None]:
# Map each English day name to an integer code, Sunday = 0 through Saturday = 6.
weekday = dict(zip(
    ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
    range(7),
))

In [None]:
# train dataset: parse the timestamp strings, then derive calendar features from them
train['datetime'] = pd.to_datetime(train['datetime'], format='%Y-%m-%d %H:%M:%S')

# year is encoded as 0 for 2011 and 1 for 2012
train['year'] = train['datetime'].dt.year.map({2011: 0, 2012: 1})
train['month'] = train['datetime'].dt.month
# day name -> integer code via the `weekday` lookup dict
train['weekday'] = train['datetime'].dt.day_name().map(weekday)
train['hour'] = train['datetime'].dt.hour

In [None]:
# test dataset: parse the timestamp strings, then derive calendar features from them
test['datetime'] = pd.to_datetime(test['datetime'], format='%Y-%m-%d %H:%M:%S')

# year is encoded as 0 for 2011 and 1 for 2012
test['year'] = test['datetime'].dt.year.map({2011: 0, 2012: 1})
test['month'] = test['datetime'].dt.month
# day name -> integer code via the `weekday` lookup dict
test['weekday'] = test['datetime'].dt.day_name().map(weekday)
test['hour'] = test['datetime'].dt.hour

In [None]:
# Preview the first three rows of train again.
train.head(3)

In [None]:
# Average `count` per hour, one line per weekday code.
fig, ax = plt.subplots()
fig.set_size_inches(12, 6)
sns.pointplot(data=train, x='hour', y='count', hue='weekday',
              estimator=np.average, palette='coolwarm', ax=ax)

In [None]:
# Average `count` per month, one line per weekday code.
fig, ax = plt.subplots()
fig.set_size_inches(12, 6)
sns.pointplot(data=train, x='month', y='count', hue='weekday',
              estimator=np.average, palette='coolwarm', ax=ax)

**Feature Transformation in categorical variable**

Two methods were tried:
* Leave-One-Out Encoding
* Astype to category

'Leave-One-Out Encoding' was not more effective than 'Astype to category', so the second method is used for the feature transformation.

In [None]:
# Leave-One-Out Encoding

# pip install category_encoders
# import category_encoders as ce

# def LOO_Encoding(feature):
#   global train, test
#   for i in feature:
#     encoder = ce.LeaveOneOutEncoder(cols=[i], sigma=0.05)
#     train[i] = encoder.fit_transform(train[i], train['count'])
#     test[i] = encoder.transform(test[i])
#   return print('Finishing LOO_Encoding in categorical variable:\n', feature)

# feature_LOO = ['season','holiday','workingday','weather',
#                'year','month','weekday','hour']
# LOO_Encoding(feature_LOO)

In [None]:
# Astype to category
category_feature = ['season','holiday','workingday','weather',
                    'year','month','weekday','hour']

# Cast every listed column to the pandas 'category' dtype, in both frames.
for frame in (train, test):
    for column in category_feature:
        frame[column] = frame[column].astype('category')

In [None]:
# Print the column summary of train again.
train.info()

**Missing Value Imputation in windspeed**

In [None]:
# Combine train & test into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; columns present in only one frame (casual, registered,
# count) are filled with NaN for the rows of the other frame.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# Distribution of windspeed over the combined data
fig, ax = plt.subplots()
sns.distplot(data['windspeed'], ax=ax)

In [None]:
# Predictor columns used to model windspeed.
windFeatures = ['season', 'weather', 'temp', 'atemp', 'humidity', 'year', 'month', 'weekday', 'hour']

# Partition the combined data: rows with windspeed exactly 0 versus all other rows.
windspeedIs0 = data.loc[data['windspeed'].eq(0)]
windspeedIsNot0 = data.loc[data['windspeed'].ne(0)]

# Features and target come from the rows with non-zero windspeed.
X, y = windspeedIsNot0[windFeatures], windspeedIsNot0['windspeed']

In [None]:
# Hold out 20% of the non-zero-windspeed rows to evaluate the windspeed model.
from sklearn.model_selection import train_test_split

x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X, y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
rfModel = RandomForestRegressor(random_state=42, n_estimators=1000).fit(x_train1, y_train1)
y_pred1 = rfModel.predict(x_test1)

In [None]:
# Score the windspeed model on the held-out rows: RMSE and RMSLE.
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Both scores are the square root of the corresponding sklearn metric.
wind_RMSE, wind_RMSLE = (
    np.sqrt(metric(y_test1, y_pred1))
    for metric in (mean_squared_error, mean_squared_log_error)
)

for label, score in (('wind_RMSE: ', wind_RMSE), ('wind_RMSLE: ', wind_RMSLE)):
    print(label, score)

In [None]:
# Replace the zero windspeed values with random-forest predictions.
# The model is refit on all rows with non-zero windspeed (X, y).
rfModel = RandomForestRegressor(n_estimators=1000, random_state=42)
rfModel.fit(X, y)
wind0Values = rfModel.predict(windspeedIs0[windFeatures])

# windspeedIs0 was obtained by boolean-filtering `data`; writing into it through
# .loc is a chained assignment. assign() returns a new frame with the column
# replaced, which is well defined regardless of pandas' copy/view rules.
windspeedIs0 = windspeedIs0.assign(windspeed=wind0Values)

# Stack the imputed rows back with the untouched rows (imputed rows come first;
# the original index labels are kept).
data = pd.concat((windspeedIs0, windspeedIsNot0), axis=0)

In [None]:
# Distribution of windspeed after the zero values were replaced
fig, ax = plt.subplots()
sns.distplot(data['windspeed'], ax=ax)

**Split train and test dataset**

In [None]:
# Rows with a `count` value form the training set; rows without one form the
# test set. Both are ordered by timestamp.
dataTrain = data.loc[data['count'].notnull()].sort_values('datetime')
dataTest = data.loc[data['count'].isnull()].sort_values('datetime')

# Keep the target and the test timestamps before the columns are dropped later.
dataTrain_Y, datetimeCol = dataTrain['count'], dataTest['datetime']

In [None]:
# Distribution of the raw target (dataTrain_Y)
fig, ax = plt.subplots()
sns.distplot(dataTrain_Y, ax=ax)

In [None]:
# Natural log of the target.
# NOTE(review): np.log(0) is -inf, so this presumes every training count is > 0 — confirm.
dataTrain_Y_log = dataTrain_Y.apply(np.log)

In [None]:
# Distribution of the log-transformed target (dataTrain_Y_log)
fig, ax = plt.subplots()
sns.distplot(dataTrain_Y_log, ax=ax)

**Delete unnecessary columns**

In [None]:
# Columns removed from both feature frames: the raw timestamp and the three rental-count columns.
dropFeatures = ['datetime','casual','registered','count']
dataTrain, dataTest = (
    frame.drop(columns=dropFeatures) for frame in (dataTrain, dataTest)
)

In [None]:
# Preview the first three rows of the training features.
dataTrain.head(3)

# Build machine learning models
* Linear regression
* Polynomial regression
* RandomForest regressor

In [None]:
# evaluate model effectiveness function
def evaluate_model_RMSE(modelName, y_test, y_pred):
    """Print the RMSE and RMSLE of `y_pred` against `y_test`.

    Both scores are the square root of the corresponding sklearn metric
    (mean_squared_error and mean_squared_log_error). Each score is computed
    before anything is printed.

    Parameters:
        modelName: label printed at the start of each line.
        y_test: reference target values.
        y_pred: predicted values to score.

    Returns:
        None; the two scores are only printed.

    Note:
        The printed labels always read 'train_RMSE' / 'train_RMSLE',
        whichever split the arguments come from.
    """
    rmse, rmsle = (
        np.sqrt(metric(y_test, y_pred))
        for metric in (mean_squared_error, mean_squared_log_error)
    )

    for label, score in ((' train_RMSE : ', rmse), (' train_RMSLE : ', rmsle)):
        print(modelName, label, score)

In [None]:
# Hold out 20% of the training rows (shuffled, fixed seed); features and the
# log target are passed as plain arrays.
x_train, x_test, y_train, y_test = train_test_split(
    dataTrain.values,
    dataTrain_Y_log.values,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Linear Regression: fit on the training split, score on the held-out split.
# fit() returns the estimator itself, so construction and fitting can be chained.
LR = LinearRegression().fit(x_train, y_train)
y_pred = LR.predict(x_test)

evaluate_model_RMSE('LinearRegression', y_test, y_pred)

In [None]:
# Polynomial regression: linear regression on a degree-3 polynomial feature expansion.
deg = 3
regressor_poly = PolynomialFeatures(degree=deg)

# Fit the expansion on the training split only, then apply that fitted
# transformer to the held-out split; a transformer should not be refit on
# evaluation data.
x_train_poly = regressor_poly.fit_transform(x_train)
x_test_poly = regressor_poly.transform(x_test)

LR = LinearRegression()
LR.fit(x_train_poly, y_train)

# Clamp negative predictions to 0 in one vectorised call (the result is a
# NumPy array); mean_squared_log_error rejects negative values.
y_pred_poly = np.clip(LR.predict(x_test_poly), 0, None)

evaluate_model_RMSE('LinearRegression Poly', y_test, y_pred_poly)

In [None]:
# RandomForestRegressor: fit on the training split, score on the held-out split.
rfModel = RandomForestRegressor(random_state=42, n_estimators=1000).fit(x_train, y_train)
y_pred_rf = rfModel.predict(x_test)

evaluate_model_RMSE('RandomForest', y_test, y_pred_rf)

# Choose model : RandomForest Regressor

In [None]:
# Refit the random forest on the full training set (log target), then predict
# the log counts for the test set.
rfModel = RandomForestRegressor(random_state=42, n_estimators=1000).fit(dataTrain, dataTrain_Y_log)
dataTest_Y_log = rfModel.predict(dataTest)

In [None]:
# Exponentiate the log-scale predictions and floor each value at 0.
dataTest_Y = list(map(lambda pred: max(0, pred), np.exp(dataTest_Y_log)))

In [None]:
# Submission frame: the test timestamps with the predicted counts in a `count` column.
result = datetimeCol.to_frame(name='datetime').assign(count=dataTest_Y)

In [None]:
# Write the predictions to bike_prediction.csv without the index column.
result.to_csv('bike_prediction.csv',index=False)