In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 0. Importing the important libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Importing the data

## 1.1 import `training data`

In [None]:
data = pd.read_csv('../input/bike-sharing-demand/train.csv')

## 1.2 import `testing data`

In [None]:
test_data = pd.read_csv('../input/bike-sharing-demand/test.csv')

# 2. Data Exploring


## 2.1 `training data`

In [None]:
data.head(n=10)

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.describe()

In [None]:
data.shape

In [None]:
data.columns

### 2.1.1 `Data Fields`
- datetime - hourly date + timestamp  
- season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
- holiday - whether the day is considered a holiday
- workingday - whether the day is neither a weekend nor holiday
- weather - 
   - 1: Clear, Few clouds, Partly cloudy
   - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
   - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
   - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated
- registered - number of registered user rentals initiated
- count - number of total rentals

## 2.2 `testing data`

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
test_data.isnull().sum()

# 3. Data preprocessing

In [None]:
# Correlation heatmap of the numeric columns only. At this point `datetime`
# is still a string column, and pandas >= 2.0 raises if non-numeric columns
# are passed to `corr()`, so they are excluded explicitly.
plt.figure(figsize=(10, 10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
plt.show()


In [None]:
list(data.columns)


In [None]:
str(data['season'].value_counts()).split('\n')
 #season - 1 = spring, 2 = summer, 3 = fall, 4 = winter

In [None]:
def get_values(data):
    """Return the text rendering of every column's value counts.

    For each column of ``data`` (in column order) the result of
    ``value_counts()`` is converted to a string and split on newlines,
    giving one list of text lines per column.
    """
    return [
        str(data[name].value_counts()).split('\n')
        for name in data.columns
    ]
get_values(data)

In [None]:
data.columns

In [None]:
def count_plot(columns):
    """Draw one count plot per column name in ``columns``.

    Each column is read from the notebook-level ``data`` frame and drawn on
    its own 8x6 figure.
    """
    for col in columns:
        plt.figure(figsize=(8, 6))
        sns.countplot(x=col, data=data)
        # Call show() (with parentheses) so each figure is actually rendered
        # before the next one is created.
        plt.show()
count_plot(['season', 'holiday', 'workingday', 'weather'])

In [None]:
data.columns

In [None]:
def dist_plot(columns):
    """Draw a density histogram with a KDE overlay for each column in ``columns``.

    Each column is read from the notebook-level ``data`` frame and drawn on
    its own 14x6 figure.
    """
    for col in columns:
        plt.figure(figsize=(14, 6))
        # `sns.distplot` is deprecated; `histplot` on a density scale with a
        # KDE overlay is the replacement recommended by seaborn.
        sns.histplot(x=data[col], kde=True, stat='density')
        # Call show() (with parentheses) so each figure is actually rendered.
        plt.show()
dist_plot(['temp', 'atemp', 'windspeed', 'humidity'])

In [None]:
def hist_plot(columns):
    """Draw one histogram per column name in ``columns``.

    Each column is read from the notebook-level ``data`` frame and drawn on
    its own 14x6 figure.
    """
    for col in columns:
        plt.figure(figsize=(14, 6))
        sns.histplot(data[col])  # distribution of the column's values
        # Call show() (with parentheses) so each figure is actually rendered.
        plt.show()
hist_plot(['casual', 'registered', 'count'])

In [None]:
data['datetime'] = pd.to_datetime(data['datetime'])

In [None]:
test_data.datetime = pd.to_datetime(test_data.datetime)

In [None]:
data['datetime']

In [None]:
# Derive calendar features from `datetime` for both frames with the same
# vectorised `.dt` accessors, so train and test are built identically
# (the training frame previously used a slower row-wise `apply`).
for frame in (data, test_data):
    stamps = frame['datetime']
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['day'] = stamps.dt.day
    frame['hour'] = stamps.dt.hour
    frame['dayofweek'] = stamps.dt.day_name()  # weekday name as a string


In [None]:
data.head(5)

In [None]:
test_data.head(5)

In [None]:
# Correlation heatmap including the new calendar features. `datetime` and the
# string-valued `dayofweek` column are not numeric, and pandas >= 2.0 raises
# if they reach `corr()`, so only numeric columns are kept.
plt.figure(figsize=(16, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
plt.show()

- I notice that there is a new relationship between `month` & `season`, so let's visualize it!

- #### Box plots use the `IQR` method to decide which data points are displayed inside the whiskers and which are marked as outliers.

Wikipedia Definition
The interquartile range (IQR), also called the midspread or middle 50%, or technically H-spread, is a measure of statistical dispersion, being equal to the difference between the 75th and 25th percentiles, or between the upper and lower quartiles: `IQR = Q3 − Q1`. In other words, the IQR is the first quartile subtracted from the third quartile; these quartiles can be clearly seen on a box plot of the data. It is a measure of dispersion similar to standard deviation or variance, but is much more robust against outliers.


<img src="https://miro.medium.com/max/1838/1*2c21SkzJMf3frPXPAR_gZA.png" width="700">

- We will clear the outliers values.
    - Okay, let's check!

In [None]:
plt.figure(figsize=(16,8))
sns.boxplot(x='dayofweek',y='count', data=data)
plt.show()

- As we can see from the graph above, the data is positively (right) skewed. Next we will look at more box plots and the outlier values.

<img src="https://upload.wikimedia.org/wikipedia/commons/c/cc/Relationship_between_mean_and_median_under_different_skewness.png" width="700">

In [None]:
plt.figure(figsize=(16,8))
sns.boxplot(x='season', y='count', data=data)
plt.xlabel("Seasons (1= spring,  2= summer,  3= fall,  4= winter)", fontsize=16)
plt.show()

- #### From the graph above, we can say that people rent more bikes in summer and fall.



In [None]:
# month_sorted = ['January', 'February', 'March', 'April ',  'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
plt.figure(figsize=(16,8))
sns.boxplot(x='month',y='count', data=data)
plt.show()

- We also notice that people rent more bikes in the summer and fall months, which are [ 'June', 'July', 'August', 'September', 'October' ]

In [None]:
plt.figure(figsize=(16,8))
sns.boxplot(x='hour',y='count', data=data) # as we can see there is difference for each hour. We need to use it !
plt.show()

- We can say that people prefer the morning and evening hours for renting bikes.

In [None]:
plt.figure(figsize=(16,8))
sns.boxplot(x='year',y='count', data=data) # bike were rented in 2012!
plt.show()

- #### More bikes were rented in 2012 than in 2011. 

In [None]:
# Overlay the distribution of hourly rental counts for each year.
plt.figure(figsize=(16,8))
plt.hist(data['count'][data['year'] == 2011], alpha=0.5, label='2011')
plt.hist(data['count'][data['year'] == 2012], alpha=0.5, label='2012', color='red')
plt.xlabel('count')
plt.ylabel('Frequency')
# The `label=` arguments are only displayed once a legend is created.
plt.legend()
plt.show()

In [None]:
data.tail()

In [None]:
# Convert the categorical `dayofweek` column (day-name strings) to integer
# codes so it can be used as a numeric model input.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the name -> code mapping from the training frame only.
# NOTE(review): LabelEncoder orders classes by sorted label, so the codes are
# presumably alphabetical rather than in calendar order — confirm this is
# acceptable for the models used.
data['dayofweek'] = le.fit_transform(data['dayofweek'])
# Reuse the fitted mapping so train and test share the same codes.
test_data['dayofweek'] = le.transform(test_data['dayofweek'])

In [None]:
data.tail()

In [None]:
df1=data.loc[:,['datetime','count']]
df1.set_index('datetime',inplace=True)
df1.head()

In [None]:
# Plot the rental counts as a time series; the legend is hidden because the
# frame holds a single series.
df1.plot(figsize=(12,5), legend=False)
plt.ylabel('Number of total rentals')
plt.title('Total rentals Time Series')
sns.despine(top=True)
# Run tight_layout last so the title and labels are included in the layout.
plt.tight_layout()
plt.show()

### *Search Outliers* 
#### Now, let's find the outliers


In [None]:
# Quartiles and inter-quartile range of every numeric column.
# `numeric_only=True` keeps the `datetime` column out of the result: since
# pandas 2.0 it is no longer excluded by default, and its presence would make
# Q1/Q3 carry a label that the frame no longer has once `datetime` becomes
# the index, breaking the label-aligned comparison used for outlier removal.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
data.set_index('datetime', inplace=True)

In [None]:
data['2011-01-19 23:00:00':]

In [None]:
# Apply the 1.5 * IQR rule only to continuous measurements. On flag or
# category columns the rule misfires: when more than 75% of rows share one
# value the IQR is 0, so every row with the other value (e.g. a rare
# `holiday` flag) would be discarded as an "outlier".
outlier_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
lower_fence = (Q1[outlier_columns] - 1.5 * IQR[outlier_columns]).astype(float)
upper_fence = (Q3[outlier_columns] + 1.5 * IQR[outlier_columns]).astype(float)
is_outlier = (
    (data[outlier_columns] < lower_fence) | (data[outlier_columns] > upper_fence)
).any(axis=1)
# .copy() yields an independent frame, so later column assignments on it do
# not trigger SettingWithCopyWarning.
train_without_outliers = data.loc[~is_outlier].copy()

In [None]:
# Reassign rather than drop in place: the frame comes from a boolean filter of
# `data`, and mutating such a slice in place raises SettingWithCopyWarning.
train_without_outliers = train_without_outliers.dropna()

In [None]:
print(data.info())
print('*'* 80)
print(train_without_outliers.info())

> #### We removed outliers data points.


In [None]:
(train_without_outliers == 0).sum()

We are going to fill in the rows where the wind speed is equal to zero.



In [None]:
plt.figure(figsize=(12, 7))
sns.boxplot(x='season',y='windspeed',data=train_without_outliers,palette='winter')
plt.show()

In [None]:
# Mark windspeed readings of exactly 0 as missing so they can be filled later.
# Uses `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
# `assign` builds a new frame instead of writing into a filtered slice.
train_without_outliers = train_without_outliers.assign(
    windspeed=train_without_outliers['windspeed'].replace(0, np.nan)
)
test_data['windspeed'] = test_data['windspeed'].replace(0, np.nan)

In [None]:
(train_without_outliers == 0).sum()

In [None]:
train_without_outliers['windspeed'].isnull().sum()


> #### Now we have replaced the zeros with NaN. We will fill the NaN values by interpolation, a common way to fill missing values in time-series data.

In [None]:
# Fill missing windspeed the same way in both frames: interpolate first so
# gaps are filled from their neighbours, then back-fill whatever interpolation
# cannot reach (NaNs at the very start of the series). Back-filling first
# would leave no gaps for the interpolation to work on.
train_without_outliers = train_without_outliers.assign(
    windspeed=train_without_outliers['windspeed'].interpolate().bfill()
)
test_data['windspeed'] = test_data['windspeed'].interpolate().bfill()

In [None]:
train_without_outliers['windspeed'].isnull().sum()


In [None]:
train_without_outliers.info()

# 5. Apply ML models 

## 5.1. Random Forest Regression


A Random Forest is an ensemble technique capable of performing both regression and classification tasks with the use of multiple decision trees and a technique called Bootstrap Aggregation, commonly known as bagging. What is bagging you may ask? Bagging, in the Random Forest method, involves training each decision tree on a different data sample where sampling is done with replacement.

`Train test split`

In [None]:
from sklearn.model_selection import train_test_split
# Predictor columns fed to the models; the target is the total rental count.
feature_columns = [
    'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
    'humidity', 'year', 'month', 'day', 'hour', 'dayofweek', 'windspeed',
]
X = train_without_outliers[feature_columns]
y = train_without_outliers['count']

In [None]:
# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
y_train

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler shared by the training split, the hold-out split and (later) the
# competition test data.
sc_X = MinMaxScaler()

# Learn the scaling parameters from the training split only ...
X_train = sc_X.fit_transform(X_train)
# ... and apply those same parameters to the hold-out split.
X_test = sc_X.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fix random_state so the bootstrap samples and feature choices — and hence
# the reported RMSE and the submission — are reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
rf_prediction = rf.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, rf_prediction)))

In [None]:
plt.scatter(y_test,rf_prediction)
plt.show()

In [None]:
plt.figure(figsize=(16,8))
plt.plot(rf_prediction[0:200],'r')
plt.plot(y_test[0:200].values)
plt.show()

## 5.2. Decision Tree Regression

The decision tree is a simple machine learning model for getting started with regression tasks.

Background: A decision tree is a flow-chart-like structure, where each internal (non-leaf) node denotes a test on an attribute, each branch represents the outcome of a test, and each leaf (or terminal) node holds a class label (or, for regression, a predicted numeric value). The topmost node in a tree is the root node. (see here for more details).

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fix random_state so tie-breaking between equally good splits, and therefore
# the reported RMSE, is reproducible across runs.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)
dt_prediction = dt_reg.predict(X_test)
# Root-mean-squared error on the hold-out split.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, dt_prediction)))

In [None]:
plt.scatter(y_test,dt_prediction)
plt.show()

> ### Now, We will use the test data.



In [None]:
test_data.head()

In [None]:
# Scale the competition test features with the scaler that was fitted on the
# training split. Use `transform`, not `fit_transform`: refitting on the test
# set would scale it with different min/max values than the ones the model
# was trained with, and would overwrite the fitted scaler.
test_feature_columns = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
                        'humidity', 'year', 'month', 'day', 'hour', 'dayofweek', 'windspeed']
test_data[test_feature_columns] = sc_X.transform(test_data[test_feature_columns])

In [None]:
# Predict on the scaled test features. The forest was fitted on the NumPy
# array returned by the scaler, so pass a plain array here too; handing it a
# DataFrame triggers scikit-learn's feature-name mismatch warning.
test_pred = rf.predict(
    test_data[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
               'humidity', 'year', 'month', 'day', 'hour', 'dayofweek', 'windspeed']].to_numpy()
)

In [None]:
test_pred

In [None]:
test_pred=test_pred.reshape(-1,1)

In [None]:
test_pred = pd.DataFrame(test_pred, columns=['count'])

In [None]:
df = pd.concat([test_data['datetime'], test_pred],axis=1)

In [None]:
df.head()

In [None]:
# Round to the nearest whole rental before casting: a bare astype('int')
# truncates toward zero and would bias every prediction downward.
df['count'] = df['count'].round().astype('int')

In [None]:
df.to_csv('submission1.csv' , index=False)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=1)
# knn.fit(X_train, y_train)
# knn_prediction = knn.predict(X_test)
# print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, knn_prediction)))

In [None]:
# from sklearn import linear_model
# reg = linear_model.Lasso(alpha=)
# reg.fit(X_train, y_train)
# reg_prediction = reg.predict(X_test)
# print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, reg_prediction)))

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train) # calculate mean and std and transform it 
# X_valid = scaler.transform(X_valid) 

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# classifier = LogisticRegression(max_iter = 250)
# classifier.fit(X_train,y_train)
# y_predict = classifier.predict(X_valid)
# lr = accuracy_score(y_valid, y_predict)

In [None]:
# lr

In [None]:
# y_test_predict = classifier.predict(test_data)