In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


# Prepare Dataset

Load the data set that was preprocessed in the `data_preparation.ipynb` notebook.

In [None]:
# Load the prepared rides; the first CSV column is used as the index and the
# id/name/user-type columns are read with explicit dtypes.
df_boston = pd.read_csv(
    './data/prepared/rides_data_prepared.csv',
    index_col=0,
    dtype={
        'start_station_id': np.int64,
        'end_station_id': 'string',
        'end_station_name': 'string',
        'start_station_name': 'string',
        'bike_id': np.int64,
        'user_type': 'string',
    },
)
# Parse the ride start timestamps so they can be resampled later on.
df_boston['start_time'] = pd.to_datetime(df_boston['start_time'], format='%Y-%m-%d %X')

df_boston.head()

## Prepare Data set
### Calculate Demand per Hour

First of all, it is necessary to transform the data set so that we obtain a time series with hourly frequency that provides the demand for bike rentals, i.e. the number of rides started in each hour.

In [None]:
# Count the rides that start in each hourly bin; the bin timestamp becomes
# `date_time` and the count becomes `demand`.
df_demand = (
    df_boston
    .resample(rule='H', on='start_time')
    .size()
    .rename('demand')
    .rename_axis('date_time')
    .reset_index()
)
df_demand.head()


### Include Weather Data
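Now that the hourly demand is available, we merge it with the hourly weather data for Boston so that the weather conditions can be used as features.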

In [None]:
# Read the hourly weather observations and parse their timestamps.
df_weather = pd.read_csv('./data/weather_hourly_boston.csv')
df_weather['date_time'] = pd.to_datetime(df_weather['date_time'], format='%Y-%m-%d %X')

# Left join keeps every demand hour, even when no weather row matches it.
df_demand = pd.merge(df_demand, df_weather, on='date_time', how='left')
df_demand.head()

# Feature Engineering

## Previous demand as input

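As we have time series data, it is a common approach to use the demand from previous hours (or days etc.) as an input for the prediction. The assumption we hereby make is that the factors that influence the demand have not changed dramatically within the last hours. We have decided to use the demand of the previous hour, of two hours before, of the same hour on the previous day and of the same hour one week before, as well as the average demand over the previous week.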

In [None]:
# Lagged demand features. df_demand has one row per hour, so shifting by k
# rows yields the demand observed k hours earlier. The first k rows of each
# lag column are NaN because no earlier observation exists.
df_demand['demand_h-1'] = df_demand['demand'].shift(periods=1)      # previous hour
df_demand['demand_h-2'] = df_demand['demand'].shift(periods=2)      # two hours ago
df_demand['demand_h-24'] = df_demand['demand'].shift(periods=24)    # same hour, previous day
df_demand['demand_w-1'] = df_demand['demand'].shift(periods=168)    # same hour, previous week

# Mean demand over the 168 hours preceding the current hour. The shift by one
# row keeps the current hour's demand (the prediction target) out of the window.
df_demand['demand_average_w-1'] = (
    df_demand['demand'].shift(periods=1).rolling(window=168).mean()
)
df_demand.head()

## Average temperature

In [None]:
# Midpoint between the reported minimum and maximum temperature of each row.
df_demand['avg_temp'] = df_demand['min_temp'].add(df_demand['max_temp']).div(2)

# Deviation of that midpoint from its mean over the whole data set.
overall_avg_temp = df_demand['avg_temp'].mean()
df_demand['avg_temp_dev'] = df_demand['avg_temp'].sub(overall_avg_temp)
df_demand.head(10)

### Temperature change within hour

In [None]:
# Difference between minimum and maximum temperature of each row.
# NOTE(review): computed as min - max, so values are <= 0 whenever
# min_temp <= max_temp - confirm that this sign is intended.
df_demand['temp_change'] = df_demand['min_temp'].sub(df_demand['max_temp'])

### Season 

In [None]:
def determine_season(row):
    """Return the season of 2015 that ``row.date_time`` falls into.

    The season boundaries are fixed to the year 2015: spring starts on
    20 March, summer on 21 June, autumn on 23 September and winter on
    22 December. Timestamps before the start of spring or on/after the
    start of winter are labelled ``'winter'``.

    Parameters
    ----------
    row : object with a ``date_time`` attribute
        Typically a DataFrame row; ``date_time`` must be comparable with
        ``datetime`` objects.

    Returns
    -------
    str
        One of ``'winter'``, ``'spring'``, ``'summer'`` or ``'autumn'``.
    """
    spring_start = datetime(2015, 3, 20)
    summer_start = datetime(2015, 6, 21)
    autumn_start = datetime(2015, 9, 23)
    winter_start = datetime(2015, 12, 22)

    timestamp = row.date_time
    if spring_start <= timestamp < summer_start:
        return 'spring'
    if summer_start <= timestamp < autumn_start:
        return 'summer'
    if autumn_start <= timestamp < winter_start:
        return 'autumn'
    # Everything before 20 March or from 22 December onwards.
    return 'winter'

# Label every hourly row with its season, then one-hot encode the label into
# `season_*` indicator columns (the `season` column itself is replaced).
df_demand['season'] = df_demand.apply(determine_season, axis=1)
df_demand = pd.get_dummies(data=df_demand, columns=['season'])
df_demand.head()

### Time features

In [None]:
# Extract date and time features directly from the datetime column instead of
# formatting it with the locale-dependent `%X` directive and splitting the text.
df_demand['date'] = df_demand['date_time'].dt.strftime('%Y-%m-%d')  # 'YYYY-MM-DD' string
df_demand['hour'] = df_demand['date_time'].dt.hour                  # integer hour of day, 0-23


In [None]:
# Preview the first rows of df_demand.
df_demand.head()

#### Weekend feature
In the descriptive analysis we have seen that the demand is particularly low on Saturday and Sunday. Hence, we engineer a feature "weekend" which is 1 for all hours on Saturday and Sunday and 0 for all other days.

In [None]:
def determine_weekend(row):
    """Return 1 if ``row.date_time`` is a Saturday or Sunday, otherwise 0.

    ``weekday()`` numbers Monday as 0, so Saturday and Sunday are 5 and 6.
    """
    return 1 if row.date_time.weekday() >= 5 else 0

# Flag every hourly row that falls on a Saturday or Sunday.
df_demand['weekend'] = df_demand.apply(determine_weekend, axis=1)
df_demand.head()

#### Daytime features
Furthermore, the descriptive analysis has shown that the time of day, i.e. whether it is night, morning, afternoon or evening, plays an important role for the demand. Hence, we engineer four features that respectively indicate whether a ride takes place during
* Morning: 6am - 12pm
* Afternoon: 12pm - 6pm
* Evening: 6pm - 11pm
* Night: 11pm - 6am

In [None]:
# Labels of the four parts of the day and the hours (0-23) that belong to each.
point_of_day = ["morning", "afternoon", "evening", "night"]
morning_hours = list(range(6, 12))
afternoon_hours = list(range(12, 18))
evening = list(range(18, 23))
night = [23, *range(0, 6)]


def get_point_of_day(hour):
    """Map an hour of the day to 'morning', 'afternoon', 'evening' or 'night'.

    Returns None when the hour is not contained in any of the four hour lists.
    """
    hour_buckets = (
        ("morning", morning_hours),
        ("afternoon", afternoon_hours),
        ("evening", evening),
        ("night", night),
    )
    for label, hours in hour_buckets:
        if hour in hours:
            return label
    return None

In [None]:
# Label each row with its part of the day, then one-hot encode the label into
# `daytime_*` indicator columns (the `daytime` column itself is replaced).
df_demand['daytime'] = df_demand['date_time'].dt.hour.apply(get_point_of_day)
df_demand = pd.get_dummies(data=df_demand, columns=['daytime'])

### Event Feature

There were several public holidays in Boston:
* New Year's Day: Thursday, January 1
* Martin Luther King Jr. Day: Monday, January 19
* Presidents' Day: Monday, February 16
* Patriots' Day: Monday, April 20
* Memorial Day: Monday, May 25
* Independence Day: Saturday, July 4
* Labor Day: Monday, September 7
* Columbus Day: Monday, October 12
* Veterans Day: Wednesday, November 11
* Thanksgiving Day: Thursday, November 26
* Christmas Day: Friday, December 25

These events might have influenced the demand

In [None]:
# Public holidays of 2015, each given as midnight of the holiday.
holiday_dates = [
    datetime(2015, 1, 1),
    datetime(2015, 1, 19),
    datetime(2015, 2, 16),
    datetime(2015, 4, 20),
    datetime(2015, 5, 25),
    datetime(2015, 7, 4),
    datetime(2015, 9, 7),
    datetime(2015, 10, 12),
    datetime(2015, 11, 11),
    datetime(2015, 11, 26),
    datetime(2015, 12, 25),
]

# `date_time` holds hourly timestamps, so each one is truncated to midnight
# before the lookup. Comparing the raw timestamps would flag only the 00:00
# row of each holiday instead of all of its hours.
df_demand['public_holiday'] = (
    df_demand['date_time']
    .dt.normalize()
    .isin(pd.to_datetime(holiday_dates))
    .astype(int)
)

In [None]:
# Preview the first rows of df_demand.
df_demand.head()

In [None]:
# Pairwise correlations between the numeric and boolean columns only.
# Non-numeric columns (e.g. `date_time`, `date`) cannot be correlated and make
# `DataFrame.corr()` raise on recent pandas versions, so they are excluded.
corr_matrix = df_demand.select_dtypes(include=['number', 'bool']).corr()
top_corr_features = corr_matrix.index

plt.figure(figsize=(12, 12))
# Plot the matrix computed above instead of computing the correlations again.
ax = sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn")
ax.set_title('Correlation between demand and engineered features');

In [None]:
# List the available columns before selecting the model features.
df_demand.columns

In [None]:
# Model inputs. The raw `date_time` (datetime) and `date` (string) columns are
# left out because the random forest needs numeric inputs; the time information
# is carried by `hour`, `weekend`, the `daytime_*` and the `season_*` columns.
features = ['max_temp', 'min_temp', 'precip', 'demand_h-1',
       'demand_h-2', 'demand_h-24', 'avg_temp', 'avg_temp_dev', 'temp_change',
       'season_autumn', 'season_spring', 'season_winter', 'hour',
       'weekend', 'daytime_afternoon', 'daytime_evening', 'daytime_morning',
       'daytime_night']
target = 'demand'

# The lag columns are NaN for the first rows and the left join with the weather
# data may leave gaps, so rows with a missing feature or target are dropped.
df_model = df_demand.dropna(subset=features + [target])
X = df_model[features]
y = df_model[target]

# Split the data into train and test sets
# NOTE(review): this is a shuffled split; because lagged demand is used as a
# feature, a chronological split may give a more realistic error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model on the training set only, so the test set stays unseen.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the demand for the held-out test set.
predicted = model.predict(X_test)

