## To show this notebook as a slideshow, download it, change to the directory where you stored the file, and type the following into the terminal/console:

jupyter nbconvert Random_forest.ipynb --to slides --post serve

## Obligatory dependencies

In [1]:
# importing dependencies
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For model building
from sklearn.model_selection import train_test_split, cross_val_score

# Models
from sklearn.ensemble import RandomForestRegressor

# Model evaluation
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

# This notebook builds a random forest (an ensemble of decision trees) for predicting daily ride patterns.

## Loading in of the data

In [None]:
# Load in the data
# Root folder of the project data; change this single line when running on another
# machine (a previously used alternate location was '/home/gilmore/NiceRideMN').
DATA_ROOT = "~/Projects/NiceRide"

weather = pd.read_csv(DATA_ROOT + "/Weather_data/01012010_12312017.csv") # Weather data

# One trip-history CSV per year, 2013 through 2017
yearly_trips = [
    pd.read_csv("{0}/Nice_Ride_data/{1}/NiceRide_trip_history_{1}.csv".format(DATA_ROOT, year))
    for year in range(2013, 2018)
]

# Concatenate once, after every file has been read, instead of rebuilding the
# combined frame on each loop iteration.
nr = pd.concat(yearly_trips)

## Converting the date strings to datetime values

In [None]:
# Parse the date strings column-wise with pd.to_datetime rather than calling
# strptime once per row; the explicit formats are the same ones used before.
# Re-running this cell is harmless because already-parsed columns pass through unchanged.
weather['DATE'] = pd.to_datetime(weather['DATE'], format='%Y-%m-%d')

# Trip start and end timestamps share one format
for column in ('Start_date', 'End_date'):
    nr[column] = pd.to_datetime(nr[column], format='%Y-%m-%d %H:%M:%S')

## Preparing the data to be used for the model
* Using datetime variable rides will be grouped by:
 * Daily count.
 * Mean ride duration

In [None]:
# Count the rides per calendar day: bucket the trips by the day of 'Start_date'
# and count the non-null 'duration' entries in each bucket.
# Selecting 'duration' on the resampler avoids depending on whether count()
# returns the 'Start_date' column (the old code had to drop it afterwards).
# The result has two columns, 'DATE' and 'daily_count', with a default integer index.
daily_totals = (
    nr.resample('D', on='Start_date')['duration']
    .count()
    .rename('daily_count')
    .rename_axis('DATE')
    .reset_index()
)

In [None]:
# Mean ride duration per calendar day, keyed on the day of 'Start_date'.
# After the chain the frame has a 'DATE' column and a 'daily_mean' column;
# rename(index=str) turns the row labels into strings.
daily_means = (
    nr[['Start_date', 'duration']]
    .resample('D', on='Start_date')
    .mean()
    .reset_index()
    .rename(index=str, columns={'Start_date': 'DATE', "duration": "daily_mean"})
)

## Correcting missing data in the weather dataframe
* From 2010 to 2013 there is no TAVG; this is remedied by using the mean of TMIN and TMAX instead.

In [None]:
# Wherever TAVG is NaN, substitute the row-wise mean of TMAX and TMIN.
# Rows that already have a TAVG value keep it.
weather = weather.assign(
    TAVG=weather['TAVG'].fillna(weather[['TMAX', 'TMIN']].mean(axis=1))
)

In [None]:
## Dropping columns that won't be used in the analysis.

In [None]:
# STATION and NAME are not used in the analysis
weather = weather.drop(columns=['STATION', 'NAME'])

## Now that our data is corrected for the analysis, it will be merged together

In [None]:
# Outer-join the weather rows with the daily ride counts and daily mean durations
# on DATE, so dates present in only one of the frames are kept.
temp = (
    weather
    .merge(daily_totals, on='DATE', how='outer')
    .merge(daily_means, on='DATE', how='outer')
)
temp.info()

In [None]:
# Notice how the non-null count of daily_count differs from that of the other columns.
# Every NaN in the merged frame is replaced with 0, so days without a
# daily_count end up with a count of zero.
# NOTE(review): this also zero-fills any missing values in the other columns.
features = temp.fillna(0)
features.info()

## This next section will change our date column from dtype datetime into separate, numerical date columns

In [None]:
# Split DATE into three numeric columns appended in the order YEAR, MONTH, DAY
features = features.assign(
    YEAR=features['DATE'].dt.year,
    MONTH=features['DATE'].dt.month,
    DAY=features['DATE'].dt.day,
)

In [None]:
# Preview the first five rows: three new, separate columns identify year, month and day
features.head()

# Feature engineering to create variables:
* Weekday or weekend
* Summer or not summer

In [None]:
# dt.weekday numbers the days Mon = 0 ... Sun = 6, so values above 4 are Saturday/Sunday.
# WEND is a binary flag: 1 on a weekend day, 0 on a weekday.
features['WEND'] = (features['DATE'].dt.weekday > 4).astype('int64')

In [None]:
# Random forests can't use datetime objects, so DATE goes (its parts live on in
# YEAR/MONTH/DAY); SNWD and SNOW are removed from the feature set as well.
features = features.drop(columns=['DATE', 'SNWD', 'SNOW'])

# Checking our dataset prior to data splits

In [None]:
# Summarise column names, non-null counts and dtypes before the data is split
features.info()

# Splitting the data on days with Daily Counts and without

In [None]:
# Separate the rows by whether any rides were recorded that day
no_rides = features['daily_count'] == 0

# Days with a daily_count of zero are set aside
winter_features = features[no_rides].reset_index(drop=True)

# Only days with at least one recorded ride stay in the modelling set
features = features[~no_rides].reset_index(drop=True)

In [None]:
# Summary statistics of the remaining rows (used below to sanity-check the ranges)
features.describe()

## DCOUNT (`daily_count`) now shows a minimum of at least 1

## I'm concerned about why MONTH shows a minimum of 1; there shouldn't be any data for January. Time to investigate further.

In [None]:
# Show every remaining row that falls in January
features.loc[features['MONTH'] == 1]

## Although rare, above-freezing days in January can happen in Minnesota

# Features, Targets, and converting Data into arrays for performance optimization

# Feature (variable) and Target (Daily rides) creation

In [None]:
# Names of the predictor columns (everything except the label), kept for later use
feature_names = [column for column in features.columns if column != 'daily_count']

# Label array: the daily ride count
target = features['daily_count'].values

# Predictor matrix, columns in the same order as feature_names
feat = features[feature_names].values

# Splitting our data for training and testing

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X, X_test, y, y_test = train_test_split(
    feat,
    target,
    test_size=0.25,
    random_state=42,
)

### Inspecting the shape of the features and labels

In [None]:
# Report the dimensions of each array produced by the split
for label, array in (
    ('Training Features shape', X),
    ('Training labels shape', y),
    ('Testing Features shape', X_test),
    ('Testing labels shape', y_test),
):
    print(label, array.shape)

# Training the model

In [None]:
# A forest of 1000 decision trees; the fixed random_state seeds the estimator
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=1000,
)

# Fit on the training split
rf.fit(X, y)

# Making predictions on the test set

In [None]:
# Predicted daily ride counts for the held-out test rows
predictions = rf.predict(X_test)

# Assessing the performance of our RandomForestRegressor

In [None]:
# Score the test-set predictions with four regression metrics.
# The metric functions are imported with the other dependencies in the
# import cell at the top of the notebook.
ex_var_score = explained_variance_score(y_test, predictions)
m_absolute_error = mean_absolute_error(y_test, predictions)
m_squared_error = mean_squared_error(y_test, predictions)  # the square root is taken when it is printed
r_2_score = r2_score(y_test, predictions)

In [None]:
# Print each metric next to its label; the RMSE is derived from the stored MSE here
for label, value in (
    ("Explained Variance Score: ", ex_var_score),
    ("Mean Absolute Error ", m_absolute_error),
    ("Root Mean Squared Error ", np.sqrt(m_squared_error)),
    ("R Squared Score ", r_2_score),
):
    print(label + str(value))

# These are the 4th degree polynomial benchmarks
* Explained Variance Score: 0.5826
* Mean Absolute Error: 460.3430
* Root Mean Squared Error: 616.5765
* R Squared Score: 0.5745

# These are the Random Forest Regression benchmarks
* Explained Variance Score: 0.8073
* Mean Absolute Error: 301.7230
* Root Mean Squared Error: 393.9897
* R Squared Score: 0.8038

# Percent improvement from the 4th degree polynomial to Random Forest Regression:
 * Explained Variance Score: 38.57%
 * Mean Absolute Error 34.46%
 * Root Mean Squared Error 36.10%
 * R Squared Score 39.91%

# Looking at the feature importance to the decision tree model

In [None]:
# Get numerical feature importances (same order as feature_names)
importances = list(rf.feature_importances_)

# Rank on the exact importances first; rounding before sorting would make
# features that differ by less than 0.01 compare as equal.
ranked = sorted(zip(feature_names, importances), key=lambda pair: pair[1], reverse=True)

# List of (variable, importance) tuples, most important first, rounded for display
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]

# Print out the features and importances with a plain loop
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

## Visualizing the feature importances

In [None]:
# Use the 'fivethirtyeight' style sheet for this and later figures
plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(14, 8))

# One bar per feature, positioned at 0..n-1 in feature_names order
x_values = list(range(len(importances)))
ax.bar(x_values, importances, orientation='vertical')

# Label each bar with its feature name, written vertically
ax.set_xticks(x_values)
ax.set_xticklabels(feature_names, rotation='vertical')

# Axis labels and title
ax.set_ylabel('Importance')
ax.set_xlabel('Variable')
ax.set_title('Variable Importances')
plt.show()

# Graphing the empirical data and the predictions
* This will be presented as:
 * Empirical data as a line graph
 * Predictions as 'o' markers

In [None]:
def _dates_from_features(matrix):
    """Rebuild calendar dates from the YEAR, MONTH and DAY columns of a feature matrix.

    matrix: 2-D array whose columns follow the order of ``feature_names``.
    Returns a pandas Series of datetimes, one per row of ``matrix``.
    """
    parts = pd.DataFrame({
        unit: matrix[:, feature_names.index(unit.upper())]
        for unit in ('year', 'month', 'day')
    }).astype('int64')  # whole-number cast, mirroring the int(...) conversion used previously
    # pd.to_datetime assembles dates from columns named year / month / day
    return pd.to_datetime(parts)


# Dates of all observed values (the full feature matrix)
dates = _dates_from_features(feat)

# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'DCOUNT': target})

# Dates of the test rows the predictions were made for
test_dates = _dates_from_features(X_test)

# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions})

# Plot the actual values as a line
plt.figure(figsize=(14.5,10))
plt.plot(true_data['date'], true_data['DCOUNT'], 'b', label = 'Daily Count', linewidth=1, alpha=.6)

# Plot the predicted values as markers
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction', markersize=3, alpha=.6)
# rotation must be a number (or 'vertical'/'horizontal'); the string '60' is not accepted
plt.xticks(rotation = 60);
plt.legend()

# Graph labels
plt.xlabel('Date'); plt.ylabel('Daily Rides'); plt.title('Actual and Predicted Values');

# Scatter plot of:
* Empirical data vs max temperature (the feature with the highest importance)
* Prediction data vs max temperature

In [None]:
# Look the TMAX column up by name instead of relying on a fixed position in the matrix
tmax_column = feature_names.index('TMAX')

# Dataframe with true values and max temperatures
true_data = pd.DataFrame(data = {'TMAX': features.TMAX.values, 'DCOUNT': target})
# Dataframe with predictions and the max temperatures of the test rows
predictions_data = pd.DataFrame(data = {'TMAX': X_test[:, tmax_column], 'prediction': predictions})
# Plot the actual values and predicted values
plt.figure(figsize=(14.5,10))
plt.plot(true_data['TMAX'], true_data['DCOUNT'], 'bo', label = 'Daily Count', markersize=3, alpha=.6)
plt.plot(predictions_data['TMAX'], predictions_data['prediction'], 'ro', label = 'Prediction', markersize=3, alpha=.6)
plt.legend()
plt.xlabel('Temperature max in Fahrenheit'); plt.ylabel('Daily Rides'); plt.title('Actual vs Predicted Values');