# Data Preparation

In [409]:
# Import pandas, numpy, matplotlib, seaborn libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from tabulate import tabulate
import datetime
import keras

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor


# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Import data

### Import bike dynamic data from csv into dataframe

In [410]:
# Read in data as pandas dataframe and display first 5 rows
bikeDynamic = pd.read_csv('dBikeD.csv')
bikeDynamic.tail(5)

Unnamed: 0,id_Entry,number,status,bike_stands,available_bike_stands,available_bikes,last_update,data_entry_timestamp
369155,370900,39,OPEN,20,6,14,2020-03-24 11:37:49,2020-03-24 11:40:02
369156,370901,83,OPEN,40,27,13,2020-03-24 11:35:57,2020-03-24 11:40:02
369157,370902,92,OPEN,40,25,15,2020-03-24 11:38:08,2020-03-24 11:40:02
369158,370903,21,OPEN,30,22,8,2020-03-24 11:31:46,2020-03-24 11:40:02
369159,370904,88,OPEN,30,9,21,2020-03-24 11:35:03,2020-03-24 11:40:02


#### check for shape

In [411]:
print('The shape of our features is:', bikeDynamic.shape,'\n', bikeDynamic.dtypes)

The shape of our features is: (369160, 8) 
 id_Entry                  int64
number                    int64
status                   object
bike_stands               int64
available_bike_stands     int64
available_bikes           int64
last_update              object
data_entry_timestamp     object
dtype: object


#### check for Null entries

In [412]:
bikeDynamic.isnull().sum()

id_Entry                 0
number                   0
status                   0
bike_stands              0
available_bike_stands    0
available_bikes          0
last_update              0
data_entry_timestamp     0
dtype: int64

### Import weather dynamic data from csv into dataframe

In [413]:
# Read in data as pandas dataframe and display first 5 rows
weatherDynamic = pd.read_csv('dWeatherD.csv')
weatherDynamic = weatherDynamic[['weather_main','weather_description','clouds_all','main_temp','main_feels_like',\
                               'main_pressure','main_humidity','main_temp_min', 'main_temp_max', \
                                'wind_speed', 'wind_deg', 'visibility','data_entry_timestamp']]
weatherDynamic.tail(5)

Unnamed: 0,weather_main,weather_description,clouds_all,main_temp,main_feels_like,main_pressure,main_humidity,main_temp_min,main_temp_max,wind_speed,wind_deg,visibility,data_entry_timestamp
3427,Clouds,broken clouds,75,11,9,1017,58,9,12,1.0,,10000.0,3/24/2020 11:00
3428,Clouds,broken clouds,75,11,9,1017,58,9,12,1.0,,10000.0,3/24/2020 11:10
3429,Clouds,broken clouds,75,11,7,1017,62,10,12,4.1,200.0,10000.0,3/24/2020 11:20
3430,Clouds,broken clouds,75,11,7,1017,62,10,12,4.1,200.0,10000.0,3/24/2020 11:30
3431,Clouds,broken clouds,75,12,8,1017,62,10,12,4.1,200.0,10000.0,3/24/2020 11:40


#### check for shape

In [414]:
print('The shape of our features is:', weatherDynamic.shape,'\n', weatherDynamic.dtypes)

The shape of our features is: (3432, 13) 
 weather_main             object
weather_description      object
clouds_all                int64
main_temp                 int64
main_feels_like           int64
main_pressure             int64
main_humidity             int64
main_temp_min             int64
main_temp_max             int64
wind_speed              float64
wind_deg                float64
visibility              float64
data_entry_timestamp     object
dtype: object


#### check for Null entries

In [415]:
weatherDynamic.isnull().sum()

weather_main              0
weather_description       0
clouds_all                0
main_temp                 0
main_feels_like           0
main_pressure             0
main_humidity             0
main_temp_min             0
main_temp_max             0
wind_speed                0
wind_deg                109
visibility                7
data_entry_timestamp      0
dtype: int64

#### check for Null entries

In [416]:
weatherDynamic.loc[weatherDynamic['wind_deg'].isnull(),('wind_deg')] = weatherDynamic['wind_deg'].ffill()
weatherDynamic.loc[weatherDynamic['visibility'].isnull(),('visibility')] = weatherDynamic['visibility'].ffill()

In [417]:
weatherDynamic.isnull().sum()

weather_main            0
weather_description     0
clouds_all              0
main_temp               0
main_feels_like         0
main_pressure           0
main_humidity           0
main_temp_min           0
main_temp_max           0
wind_speed              0
wind_deg                0
visibility              0
data_entry_timestamp    0
dtype: int64

## Process and normalise dates

**Observation on spreadsheet state that results for date 28 February and 24 March are patial. Hence, they are to be dropped for data consistency.** 

### dublin bikes

In [418]:
#DATETIME DATA

# Select columns containing datetime data
continous_date_columns = bikeDynamic[['last_update', 'data_entry_timestamp']].columns

# Assign object type datetime to columns enlisted in continous_date_columns
for column in continous_date_columns:
    bikeDynamic[column] = pd.to_datetime(bikeDynamic[column])   
    
end_date = pd.to_datetime('2020-03-24')
start_date = pd.to_datetime('2020-02-28')

# Ommit partial data for dates 28-2-2020 and 24-3-2020
mask = (bikeDynamic['data_entry_timestamp'].dt.date > start_date) & (bikeDynamic['data_entry_timestamp'].dt.date < end_date)
bikeDynamic = bikeDynamic.loc[mask]

# # resolution to minutes
bikeDynamic['data_entry_timestamp'] = bikeDynamic['data_entry_timestamp'].dt.strftime("%Y-%m-%d %H:%M:00")
bikeDynamic['last_update'] = bikeDynamic['last_update'].dt.strftime("%Y-%m-%d %H:%M:00")

for column in continous_date_columns:
    bikeDynamic[column] = pd.to_datetime(bikeDynamic[column]) 
    
bikeDynamic.tail()

Unnamed: 0,id_Entry,number,status,bike_stands,available_bike_stands,available_bikes,last_update,data_entry_timestamp
361345,363090,39,OPEN,20,16,4,2020-03-23 23:48:00,2020-03-23 23:50:00
361346,363091,83,OPEN,40,21,19,2020-03-23 23:47:00,2020-03-23 23:50:00
361347,363092,92,OPEN,40,32,8,2020-03-23 23:47:00,2020-03-23 23:50:00
361348,363093,21,OPEN,30,22,8,2020-03-23 23:44:00,2020-03-23 23:50:00
361349,363094,88,OPEN,30,8,22,2020-03-23 23:47:00,2020-03-23 23:50:00


In [419]:
print('The shape of our features is:', bikeDynamic.shape,'\n', bikeDynamic.dtypes)

The shape of our features is: (355410, 8) 
 id_Entry                          int64
number                            int64
status                           object
bike_stands                       int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
data_entry_timestamp     datetime64[ns]
dtype: object


### dublin weather

In [420]:
#DATETIME DATA

# Select columns containing datetime data
continous_date_columns = weatherDynamic[['data_entry_timestamp']].columns

# Assign object type datetime to columns enlisted in continous_date_columns
for column in continous_date_columns:
    weatherDynamic[column] = pd.to_datetime(weatherDynamic[column])   
    
end_date = pd.to_datetime('2020-03-24')
start_date = pd.to_datetime('2020-02-28')

# Ommit partial data for dates 28-2-2020 and 24-3-2020
mask = (weatherDynamic['data_entry_timestamp'].dt.date > start_date) & (weatherDynamic['data_entry_timestamp'].dt.date < end_date)
weatherDynamic = weatherDynamic.loc[mask]

for column in continous_date_columns:
    weatherDynamic[column] = pd.to_datetime(weatherDynamic[column])   
    
weatherDynamic.tail()

Unnamed: 0,weather_main,weather_description,clouds_all,main_temp,main_feels_like,main_pressure,main_humidity,main_temp_min,main_temp_max,wind_speed,wind_deg,visibility,data_entry_timestamp
3356,Clouds,broken clouds,75,7,5,1021,70,6,8,1.5,100.0,10000.0,2020-03-23 23:10:00
3357,Clouds,broken clouds,75,7,4,1020,70,6,8,2.1,110.0,10000.0,2020-03-23 23:20:00
3358,Clouds,broken clouds,75,7,4,1020,70,6,8,2.1,110.0,10000.0,2020-03-23 23:30:00
3359,Clouds,broken clouds,75,7,4,1020,70,6,8,2.1,110.0,10000.0,2020-03-23 23:40:00
3360,Clouds,broken clouds,75,7,4,1020,75,6,8,2.6,150.0,10000.0,2020-03-23 23:50:00


In [421]:
print('The shape of our features is:', weatherDynamic.shape,'\n', weatherDynamic.dtypes)

The shape of our features is: (3307, 13) 
 weather_main                    object
weather_description             object
clouds_all                       int64
main_temp                        int64
main_feels_like                  int64
main_pressure                    int64
main_humidity                    int64
main_temp_min                    int64
main_temp_max                    int64
wind_speed                     float64
wind_deg                       float64
visibility                     float64
data_entry_timestamp    datetime64[ns]
dtype: object


### Merge bike and weather dataframes

In [422]:
bikeDynamic.head()

Unnamed: 0,id_Entry,number,status,bike_stands,available_bike_stands,available_bikes,last_update,data_entry_timestamp
5940,7685,42,OPEN,30,10,20,2020-02-28 23:52:00,2020-02-29
5941,7686,30,OPEN,20,20,0,2020-02-28 23:53:00,2020-02-29
5942,7687,54,OPEN,33,29,4,2020-02-28 23:54:00,2020-02-29
5943,7688,108,OPEN,40,31,9,2020-02-28 22:57:00,2020-02-29
5944,7689,56,OPEN,40,37,3,2020-02-28 23:50:00,2020-02-29


In [423]:
weatherDynamic.head()

Unnamed: 0,weather_main,weather_description,clouds_all,main_temp,main_feels_like,main_pressure,main_humidity,main_temp_min,main_temp_max,wind_speed,wind_deg,visibility,data_entry_timestamp
54,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0,2020-02-29 00:00:00
55,Rain,light rain,75,8,5,977,93,6,9,4.6,170.0,10000.0,2020-02-29 00:10:00
56,Rain,light rain,75,8,4,977,93,6,9,4.6,170.0,10000.0,2020-02-29 00:20:00
57,Rain,light rain,75,8,4,977,93,6,9,4.6,170.0,10000.0,2020-02-29 00:30:00
58,Rain,light rain,75,6,-4,976,87,5,7,13.9,260.0,6000.0,2020-02-29 00:40:00


In [424]:
dfML = pd.merge(bikeDynamic, weatherDynamic, on='data_entry_timestamp')
print(dfML.shape)
dfML.head()

(355410, 20)


Unnamed: 0,id_Entry,number,status,bike_stands,available_bike_stands,available_bikes,last_update,data_entry_timestamp,weather_main,weather_description,clouds_all,main_temp,main_feels_like,main_pressure,main_humidity,main_temp_min,main_temp_max,wind_speed,wind_deg,visibility
0,7685,42,OPEN,30,10,20,2020-02-28 23:52:00,2020-02-29,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0
1,7686,30,OPEN,20,20,0,2020-02-28 23:53:00,2020-02-29,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0
2,7687,54,OPEN,33,29,4,2020-02-28 23:54:00,2020-02-29,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0
3,7688,108,OPEN,40,31,9,2020-02-28 22:57:00,2020-02-29,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0
4,7689,56,OPEN,40,37,3,2020-02-28 23:50:00,2020-02-29,Rain,light rain,75,9,6,977,93,9,10,4.6,170.0,10000.0


In [425]:
print('The shape of our features is:', dfML.shape,'\n', dfML.dtypes)

The shape of our features is: (355410, 20) 
 id_Entry                          int64
number                            int64
status                           object
bike_stands                       int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
data_entry_timestamp     datetime64[ns]
weather_main                     object
weather_description              object
clouds_all                        int64
main_temp                         int64
main_feels_like                   int64
main_pressure                     int64
main_humidity                     int64
main_temp_min                     int64
main_temp_max                     int64
wind_speed                      float64
wind_deg                        float64
visibility                      float64
dtype: object


### Check logical integrity of data 

**Data integrity is checked for following cases:**
 - is "bike_stands" $>=$ "available_bike_stands" $+$ "available_bikes"  [Any other sequence is incorrect]
 - is "last_update" $<=$ "data_entry_timestamp" [Any other sequence is incorrect]

In [426]:
test_1 = dfML[["id_Entry","available_bike_stands","available_bikes","bike_stands"]][dfML["available_bike_stands"].add(dfML["available_bikes"], axis=0)  >\
                                                                         dfML["bike_stands"]]
print("Number of rows failing the test: ", test_1.shape[0])
test_1.head(5)

Number of rows failing the test:  18


Unnamed: 0,id_Entry,available_bike_stands,available_bikes,bike_stands
57945,65630,17,0,16
242855,250540,16,1,16
242965,250650,16,1,16
243185,250870,16,1,16
243295,250980,16,1,16


#### Remove errors by adding available_bikes and bike_stands

In [427]:
dTest_1 = test_1["id_Entry"]

for data in dTest_1: 
    dfML.loc[(dfML.id_Entry == data),'bike_stands'] = dfML.loc[(dfML.id_Entry == data),'available_bikes'] + dfML.loc[(dfML.id_Entry == data),'available_bike_stands']

In [428]:
test_1 = dfML[["id_Entry","available_bike_stands","available_bikes","bike_stands"]][dfML["available_bike_stands"].add(dfML["available_bikes"], axis=0)  >\
                                                                         dfML["bike_stands"]]
print("Number of rows failing the test: ", test_1.shape[0])
test_1.head(5)

Number of rows failing the test:  0


Unnamed: 0,id_Entry,available_bike_stands,available_bikes,bike_stands


## Encoding

#### Dictionary vectorization

In [429]:
dfML_enc = pd.get_dummies(dfML, prefix_sep='_')
# X head
dfML_enc.head()

dfML_enc.to_csv("FeaturesGetDummies.csv")

In [430]:
print('Shape of features after one-hot encoding:', dfML_enc.shape)

Shape of features after one-hot encoding: (355410, 42)


## Features and Labels

In [431]:
# return dataframe without target variable for processing
def prepareDF(dfML, targetCol):
    target = np.array(dfML[targetCol])
    
    # Remove the target variable from dataframe
    dfML= dfML.drop(targetCol, axis = 1)

    # Saving feature names for later use
    feature_list = list(dfML.columns)

    # Convert to numpy array
    dfML_array = np.array(dfML)
    
    return dfML_array, target, feature_list

## Training and Testing Sets

In [432]:
def trainTestData(features, target):
    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features, target, test_size = 0.25,
                                                                               random_state = 42)
    return train_features, test_features, train_labels, test_labels

## Establish Baseline

In [433]:
# def baseLine(test_features, feature_list, target):
#     # The baseline predictions are the historical averages
#     baseline_preds = test_features[:, feature_list.index(target)]

#     # Baseline errors, and display average baseline error
#     baseline_errors = abs(baseline_preds - test_labels)
#     print('Average baseline error: ', round(np.mean(baseline_errors), 2), 'degrees.')

## Training the Forest

In [434]:
def trainRandomForest(train_features, train_labels):
    # Instantiate model 
    rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

    # Train the random forest model
    rf.fit(train_features, train_labels)
    
    return rf

Model with different hyperparameters

In [435]:
# rf_new = RandomForestRegressor(n_estimators = 100, criterion = 'mse', max_depth = None, 
#                                min_samples_split = 2, min_samples_leaf = 1)

## Make Predictions on Test Data

In [436]:
def predict(rf, test_features, test_labels):
    # Use the forest's predict method on the test data
    predictions = rf.predict(test_features) 
    
    return predictions

## View Results

In [437]:
def result(predictions,test_labels,dates):
    # Calculate the absolute errors
    errors = abs(predictions - test_labels)

    # Print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
    
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / test_labels)

    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')
    
    plt.plot(test_labels)
    plt.xticks(range(len(dates)), dates)
    plt.plot(predictions)
    
    plt.xlabel('Time')
    plt.ylabel('Available stands')


#     plt.xlabel('Time')
#     plt.ylabel('Available stands')

    plt.show()



## Pipeline

In [439]:
# Data processing pipeline 

dfML_pipeline = dfML_enc
targetCol = ['available_bikes','available_bike_stands']
i = 1

features, target, feature_list = prepareDF(dfML_pipeline,targetCol[i])
train_features, test_features, train_labels, test_labels = trainTestData(features, target)

dates = test_features[:, feature_list.index('data_entry_timestamp')]

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# baseLine()

rf = trainRandomForest(train_features, train_labels)
predictions = predictRandomForest(rf,test_features, test_labels)
result(predictions,test_labels,dates)

Training Features Shape: (266557, 41)
Training Labels Shape: (266557,)
Testing Features Shape: (88853, 41)
Testing Labels Shape: (88853,)


TypeError: float() argument must be a string or a number, not 'Timestamp'

## Visualizing a Single Decision Tree

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
graph.write_png('tree.png'); 

![Decision Tree](tree.png)

In [None]:
print('The depth of this tree is:', tree.tree_.max_depth)

Smaller tree for visualization.

In [None]:
# Limit depth of tree to 2 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)

# Extract the small tree
tree_small = rf_small.estimators_[5]

# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

graph.write_png('small_tree.png')

![Small Decision Tree](small_tree.PNG)

### Annotated Version of Tree

![Annotated Decision Tree](small_tree_annotated.PNG)

## Variable Importances

In [None]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

### Two Most Important Features

In [None]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Extract the two most important features
important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, train_labels)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)

errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape

print('Accuracy:', round(accuracy, 2), '%.')

## Visualizations

In [None]:
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt

%matplotlib inline

# Set the style
plt.style.use('fivethirtyeight')

# list of x locations for plotting
x_values = list(range(len(importances)))

# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')

# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances'); 

In [None]:
import datetime

# Dates of training values
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]

# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]

# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})

# Dates of predictions
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]

# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]

# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]

# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions}) 

In [None]:
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')

# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
plt.xticks(rotation = '60'); 
plt.legend()

# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');


In [None]:
# Make the data accessible for plotting
true_data['temp_1'] = features[:, feature_list.index('temp_1')]
true_data['average'] = features[:, feature_list.index('average')]
true_data['friend'] = features[:, feature_list.index('friend')]

# Plot all the data as lines
plt.plot(true_data['date'], true_data['actual'], 'b-', label  = 'actual', alpha = 1.0)
plt.plot(true_data['date'], true_data['temp_1'], 'y-', label  = 'temp_1', alpha = 1.0)
plt.plot(true_data['date'], true_data['average'], 'k-', label = 'average', alpha = 0.8)
plt.plot(true_data['date'], true_data['friend'], 'r-', label = 'friend', alpha = 0.3)

# Formatting plot
plt.legend(); plt.xticks(rotation = '60');

# Lables and title
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual Max Temp and Variables');