# Predicting Short Term Daily Ridership (Check for feature importances)

- Same setup as the previous notebook, except that predictions are made only 10 and 30 days into the future.
- The best split date for each forecast horizon was taken from the previous short-term prediction notebook.
- The feature importances of the model trained for each horizon are then inspected.

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

# Importing the subscriber data

In [2]:
# Load the subscriber data, discarding every row that contains a NaN
data = pd.read_csv('../data/for_predictions/daily_p_s_pred_basic.csv').dropna()

In [3]:
# Keep only the rows whose wind reading (AWND) is non-negative
has_valid_wind = data['AWND'].ge(0)
data = data.loc[has_valid_wind]

In [4]:
# Create the datetime64 `date` column directly from the numeric
# year/month/day columns. pd.to_datetime assembles the components itself,
# so there is no need to build a 'Y-M-D' string per row and parse it back.
data['date'] = pd.to_datetime(data[['year', 'month', 'day']])

In [5]:
# Check the columns (the bare last expression is rendered as the cell output)
data.columns

Index(['year', 'month', 'day', 'dayofweek', 'st_latitude', 'st_longitude',
       'closest_college_distance', 'closest_subway_distance',
       'closest_theater_distance', 'closest_museum_distance',
       'closest_park_distance', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'AWND',
       'pickups', 'date'],
      dtype='object')

In [6]:
# Put `date` first and keep the target `pickups` last, then preview the first rows
feature_cols = ['year', 'month', 'day', 'dayofweek', 'st_latitude', 'st_longitude',
                'closest_college_distance', 'closest_subway_distance',
                'closest_theater_distance', 'closest_museum_distance',
                'closest_park_distance', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'AWND']
data = data[['date'] + feature_cols + ['pickups']]
data.head()

Unnamed: 0,date,year,month,day,dayofweek,st_latitude,st_longitude,closest_college_distance,closest_subway_distance,closest_theater_distance,closest_museum_distance,closest_park_distance,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,pickups
0,2013-07-01,2013,7,1,0,40.767272,-73.993929,0.449863,0.836766,0.509623,0.648424,0.190527,21.3,0.0,0.0,25.0,22.2,1.4,47
1,2013-07-01,2013,7,1,0,40.719116,-74.006667,0.438224,0.012754,0.179887,0.470931,0.154135,21.3,0.0,0.0,25.0,22.2,1.4,73
2,2013-07-01,2013,7,1,0,40.711174,-74.000165,0.390559,0.372382,0.41618,0.590027,0.143915,21.3,0.0,0.0,25.0,22.2,1.4,13
3,2013-07-01,2013,7,1,0,40.683826,-73.976323,0.970719,0.176488,3.503029,0.224781,0.108015,21.3,0.0,0.0,25.0,22.2,1.4,19
4,2013-07-01,2013,7,1,0,40.741776,-74.001497,0.717327,0.062889,0.234855,0.368229,0.170808,21.3,0.0,0.0,25.0,22.2,1.4,47


In [7]:
# Summary statistics, including the 95th and 99th percentiles, rounded to 3 decimals
summary_percentiles = [0.25, 0.5, 0.75, 0.95, 0.99]
data.describe(percentiles=summary_percentiles).round(3)

Unnamed: 0,year,month,day,dayofweek,st_latitude,st_longitude,closest_college_distance,closest_subway_distance,closest_theater_distance,closest_museum_distance,closest_park_distance,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,pickups
count,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0,540042.0
mean,2015.075,6.681,15.7,2.982,40.727,-73.983,0.672,0.284,1.211,0.694,0.151,2.983,2.013,14.315,17.109,9.128,2.402,63.516
std,1.166,3.599,8.832,2.0,0.028,0.018,0.431,0.238,1.343,0.658,0.027,8.225,15.293,57.784,10.284,9.538,1.048,62.068
min,2013.0,1.0,1.0,0.0,40.518,-74.031,0.022,0.003,0.011,0.019,0.088,0.0,0.0,0.0,-9.3,-18.2,0.3,1.0
25%,2014.0,3.0,8.0,1.0,40.705,-73.996,0.338,0.1,0.278,0.28,0.129,0.0,0.0,0.0,8.3,2.2,1.7,18.0
50%,2015.0,7.0,16.0,3.0,40.725,-73.984,0.604,0.224,0.556,0.505,0.153,0.0,0.0,0.0,17.8,8.9,2.2,44.0
75%,2016.0,10.0,23.0,5.0,40.749,-73.971,0.911,0.414,2.008,0.797,0.171,1.0,0.0,0.0,26.7,17.8,3.0,90.0
95%,2017.0,12.0,30.0,6.0,40.773,-73.948,1.447,0.774,4.081,2.275,0.189,18.3,0.0,130.0,31.7,22.8,4.4,184.0
99%,2017.0,12.0,31.0,6.0,40.79,-73.94,1.895,1.006,5.125,3.199,0.204,40.1,76.0,300.0,34.4,25.6,5.5,277.0
max,2017.0,12.0,31.0,6.0,40.804,-73.93,11.939,7.748,21.617,7.715,0.218,126.2,279.0,480.0,36.7,28.3,8.2,827.0


<b>Get location information for the subscriber stations</b>

In [8]:
# Load the station lookup table
stations_info_path = '../data/processed/stations_info_complete.csv'
stations_info = pd.read_csv(stations_info_path)

In [9]:
# Preview the first rows of the station table
stations_info.head()

Unnamed: 0,st_id,st_name,st_latitude,st_longitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929
1,79,Franklin St & W Broadway,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
4,116,W 17 St & 8 Ave,40.741776,-74.001497


<b>10 day forecast</b>

In [10]:
# Split the data chronologically: `past` holds everything up to and
# including `split_date`; `future` holds the `window` of days right after it.
# `split_date` is a pd.Timestamp rather than a datetime.date because the
# `date` column is datetime64, and newer pandas releases refuse to order a
# datetime64 Series against a plain datetime.date.
split_date = pd.Timestamp(2016, 7, 19)
window = dt.timedelta(days=10)
forecast_end = split_date + window

is_past = data['date'] <= split_date
is_future = (data['date'] > split_date) & (data['date'] <= forecast_end)
past = data[is_past]
future = data[is_future]

In [11]:
# Set up Xs and ys as NumPy arrays.
# Features: every column except the first and the last one.
# Target: the last column, as a 1-D vector.
X_past = np.array(past.iloc[:, 1:-1])
y_past = np.array(past.iloc[:, -1])

X_future = np.array(future.iloc[:, 1:-1])
y_future = np.array(future.iloc[:, -1])

In [12]:
%%time
# Train RandomForestRegressor
regressor = RandomForestRegressor(max_depth=40, n_estimators=50, n_jobs=4)
regressor.fit(X_past, y_past)

CPU times: user 2min 15s, sys: 1.38 s, total: 2min 16s
Wall time: 35.7 s


In [14]:
# Rank the features by the fitted forest's importance score, highest first.
# `feature_importances_` belongs to the whole ensemble, not to a single
# (e.g. the last) tree, so the header says so.
feature_labels = data.columns[1:-1]
importances = regressor.feature_importances_
indices = np.argsort(importances)[::-1]

# Pad every label to the longest one so the score column stays aligned
label_width = max(len(label) for label in feature_labels)

print('==== Feature Importances (averaged over all trees) ====')
for rank, idx in enumerate(indices, start=1):
    print('%2d) %-*s %f' % (rank, label_width, feature_labels[idx], importances[idx]))

==== Feature Importances for the last tree ====
 1) closest_park_distance 0.268702
 2) st_longitude         0.144414
 3) TMIN                 0.135582
 4) st_latitude          0.064375
 5) closest_subway_distance 0.057987
 6) dayofweek            0.054982
 7) closest_theater_distance 0.041110
 8) TMAX                 0.039968
 9) closest_museum_distance 0.039708
10) PRCP                 0.036406
11) year                 0.031470
12) closest_college_distance 0.026417
13) month                0.019903
14) day                  0.019640
15) AWND                 0.014203
16) SNWD                 0.004210
17) SNOW                 0.000924


<b>30 day forecast</b>

In [15]:
# Split the data chronologically: `past` holds everything up to and
# including `split_date`; `future` holds the `window` of days right after it.
# `split_date` is a pd.Timestamp rather than a datetime.date because the
# `date` column is datetime64, and newer pandas releases refuse to order a
# datetime64 Series against a plain datetime.date.
split_date = pd.Timestamp(2016, 7, 9)
window = dt.timedelta(days=30)
forecast_end = split_date + window

is_past = data['date'] <= split_date
is_future = (data['date'] > split_date) & (data['date'] <= forecast_end)
past = data[is_past]
future = data[is_future]

In [16]:
# Set up Xs and ys as NumPy arrays.
# Features: every column except the first and the last one.
# Target: the last column, as a 1-D vector.
X_past = np.array(past.iloc[:, 1:-1])
y_past = np.array(past.iloc[:, -1])

X_future = np.array(future.iloc[:, 1:-1])
y_future = np.array(future.iloc[:, -1])

In [17]:
%%time
# Train RandomForestRegressor
regressor = RandomForestRegressor(max_depth=40, n_estimators=50, n_jobs=4)
regressor.fit(X_past, y_past)

CPU times: user 2min 13s, sys: 1.57 s, total: 2min 15s
Wall time: 35.1 s


In [18]:
# Rank the features by the fitted forest's importance score, highest first.
# `feature_importances_` belongs to the whole ensemble, not to a single
# (e.g. the last) tree, so the header says so.
feature_labels = data.columns[1:-1]
importances = regressor.feature_importances_
indices = np.argsort(importances)[::-1]

# Pad every label to the longest one so the score column stays aligned
label_width = max(len(label) for label in feature_labels)

print('==== Feature Importances (averaged over all trees) ====')
for rank, idx in enumerate(indices, start=1):
    print('%2d) %-*s %f' % (rank, label_width, feature_labels[idx], importances[idx]))

==== Feature Importances for the last tree ====
 1) closest_park_distance 0.269105
 2) st_longitude         0.143984
 3) TMIN                 0.133232
 4) st_latitude          0.063019
 5) closest_subway_distance 0.059823
 6) dayofweek            0.054758
 7) TMAX                 0.043096
 8) closest_theater_distance 0.041617
 9) closest_museum_distance 0.037747
10) PRCP                 0.037044
11) year                 0.030315
12) closest_college_distance 0.026537
13) month                0.020220
14) day                  0.019736
15) AWND                 0.014605
16) SNWD                 0.004236
17) SNOW                 0.000926
