In [1]:
import numpy as np
import pandas as pd

# # Creating Data Files for Pickup & Dropoff Predictions

In [2]:
# Build the daily prediction datasets.
# For each input file: read it, derive calendar features from the `date`
# column, and write the selected feature columns plus the target column to
# ./data/for_predictions/daily_<prefix>_pred_basic.csv.
file_dir = './data/processed/extended2/'
# Prefixes starting with 'p' hold pickups; the others hold dropoffs (see the
# label selection inside the loop).
# NOTE(review): the '_s' / '_ns' suffixes presumably distinguish two rider or
# station groups -- confirm against the code that produced these files.
file_prefixes = ['p_s', 'p_ns', 'd_s', 'd_ns']
file_ext = '_alltime_extended.csv'

for f_prefix in file_prefixes:
    file_path = file_dir + f_prefix + file_ext
    data = pd.read_csv(file_path)

    # The target column name depends on which kind of file this is.
    if f_prefix.startswith('p'):
        label = 'pickups'
    else:
        label = 'dropoffs'

    # Parse the YYYYMMDD `date` column once and reuse the result for every
    # calendar feature, instead of re-parsing the whole column four times.
    parsed_date = pd.to_datetime(data['date'].astype(str), format='%Y%m%d')
    data['dayofweek'] = parsed_date.dt.dayofweek  # Monday=0 ... Sunday=6
    data['year'] = parsed_date.dt.year
    data['month'] = parsed_date.dt.month
    data['day'] = parsed_date.dt.day

    # Columns written to the prediction file; the target column goes last.
    # NOTE(review): the describe() summaries of the saved files report an AWND
    # minimum of -9999.0, which looks like a missing-value sentinel rather than
    # a real measurement -- confirm and mask it before modelling. It is passed
    # through unchanged here.
    features = ['year', 'month', 'day', 'dayofweek',
                'st_latitude', 'st_longitude',
                'closest_college_distance', 'closest_subway_distance',
                'closest_theater_distance', 'closest_museum_distance',
                'closest_park_distance',
                'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'AWND',
                label]

    save_file = 'daily_' + f_prefix + '_pred_basic.csv'
    save_path = './data/for_predictions/' + save_file
    data[features].to_csv(save_path, index=False)

In [3]:
# Sanity check: load two of the saved files back in (one pickups file, one
# dropoffs file) so their contents can be summarised in the next cells.
ps, dns = map(
    pd.read_csv,
    (
        './data/for_predictions/daily_p_s_pred_basic.csv',
        './data/for_predictions/daily_d_ns_pred_basic.csv',
    ),
)

In [4]:
# Summary statistics for the reloaded `ps` frame, rounded to 3 decimal places.
# NOTE(review): in the output shown below, the station-derived columns have
# fewer non-null rows than the rest and AWND bottoms out at -9999.0 -- both
# look like missing-data artefacts worth confirming.
ps.describe().round(decimals=3)

Unnamed: 0,year,month,day,dayofweek,st_latitude,st_longitude,closest_college_distance,closest_subway_distance,closest_theater_distance,closest_museum_distance,closest_park_distance,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,pickups
count,546683.0,546683.0,546683.0,546683.0,546393.0,546393.0,546393.0,546393.0,546393.0,546393.0,546393.0,546683.0,546683.0,546683.0,546683.0,546683.0,546683.0,546683.0
mean,2015.083,6.694,15.737,2.991,40.727,-73.983,0.673,0.284,1.213,0.696,0.151,2.954,1.989,14.2,17.122,9.138,-113.934,63.517
std,1.162,3.595,8.821,1.999,0.028,0.018,0.431,0.238,1.344,0.66,0.027,8.183,15.201,57.49,10.28,9.531,1072.377,62.093
min,2013.0,1.0,1.0,0.0,40.518,-74.031,0.022,0.003,0.011,0.019,0.088,0.0,0.0,0.0,-9.3,-18.2,-9999.0,1.0
25%,2014.0,3.0,8.0,1.0,40.705,-73.996,0.338,0.1,0.278,0.28,0.129,0.0,0.0,0.0,8.3,2.2,1.7,18.0
50%,2015.0,7.0,16.0,3.0,40.725,-73.984,0.604,0.224,0.556,0.507,0.153,0.0,0.0,0.0,17.8,8.9,2.2,44.0
75%,2016.0,10.0,23.0,5.0,40.749,-73.971,0.915,0.414,2.008,0.797,0.171,1.0,0.0,0.0,26.7,17.8,3.0,90.0
max,2017.0,12.0,31.0,6.0,40.804,-73.93,11.939,7.748,21.617,7.715,0.218,126.2,279.0,480.0,36.7,28.3,8.2,827.0


In [5]:
# Summary statistics for the reloaded `dns` frame, rounded to 3 decimal places.
# NOTE(review): in the output shown below, the station-derived and weather
# columns have fewer non-null rows than the date columns, and AWND bottoms out
# at -9999.0 -- both look like missing-data artefacts worth confirming.
dns.describe().round(decimals=3)

Unnamed: 0,year,month,day,dayofweek,st_latitude,st_longitude,closest_college_distance,closest_subway_distance,closest_theater_distance,closest_museum_distance,closest_park_distance,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,dropoffs
count,397374.0,397374.0,397374.0,397374.0,396945.0,396945.0,396945.0,396945.0,396945.0,396945.0,396945.0,397367.0,397367.0,397367.0,397367.0,397367.0,397367.0,397374.0
mean,2014.981,7.168,15.78,3.093,40.73,-73.985,0.637,0.269,0.983,0.622,0.154,2.291,0.399,4.576,20.185,11.906,-118.336,11.004
std,1.137,3.139,8.827,2.019,0.026,0.017,0.395,0.229,1.157,0.595,0.025,6.633,5.695,31.723,9.055,8.405,1091.421,17.587
min,2013.0,1.0,1.0,0.0,40.518,-74.031,0.022,0.003,0.011,0.019,0.088,0.0,0.0,0.0,-9.3,-18.2,-9999.0,1.0
25%,2014.0,5.0,8.0,1.0,40.712,-73.997,0.324,0.092,0.234,0.261,0.141,0.0,0.0,0.0,13.9,5.6,1.5,2.0
50%,2015.0,8.0,16.0,3.0,40.73,-73.987,0.579,0.217,0.482,0.457,0.157,0.0,0.0,0.0,21.7,12.8,2.1,6.0
75%,2016.0,10.0,23.0,5.0,40.751,-73.975,0.882,0.389,1.171,0.729,0.172,0.5,0.0,0.0,27.8,19.4,2.7,13.0
max,2017.0,12.0,31.0,6.0,40.804,-73.93,11.939,7.748,21.617,7.715,0.218,126.2,279.0,480.0,36.7,28.3,8.2,656.0
