In [1]:
%matplotlib inline
%config Completer.use_jedi=False

In [2]:
import datetime

import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

##### Helper Functions

In [3]:
def datetime_range(start, end, delta):
    current = start
    while current < end:
        yield current
        current += delta

## Rush-Hour Aggregated Availabilities
**Rush-hour time windows:**
1. [5:30AM, 9:30AM) 
2. [16:00PM, 20:00PM)

**Just before rush-hour time windows:**

1. [5:00AM, 5:30AM)
2. [4:30AM, 5:30AM)
3. [15:30PM, 16:00PM)
4. [15:00PM, 16:00PM)

#### Preprocessing

In [4]:
time_periods = [dt.strftime('%Y-%m-%d %H:%M:%S') for dt in 
                            datetime_range(datetime.datetime(2014, 1, 1, 0), datetime.datetime(2015, 1, 1, 0), 
                            datetime.timedelta(minutes=2))]

def create_time_index(df):
    df['time_period'] = pd.to_datetime(time_periods)
    df.set_index('time_period', inplace=True)
    
    return df

In [5]:
df_bikes = pd.read_csv('../../data/Pu_data/availability/available_bikes.csv', index_col=0)
df_docks = pd.read_csv('../../data/Pu_data/availability/available_docks.csv', index_col=0)

In [6]:
df_bikes = create_time_index(df_bikes)
df_bikes.head()

Unnamed: 0_level_0,Station_1_bikes_available,Station_2_bikes_available,Station_3_bikes_available,Station_4_bikes_available,Station_5_bikes_available,Station_6_bikes_available,Station_7_bikes_available,Station_8_bikes_available,Station_9_bikes_available,Station_10_bikes_available,...,Station_769_bikes_available,Station_770_bikes_available,Station_771_bikes_available,Station_772_bikes_available,Station_773_bikes_available,Station_774_bikes_available,Station_775_bikes_available,Station_776_bikes_available,Station_777_bikes_available,Station_778_bikes_available
time_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:02:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:04:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:06:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:08:00,,,,,,,,,,,...,,,,,,,,,,


In [7]:
df_docks = create_time_index(df_docks)
df_docks.head()

Unnamed: 0_level_0,Station_1_spaces_available,Station_2_spaces_available,Station_3_spaces_available,Station_4_spaces_available,Station_5_spaces_available,Station_6_spaces_available,Station_7_spaces_available,Station_8_spaces_available,Station_9_spaces_available,Station_10_spaces_available,...,Station_769_spaces_available,Station_770_spaces_available,Station_771_spaces_available,Station_772_spaces_available,Station_773_spaces_available,Station_774_spaces_available,Station_775_spaces_available,Station_776_spaces_available,Station_777_spaces_available,Station_778_spaces_available
time_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:02:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:04:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:06:00,,,,,,,,,,,...,,,,,,,,,,
2014-01-01 00:08:00,,,,,,,,,,,...,,,,,,,,,,


#### Calculation
Let

$$A := \text{Count}(\text{timestamps}>= 5~\text{bikes/docks available})$$

and

$$B := \text{total time stamps of non-NA}.$$

Then for all stations we find the availability measure

$$\text{availability_measure} = \frac{A}{B}.$$

In [8]:
def calc_A(df, rush_start, rush_end):
    df_A = pd.DataFrame()
    
    stations = df.columns.tolist()
    
    df_filtered = df.between_time(rush_start, rush_end)
    for s in stations:
        df_tmp = df_filtered[[s]]
        df_tmp = df_tmp[df_tmp[s] >= 5]
        df_A[s] = [len(df_tmp)]
        
    return df_A

In [9]:
def calc_B(df, rush_start, rush_end):
    df_B = pd.DataFrame()
    
    stations = df.columns.tolist()
    
    df_filtered = df.between_time(rush_start, rush_end)
    for s in tqdm_notebook(stations):
        df_tmp = df_filtered[[s]]
        df_tmp = df_tmp.dropna()
        df_B[s] = [len(df_tmp)]
        
    return df_B

In [10]:
def main(df, df_save_type):
    tperiods = [('05:30:00', '09:30:59'), ('16:00:00', '19:59:59'),
                ('05:00:00', '05:29:59'), ('04:30:00', '05:29:59'),
                ('15:30:00', '15:59:59'), ('15:00:00', '15:59:59')]
    
    #bikes
    for t in tperiods:
        df_A, df_B = calc_A(df, t[0], t[1]), calc_B(df, t[0], t[1])
        df_AB = df_A / df_B
        
        df_AB.columns = ['_'.join(c.split('_')[:2]) for c in df_AB.columns]

        df_AB = df_AB.T
        df_AB.columns = ['rush-hour_aggregate_avail']
        df_AB.to_csv('../../data/Pu_data/availability/rush-hour_aggregated_availabilities/' + \
                     df_save_type + '_' + t[0].replace(':', '') + '_' + t[1].replace(':', '') + '.csv')

In [11]:
main(df_bikes, 'bikes')



















In [12]:
main(df_docks, 'docks')



















## Map to Cluster Level
> Just the average of the measure for all stations within a cluster.


In [13]:
def cluster_averages(df_cluster_mappings, df_timeperiod, cluster):
    df_stations = df_cluster_mappings[df_cluster_mappings.cluster_ID == cluster]
    df_stations['index_name'] = ['Station_' + str(s) for s in df_stations.station_id.values]
    df_stations.set_index('index_name', inplace=True)
    
    df_combine = df_stations.join(df_timeperiod, how='left')
    
    return df_combine

In [14]:
def cluster_averages_helper(df_cluster_mappings, time_period):
    df_bikes = pd.read_csv('../../data/Pu_data/availability/rush-hour_aggregated_availabilities/bikes_' + \
                          time_period[0].replace(':', '') + '_' + time_period[1].replace(':', '') + '.csv', index_col=0)
    df_docks = pd.read_csv('../../data/Pu_data/availability/rush-hour_aggregated_availabilities/docks_' + \
                          time_period[0].replace(':', '') + '_' + time_period[1].replace(':', '') + '.csv', index_col=0)
    
    df_cluster_bike_avgs = pd.DataFrame()
    df_cluster_dock_avgs = pd.DataFrame()
    
    clusters = list(set(df_cluster_mappings.cluster_ID.values.tolist()))

    for c in tqdm_notebook(clusters):
        df_bike_combine = cluster_averages(df_cluster_mappings, df_bikes, c)
        df_dock_combine = cluster_averages(df_cluster_mappings, df_docks, c)
    
        df_cluster_bike_avgs[c] = [df_bike_combine['rush-hour_aggregate_avail'].mean()]
        df_cluster_dock_avgs[c] = [df_dock_combine['rush-hour_aggregate_avail'].mean()]
        
    df_cluster_bike_avgs.columns = ['Cluster_' + str(col) for col in df_cluster_bike_avgs.columns.tolist()]
    df_cluster_dock_avgs.columns = ['Cluster_' + str(col) for col in df_cluster_dock_avgs.columns.tolist()]
    
    df_cluster_bike_avgs = df_cluster_bike_avgs.T
    df_cluster_dock_avgs = df_cluster_dock_avgs.T
    
    df_cluster_bike_avgs.columns = ['bikes_aggregate_avail']
    df_cluster_dock_avgs.columns = ['docks_aggregate_avail']
    
    
    df_final = df_cluster_bike_avgs.join(df_cluster_dock_avgs, how='left')
    
    df_final.to_csv('../../data/Pu_data/availability/aggregate_to_cluster_level/' + \
                    time_period[0].replace(':', '') + '_' + time_period[1].replace(':', '') + '.csv')

In [15]:
df_cluster_mappings = pd.read_csv('../../data/Pu_data/station_cluster_mapping.csv')

In [16]:
tperiods = [('05:30:00', '09:30:59'), ('16:00:00', '19:59:59'),
                ('05:00:00', '05:29:59'), ('04:30:00', '05:29:59'),
                ('15:30:00', '15:59:59'), ('15:00:00', '15:59:59')]

In [17]:
for tp in tperiods:
    cluster_averages_helper(df_cluster_mappings, tp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


















