In [6]:
%matplotlib inline
%config Completer.use_jedi=False

In [7]:
import datetime
import glob

import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
from collections import defaultdict

## Cluster Level Biking Distance
Let $A = \{s_{A,1}, \dots, s_{A,N}\}$ and $B = \{s_{B,1}, \dots, s_{B,M}\}$ be two arbitrary clusters. Also, let $N,M,n \in \mathbb{N}$.Define

$$ D(A,B) := \text{Avg}(d_1, \dots, d_n),~d_k \mapsto \text{Dist}(s_{A,i},~s_{B,j})~~~~~~~~~~~~~~~~~~(k \in [1, \dots, n],~i \in N,~j \in M)$$

where $d_1, \dots, d_n$ are **all possible** distances from stations in these clusters.

>* **NOTE:** Routes where there were 0 recorded trips were excluded from the data collection. Thus, there are many routes where there will be NaN values. However, on a clusger level there "shouldn't" be any NaN's.

In [29]:
def D(A, B, cluster_stations):
    all_A = cluster_stations[A]
    all_B = cluster_stations[B]
    
    #print(all_A, all_B)
    
    all_dist = []
    for aA in all_A:
        for aB in all_B:
            try:
                df = pd.read_csv('../../data/routes/bicycling/bicycling_' + str(aA) + '-' + str(aB) + '.csv')
            except Exception as e:
                try:
                    df = pd.read_csv('../../data/routes/bicycling/bicycling_' + str(aB) + '-' + str(aA) + '.csv')
                except Exception as es:
                    continue
            
            dist = df[['distance']].values.tolist()
            
            converted_dist = []
            for d in dist:
                if 'km' in d[0]:
                    converted_dist.append(float(d[0].split()[0]) * 1000)
                elif 'm' in d[0]:
                    converted_dist.append(float(d[0].split()[0]))
                else:
                    raise Exception('Neither Meter or Kilometers label.')
            
            all_dist.append(sum(converted_dist))
    
    #returns NaN if there aren't any observations from these stations
    return np.mean(all_dist)

#### Station -> Cluster mapping

In [30]:
stat_map_clus = pd.read_csv('../../data/Pu_data/station_cluster_mapping.csv')
stat_map_clus.head()

Unnamed: 0,station_id,cluster_ID
0,1,97
1,2,28
2,3,118
3,4,86
4,5,60


In [31]:
uniq_clusters = stat_map_clus.cluster_ID.unique()
uniq_clusters[:3]

array([ 97,  28, 118])

In [32]:
cluster_stations = defaultdict(list)

for uc in uniq_clusters:
    tmp_df = stat_map_clus[stat_map_clus.cluster_ID == uc]
    cluster_stations[uc] = tmp_df.station_id.values.tolist()

In [33]:
cluster_stations[24]

[607, 617, 656]

In [56]:
cluster_stations

defaultdict(list,
            {4: [682],
             5: [598, 668, 730, 753],
             6: [591],
             7: [566, 601],
             12: [678, 694, 705, 708, 709, 743],
             13: [693],
             14: [615, 681],
             15: [599, 635, 644, 655, 686, 696, 707, 711, 761],
             16: [293, 595, 608, 634, 687],
             17: [515, 527, 571, 606, 647, 657, 667, 736],
             18: [442, 555, 650, 652, 663, 740, 741, 742, 754],
             23: [685, 688, 704, 728, 752, 767],
             24: [607, 617, 656],
             25: [596, 597, 616, 619, 671, 727, 729],
             26: [626, 633, 639, 720],
             27: [38, 142, 158, 274, 660, 666],
             28: [2, 145, 168, 212, 398, 559, 611, 622],
             29: [151, 225, 337, 379, 543, 560, 661],
             30: [643, 758],
             34: [621, 629, 665, 684, 724],
             35: [628, 636, 648, 723],
             36: [618, 691, 731, 737, 738, 745],
             37: [573, 651, 739, 757],
  

#### Calculation

In [58]:
possible_cluster_routes = []

keys = cluster_stations.keys()
for k in keys:
    tmp = []
    for k_tmp in cluster_stations.keys():
        if k != k_tmp:
            tmp.append((k, k_tmp))
    assert len(tmp) == 111
    possible_cluster_routes.append(tmp)

In [59]:
avg_D = {}
for pcr in tqdm_notebook(possible_cluster_routes[:2]):
    for ctup in pcr:
        avg_D[ctup] = D(ctup[0], ctup[1], cluster_stations)
        break
    break
        




  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [60]:
df_avg_D = pd.DataFrame(avg_D, index=[0])
df_avg_D

Unnamed: 0_level_0,4
Unnamed: 0_level_1,5
0,


In [39]:
df_avg_D = df_avg_D.T
df_avg_D.columns = ['average_distance']

In [61]:
df_avg_D

Unnamed: 0_level_0,4
Unnamed: 0_level_1,5
0,
