In [1]:
import __init__
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file
#
from IPython.display import HTML, display
import plotly.plotly as py
import igraph as ig
import numpy as np
import pandas as pd
import folium

def text_display(text, font_size):
    display(HTML('<font size=%d>' % font_size + text + '</font>'))

%matplotlib inline 


Matplotlib is building the font cache using fc-list. This may take a moment.



# Datasets
* Periods 
    * Y2009: 11 months are considered (except December)
    * Y2010: 11 months are considered (except October) 
    * Y2011: 7 months are available (01, 06, 07, 08, 09, 11, 12)
    * Y2012: 9 months are available (01, 02, 03, 04, 05, 06, 07, 08, 09) 
    * Filtering in day of week
        * Only dataset recorded on **Monday to Tursday** is considered
        * On the other day of week (Friday, Saturday and Sunday), taxi drivers have different strategy for picking up passengers
* Time frames
    * **2:00 PM to 11:00 PM** (14:00 ~ 23:00)
* **Only single shift drivers** (one shift drivers who don't share his vehicle with others)

# Zone generation
* Define zones which split Singapore in grid form
    * 96 columns and 53 rows (total 5088 zones)
* Area of a grid is 0.5km X 0.5km

In [2]:
from helping_functions import draw_grid_on_map
#
from taxi_common.sg_grid_zone import get_sg_grid_xy_points
#
x_points, y_points = get_sg_grid_xy_points()
xc, yc = (x_points[0] + x_points[-1]) / float(2), (y_points[0] + y_points[-1]) / float(2)

map_osm = folium.Map(location=[yc, xc], zoom_start=11)
map_osm = draw_grid_on_map(map_osm, x_points, y_points)
map_osm

# Data processing for trip instances
* Time frame
    * Set a time frame based on trip's timestamp
* Zone
    * Find the specific zone where the trip occured, based on the start location (GPS coordinates) and end location
    * For simplicity, record zone's grid coordinates
* Example;

In [3]:
from community_analysis import ss_trips_dir, ss_trips_prefix

df = pd.read_csv('%s/%s' % (ss_trips_dir, '%s0901.csv' % ss_trips_prefix))
df.head()

Unnamed: 0,did,timeFrame,zi,zj,groupName,prevDriver,time,day,start-long,start-lat,distance,duration,fare
0,7007,14,65,14,G(0),,1230789600,1,103.90339,1.30458,5.2,420,660
1,35586,14,21,14,G(1),,1230789600,1,103.70418,1.30452,21.0,1440,1910
2,18597,14,51,28,G(2),,1230789600,1,103.83996,1.36531,6.0,660,780
3,22058,14,51,40,G(3),,1230789600,1,103.83793,1.42008,2.0,360,540
4,35583,14,59,29,G(4),,1230789600,1,103.87462,1.36855,7.2,780,800


# Distribution generation
* For each month, count the number of pick-up for time frame and zone
* Get year’s counting about the number of pick-up by aggregating months’ count
* Get the joint distribution
* Example;

# Community detection
## Pick-up distributions of single shift driver generation
* For each single shift driver, generate the distribution like the above joint distribution
## Directed weighted graph generation
    * For each month, generate a directed weighted graph generation as follows;
        * Generate a queue data structure which saves driver ids in arrival order for each zone
        * A link is generated if two drivers pick up passengers at the same zone within 30 minutes
![link_weight_increment](src/link_weight_increment.png)
        * A driver can pick up two passenger at the same zone within 30 minutes 
            * Link's weight can increase more than one
        * Ensure only updating link's weight one time when a driver pick a passenger one
        * Link weight's update with consideration in **removing homophily**
            * Initial weight of the link is 0
            * $D_{1}$: a driver’s joint distribution (time frame and zone)
            * $D_{2}$: the previous driver’s joint distribution
            * $W$: increment, $W=max(0, D_{2}(T=t~and~Z=z) - D_{1}(T=t~and~Z=z))$ 
        * Assumption
            * There is information about all drivers’ pick-up distribution
            * If a driver picks up a passenger at a specific zone at specific time often (many experiences), he can make better decision in whether or not call other drivers


### Graph aggregation
    * Aggregate graphs
    * Type
        * Year
        * Three month rolling horizon (for checking evolution of communities)

## Group partitioning
* Set a threshold values and filtering out meaningless links
    * If there are many links between drivers, it will results in few big communities which include lots of drivers
    * A threshold values represents standard for removing meaningless links
        * Links whose weight is less than a threshold value will be ignored in the following analysis
        * The following results use **99.995 percentile** for the threshold value
* Apply a paritionning algorithm at the modified links (graph)
    * Use [a Python library](https://github.com/vtraag/louvain-igraph), which uses the louvain method described in Fast unfolding of communities in large networks, Vincent D Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Renaud Lefebvre, Journal of Statistical Mechanics: Theory and Experiment 2008(10), P10008 (12pp)

# Analysis results about communies
## Community statistics summary
    * Only consider groups whose minimum number of driver is bigger than 10 as communities

In [4]:
from community_analysis import CHOSEN_PERCENTILE, MIN_NUM_DRIVERS
from community_analysis import group_dir
from taxi_common.file_handling_functions import get_all_files
#
def group_summary(period):
    percentile_dirname = 'percentile(%.3f)' % CHOSEN_PERCENTILE
    percentile_dirpath = '%s/%s' % (group_dir, percentile_dirname)
    dirpath = '%s/%s' % (percentile_dirpath, period)
    headers = ['groupName', '# drivers', '# links', 'sum(weights) / # drivers']
    L_gn, L_numDrivers, L_numLinks, L_tie_strength = range(4)
    df_data = {k: [] for k in headers}
    for group_fn in get_all_files(dirpath, '', '.pkl'):
        _, _, period, g_name = group_fn[:-len('.pkl')].split('-')
        igG = ig.Graph.Read_Pickle('%s/%s' % (dirpath, group_fn))
        drivers = [v['name'] for v in igG.vs]
        if len(drivers) < MIN_NUM_DRIVERS:
            continue
        weights = [e['weight'] for e in igG.es]
        df_data[headers[L_gn]].append(g_name)
        df_data[headers[L_numDrivers]].append(len(drivers))
        df_data[headers[L_numLinks]].append(len(weights))
        df_data[headers[L_tie_strength]].append(sum(weights) / float(len(drivers)))
    df = pd.DataFrame(df_data)[headers]
    return df

In [5]:
text_display('First iteration', 3)
display(HTML(group_summary('0901').to_html()))
#
text_display('Second iteration', 3)
display(HTML(group_summary('0901_').to_html()))
#
text_display('Third iteration', 3)
display(HTML(group_summary('0901__').to_html()))
#
text_display('Fourth iteration', 3)
display(HTML(group_summary('0901___').to_html()))

Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),215,1004,17.555366
1,G(1),140,530,13.716095
2,G(2),82,479,25.994514
3,G(3),50,137,11.046364
4,G(4),45,58,4.771299
5,G(5),23,37,10.314729
6,G(6),22,33,5.729355
7,G(7),10,13,4.89046


Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),164,274,2.380931
1,G(1),130,177,1.954423
2,G(10),22,24,1.767911
3,G(11),20,25,2.082072
4,G(12),11,19,5.066706
5,G(2),112,279,3.492055
6,G(3),111,196,2.484866
7,G(4),95,134,2.323374
8,G(5),88,145,2.379754
9,G(6),77,113,2.773386


Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),205,734,12.508024
1,G(1),136,410,10.687913
2,G(2),76,434,25.286291
3,G(3),58,134,9.217971
4,G(4),47,61,4.719657
5,G(5),23,28,4.474441
6,G(6),15,19,8.646987
7,G(7),10,12,4.029007


Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),127,242,2.876544
1,G(1),109,203,2.840311
2,G(10),33,41,2.117537
3,G(11),32,31,1.565926
4,G(12),29,34,3.38257
5,G(13),26,31,1.779905
6,G(14),11,19,5.078224
7,G(2),108,136,2.240524
8,G(3),108,239,3.268901
9,G(4),88,106,1.751713


## Trip instances including previous community driver

In [6]:
from community_analysis import CHOSEN_PERCENTILE
from community_analysis import com_trips_dir, com_trips_prefix
#
percentile_dirname = 'percentile(%.3f)' % CHOSEN_PERCENTILE
percentile_dirpath = '%s/%s' % (group_dir, percentile_dirname)
#
def get_com_trip(period):
    com_trips_fpath = '%s/%s/%s%s.csv' % (com_trips_dir, percentile_dirname, com_trips_prefix, period)
    df = pd.read_csv(com_trips_fpath)
    return df

In [7]:
df = get_com_trip('0901')
display(HTML(df.tail().to_html()))

Unnamed: 0,did,timeFrame,zi,zj,groupName,prevDriver,time,day,start-long,start-lat,distance,duration,fare
669431,20387,23,53,29,G(4),4425.0,1233244740,29,103.84831,1.36844,9.1,840,1120
669432,8599,23,76,27,G(2284),,1233244740,29,103.95153,1.36121,3.1,480,580
669433,4956,23,51,14,G(0),18631.0,1233244740,29,103.83617,1.30038,17.4,1500,2160
669434,34868,23,54,12,G(1),28094.0,1233244740,29,103.85165,1.29551,4.4,540,1020
669435,33674,23,52,27,G(2230),,1233244740,29,103.84198,1.36097,18.1,1020,1800


## Distribution (Eight biggest communities)

In [8]:
text_display('First iteration', 3)
display(HTML(group_summary('0901').to_html()))

from helping_functions import draw_service_locations
#
df = get_com_trip('0901')
gn_names = ['G(%d)' % i for i in range(8)]
five_com_df = df[(df['groupName'].isin(gn_names))]
#
map_osm = draw_service_locations(five_com_df)
map_osm

Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),215,1004,17.555366
1,G(1),140,530,13.716095
2,G(2),82,479,25.994514
3,G(3),50,137,11.046364
4,G(4),45,58,4.771299
5,G(5),23,37,10.314729
6,G(6),22,33,5.729355
7,G(7),10,13,4.89046



sort(columns=....) is deprecated, use sort_values(by=.....)



In [9]:
text_display('Second iteration', 3)
display(HTML(group_summary('0901_').to_html()))
#
df = get_com_trip('0901_')
gn_names = ['G(%d)' % i for i in range(8)]
five_com_df = df[(df['groupName'].isin(gn_names))]
#
map_osm = draw_service_locations(five_com_df)
map_osm

Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),164,274,2.380931
1,G(1),130,177,1.954423
2,G(10),22,24,1.767911
3,G(11),20,25,2.082072
4,G(12),11,19,5.066706
5,G(2),112,279,3.492055
6,G(3),111,196,2.484866
7,G(4),95,134,2.323374
8,G(5),88,145,2.379754
9,G(6),77,113,2.773386


In [None]:
text_display('Third iteration', 3)
display(HTML(group_summary('0901__').to_html()))
#
df = get_com_trip('0901__')
gn_names = ['G(%d)' % i for i in range(8)]
five_com_df = df[(df['groupName'].isin(gn_names))]
#
map_osm = draw_service_locations(five_com_df)
map_osm

Unnamed: 0,groupName,# drivers,# links,sum(weights) / # drivers
0,G(0),205,734,12.508024
1,G(1),136,410,10.687913
2,G(2),76,434,25.286291
3,G(3),58,134,9.217971
4,G(4),47,61,4.719657
5,G(5),23,28,4.474441
6,G(6),15,19,8.646987
7,G(7),10,12,4.029007


## Contribution and benefit ratio

In [None]:
df = get_com_trip('0901__')
target_group = 'G(3)'
#
# target_group = 'G(3)'
num_trips, num_benefit, num_contribution = {}, {}, {}
com_df = df[(df['groupName'] == target_group)]
count = 0 
for did, prevComDriver in com_df[['did', 'prevDriver']].values:
    if not num_trips.has_key(did):
        num_trips[did] = 0
        num_benefit[did] = 0
    num_trips[did] += 1
    if type(eval(prevComDriver)) != type(eval('None')):
        num_benefit[did] += 1
        if not num_contribution.has_key(eval(prevComDriver)):
            num_contribution[eval(prevComDriver)] = 0
        num_contribution[eval(prevComDriver)] += 1
headers = ['did', '# trips', '# benefits', '# contributions', '% benefit', '% contribution']
L_did, L_num_trips, \
L_num_benefits,  L_num_contributions, \
L_ratio_benefits,  L_ratio_contributions = range(6)
df_data = {k: [] for k in headers}

for did, nt in num_trips.iteritems():
    df_data[headers[L_did]].append(did)
    df_data[headers[L_num_trips]].append(nt)
    nb = num_benefit[did]
    cn = num_contribution[did] if num_contribution.has_key(did) else 0
    df_data[headers[L_num_benefits]].append(nb)
    df_data[headers[L_num_contributions]].append(cn)
    df_data[headers[L_ratio_benefits]].append(nb / float(nt))
    df_data[headers[L_ratio_contributions]].append(cn / float(nt))
df = pd.DataFrame(df_data)[headers]
text_display('% contribution', 5)
text_display('HEAD', 3)
display(HTML(df.sort_values(by='% contribution', ascending=False).head().to_html()))
text_display('TAIL', 3)
display(HTML(df.sort_values(by='% contribution', ascending=False).tail().to_html()))
text_display('% benefit', 5)
text_display('HEAD', 3)
display(HTML(df.sort_values(by='% benefit', ascending=False).head().to_html()))
text_display('TAIL', 3)
display(HTML(df.sort_values(by='% benefit', ascending=False).tail().to_html()))