In [1]:
import __init__
#
from helping_functions import draw_grid_on_map, get_com_stats_summary
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file
#
from IPython.display import HTML, display
import plotly.plotly as py
import pandas as pd
import folium

def text_display(text, font_size):
    display(HTML('<font size=%d>' % font_size + text + '</font>'))


Matplotlib is building the font cache using fc-list. This may take a moment.



# Zone generation
* Define zones which split Singapore in grid form
    * 96 columns and 53 rows
* Area of a grid is 0.5km X 0.5km

In [2]:
from taxi_common.sg_grid_zone import get_sg_grid_xy_points
#
x_points, y_points = get_sg_grid_xy_points()
xc, yc = (x_points[0] + x_points[-1]) / float(2), (y_points[0] + y_points[-1]) / float(2)

map_osm = folium.Map(location=[yc, xc], zoom_start=11)
map_osm = draw_grid_on_map(map_osm, x_points, y_points)
map_osm

# Data processing
## Considered trip instance for analysis 
* Time frames
    * Monday to Thursday
    * **2:00PM to 11:00PM**
* **Only full-time drivers** (one shift drivers who don't share his vehicle with others)
* **Ignore last-mile trips** (less than 2km)

## Trip instance process
* Find the specific zone where the trip occured, based on the start location (GPS coordinates) and end location
    * For simplicity, record zone's grid coordinates
* Save each day's instances in a file; the following is an example

In [3]:
from community_analysis import trip_dir
df = pd.read_csv('%s/0901/20090101.csv' % trip_dir)
df.head()

Unnamed: 0,time,did,start-long,start-lat,end-long,end-lat,distance,duration,fare,si,sj,ei,ej
0,1231135200,33404,103.82989,1.35936,103.84033,1.38369,3.7,360,480,49,27,51,32
1,1231135200,33567,103.80901,1.32492,103.76879,1.33745,4.9,840,700,44,19,36,22
2,1231135200,25824,103.76673,1.38488,103.78828,1.31031,11.4,960,920,35,32,40,16
3,1231135200,20504,103.69427,1.34096,103.84461,1.284,20.7,1560,1800,19,22,52,10
4,1231135200,35196,103.90901,1.33148,103.9889,1.31425,12.4,960,1000,67,20,85,17


## Daily link process
* A link can be generated if two drivers pick up passengers at the same zone within 30 minutes
    * Each zone has a queue data structure which saves driver ids in arrival order
* Link's weight increase whenever the two drivers who form the link pick up passengers at the same location within 30 minutes
![link_weight_increment](src/link_weight_increment.png)
    * A driver can pick up two passenger at the same zone within 30 minutes 
        * The number of linkage can increase more than one
    * Ensure only one linkage increment when a driver pick a passenger one
* After counting all link's weight, **ignore links whose weight is less than two**
* An example

In [4]:
from community_analysis import ld_dir
link_daily = load_pickle_file('%s/0901/20090101.pkl' % ld_dir)
for i, (did0, num_trip, link) in enumerate(link_daily):
    print "The total number of trips in a day of driver %s is %d" % (did0, num_trip)
    for num_encounter, did1 in sorted([(num_encounter, did1) for did1, num_encounter in link.iteritems()], reverse=True)[:5]:
        print '\t Encounter with driver %s, %d times' % (did1, num_encounter)
    if i == 1:
        break         

The total number of trips in a day of driver 35234 is 6
	 Encounter with driver 31327, 4 times
	 Encounter with driver 2961, 3 times
	 Encounter with driver 26586, 3 times
	 Encounter with driver 12173, 3 times
	 Encounter with driver 9293, 2 times
The total number of trips in a day of driver 35543 is 6
	 Encounter with driver 5566, 2 times
	 Encounter with driver 37204, 2 times
	 Encounter with driver 33635, 2 times
	 Encounter with driver 29313, 2 times
	 Encounter with driver 15594, 2 times


## Annual link process
* Aggregate daily links
    * For each link, count the number of day the link appeared in a year
    * Current dataset is Y2009 (11 months considered, except December)
    * 184 days considered for the analysis
    * An example

In [5]:
from community_analysis import la_dir
link_annually = load_pickle_file('%s/sample-2009-CD(184)-N(7003)-E(5717371).pkl' % la_dir)
#
for i, ((did0, did1), num_encounter) in enumerate(link_annually.iteritems()):
    print 'Both drivers %d and %d picked up passengers at the same zones %d times in 2009' % (did0, did1, num_encounter)
    if i == 4:
        break


Both drivers 18042 and 18426 picked up passengers at the same zones 113 times in 2009
Both drivers 21951 and 35377 picked up passengers at the same zones 104 times in 2009
Both drivers 2481 and 34581 picked up passengers at the same zones 94 times in 2009
Both drivers 27834 and 36772 picked up passengers at the same zones 130 times in 2009
Both drivers 1112 and 34015 picked up passengers at the same zones 125 times in 2009


## Community detection
* Set a threshold values and filtering out meaningless links
    * If there are many links between drivers, it will results in few big communities which include lots of drivers
    * A threshold values represents standard for removing meaningless links
        *links whose weight is less than a threshold value will be ignored in the following analysis
    * Some statistics about communities depending on a threshold value
* Num of drivers, Num of community, Avg. drivers per community, S.D, Skewness, List of communities (community ID, # of drivers)


In [6]:
df = get_com_stats_summary()
df

Unnamed: 0,Threshold value (Day),Num of community,Num of drivers,Average,Median,SD,Skewness,Kurtosis,"List of communities, (community ID, # of drivers)"
0,18,11,3740,340.0,3.0,636.163815,2.250209,5.226735,"{COM(0):2034},{COM(1):708},{COM(2):807},{COM(3..."
1,36,13,2104,161.846154,6.0,273.897744,2.356558,6.048839,"{COM(0):956},{COM(1):173},{COM(2):424},{COM(3)..."
2,55,14,1044,74.571429,16.5,117.391989,1.961754,3.283762,"{COM(0):17},{COM(1):274},{COM(2):385},{COM(3):..."
3,73,15,574,38.266667,6.0,63.024334,1.934876,2.698034,"{COM(0):171},{COM(1):195},{COM(2):11},{COM(3):..."
4,82,14,427,30.5,8.5,43.957584,1.712761,1.798229,"{COM(0):132},{COM(1):119},{COM(2):5},{COM(3):5..."
5,92,13,311,23.923077,4.0,33.878365,1.521744,1.029959,"{COM(0):94},{COM(1):3},{COM(2):92},{COM(3):2},..."


## Analysis
* The following resutls are generated after fixing a threshold value (TH)
    * Two threshold values, 82 and 92 will be considered

### Community statistics summary (TH = 92)

In [7]:
from community_analysis import com_dir
df = pd.read_csv('%s/%s' % (com_dir, '2009-CD(184)-thD(92)/2009-CD(184)-thD(92)-community-summary.csv'))
df

Unnamed: 0,com-name,num-nodes,num-edges,tie-strength(# of days encounter / # of drivers)
0,COM(0),94,329,373.819149
1,COM(1),3,2,75.666667
2,COM(2),92,518,597.532609
3,COM(3),2,1,47.0
4,COM(4),13,27,222.615385
5,COM(5),45,308,794.977778
6,COM(6),4,4,120.75
7,COM(7),40,85,246.075
8,COM(8),3,2,66.0
9,COM(9),3,2,79.333333


### Community analysis (TH = 92)
#### Datasets
* Only consider top five high tie-strength communies
* At 'by_com' column, 'O' represents whether there was a previous trip that satisfies following conditions;
    * the previous trip occurred within 30 minutes
    * the previous trip occurred in the same zone
    * the previous trip's driver belongs to the same community 

In [8]:
from community_analysis import ctrip_dir
df = pd.read_csv('%s/%s' % (ctrip_dir, '2009-CD(184)-thD(92)-ctrip.csv'))
df[100:105]

Unnamed: 0,time,yy,mm,did,cn,by_com,start-long,start-lat,end-long,end-lat,distance,duration,fare,si,sj,ei,ej
100,1231136760,9,1,20944,COM(0),X,103.80581,1.28818,103.81216,1.3262,7.9,900,780,44,11,45,19
101,1231136760,9,1,3351,COM(7),X,103.98635,1.35445,103.80544,1.28804,26.0,1320,2100,84,25,44,11
102,1231136760,9,1,34530,COM(2),O,103.89352,1.31773,103.93779,1.35568,9.2,780,820,63,17,73,26
103,1231136760,9,1,19439,COM(0),O,103.9453,1.35308,103.97001,1.35826,4.8,720,640,75,25,80,26
104,1231136820,9,1,18042,COM(0),O,103.83371,1.3042,103.99036,1.36081,22.0,1260,1540,50,14,85,27


#### Statistics

In [19]:

df.groupby(['cn']).count()['duration'].apply(lambda x : (x / float(182)) / float(9))


cn
COM(0)    128.299756
COM(2)    118.195971
COM(4)     15.926129
COM(5)     38.606838
COM(7)     36.446886
Name: duration, dtype: float64

In [9]:
df['by_com'] = df['by_com'].apply(lambda x: 1 if (x == 'O' or x == 1) else 0)
df.groupby(['cn']).mean().loc[:, ['fare', 'distance', 'duration', 'by_com']].reset_index()

Unnamed: 0,cn,fare,distance,duration,by_com
0,COM(0),1152.416664,9.548029,975.137113,0.933482
1,COM(2),1241.423677,9.910518,974.602929,0.92667
2,COM(4),1212.603596,9.962663,988.377353,0.708552
3,COM(5),1396.37604,13.060772,1097.991081,0.834182
4,COM(7),1378.040938,12.517273,1068.784925,0.823668


#### Service location
* Firstly, group by community and start location
* Choose top five location where drivers of a community mostly pick up
    * For the first location where many pick-up occured, check the top five gps coordinates
        * The less the shape's side, the more pick-up occured

In [10]:
from helping_functions import draw_service_locations
map_osm = draw_service_locations(df)
map_osm


sort(columns=....) is deprecated, use sort_values(by=.....)


sort(columns=....) is deprecated, use sort_values(by=.....)



#### Evolution
![link_weight_increment](src/com_link.png)
* Process trip instances again considering top five communities
    * Only concern about links that one of drivers belongs **to the communities**
    * Weight of link represents the number of days
* Calculate statistics for each community with rolling horizon (3 months)
    * Aggregate 3 months links and construct a directed graph (network)
    * Only concern about links that one of drivers belongs **to the community**
    * Notations
        * whole-driver-num: the number of whole drivers who consist the network
        * com-driver-num: the number of drivers who are the member of the community
        * com-driver-num: the number of drivers who do not belong to the community
        * ...
        * node-order: driver IDs ordered by centrality
    * example

In [11]:
from community_analysis import cevol_dir
df = pd.read_csv('%s/%s' % (cevol_dir, '2009-CD(184)-thD(92)/2009-CD(184)-thD(92)-COM(0)-evolution.csv'))
df.loc[:1]

Unnamed: 0,duration,num-days,cn,whole-driver-num,com-driver-num,out-driver-num,com-driver-num-ratio,whole-link-num,com-link-num,out-link-num,com-link-num-ratio,whole-link-weight (# of encounter days),com-link-weight,out-link-weight,com-link-weight-ratio,node-order
0,0901-0903,50,COM(0),4746,94,4652,0.019806,458591,8742,449849,0.019063,13241658,350379,12891279,0.02646,"(18885, 87, 28942, 17678, 3606, 19728, 5860, 3..."
1,0902-0904,51,COM(0),4715,94,4621,0.019936,455275,8739,446536,0.019195,13861332,365315,13496017,0.026355,"(18885, 19728, 87, 1359, 3606, 20728, 28942, 1..."


##### Summary

In [13]:
from community_analysis import top5_com_dir

com_drivers = load_pickle_file('%s/%s' % (top5_com_dir, '2009-CD(184)-thD(92).pkl'))
for cn in ['COM(0)', 'COM(7)', 'COM(4)']:
    df = pd.read_csv('%s/%s' % (cevol_dir, '2009-CD(184)-thD(92)/2009-CD(184)-thD(92)-%s-evolution.csv' % cn))
    text_display(cn, 5)
    display(HTML(df.loc[:, ['duration', 'num-days', 'whole-driver-num', 'com-driver-num',  'com-driver-num-ratio', 'com-link-num-ratio', 'com-link-weight-ratio']].to_html()))
    core_members_by_evol = None
    for v in df['node-order'].values:
        if not core_members_by_evol:
            core_members_by_evol = set(eval(v))
        else:
            cur_members = set(eval(v))
            core_members_by_evol.intersection_update(cur_members)
    origin_com_drivers = set(com_drivers[cn])
    diff_mem = core_members_by_evol.difference(origin_com_drivers)
    text_display('Members existing always (%d) - original community (%d) = %d' % (len(core_members_by_evol), len(origin_com_drivers), len(diff_mem)), 3)


Unnamed: 0,duration,num-days,whole-driver-num,com-driver-num,com-driver-num-ratio,com-link-num-ratio,com-link-weight-ratio
0,0901-0903,50,4746,94,0.019806,0.019063,0.02646
1,0902-0904,51,4715,94,0.019936,0.019195,0.026355
2,0903-0905,51,4747,93,0.019591,0.018876,0.026074
3,0904-0906,50,4747,94,0.019802,0.019133,0.026323
4,0905-0907,50,4723,94,0.019903,0.019185,0.026858
5,0906-0908,51,4654,94,0.020198,0.019416,0.027112
6,0907-0909,51,4665,94,0.02015,0.019377,0.027093
7,0908-0910,50,4767,92,0.019299,0.018652,0.026282
8,0909-0911,50,4827,92,0.019059,0.018486,0.025521


Unnamed: 0,duration,num-days,whole-driver-num,com-driver-num,com-driver-num-ratio,com-link-num-ratio,com-link-weight-ratio
0,0901-0903,50,4744,40,0.008432,0.007992,0.01278
1,0902-0904,51,4713,40,0.008487,0.008001,0.01293
2,0903-0905,51,4748,40,0.008425,0.007923,0.013084
3,0904-0906,50,4746,40,0.008428,0.007928,0.013347
4,0905-0907,50,4722,40,0.008471,0.007982,0.013466
5,0906-0908,51,4653,40,0.008597,0.008091,0.013361
6,0907-0909,51,4663,40,0.008578,0.008084,0.013005
7,0908-0910,50,4760,39,0.008193,0.007726,0.012691
8,0909-0911,50,4823,39,0.008086,0.007658,0.012315


Unnamed: 0,duration,num-days,whole-driver-num,com-driver-num,com-driver-num-ratio,com-link-num-ratio,com-link-weight-ratio
0,0901-0903,50,4745,13,0.00274,0.002411,0.003677
1,0902-0904,51,4709,13,0.002761,0.00244,0.003622
2,0903-0905,51,4744,13,0.00274,0.002443,0.003613
3,0904-0906,50,4741,12,0.002531,0.002213,0.003595
4,0905-0907,50,4717,13,0.002756,0.002461,0.003786
5,0906-0908,51,4651,13,0.002795,0.002474,0.003851
6,0907-0909,51,4662,13,0.002789,0.002467,0.003822
7,0908-0910,50,4758,13,0.002732,0.002453,0.003605
8,0909-0911,50,4818,12,0.002491,0.002189,0.003458


### Community statistics summary (TH = 82)
#### Statistics

In [14]:
from community_analysis import ctrip_dir
df = pd.read_csv('%s/%s' % (ctrip_dir, '2009-CD(184)-thD(92)-ctrip.csv'))
df['by_com'] = df['by_com'].apply(lambda x: 1 if (x == 'O' or x == 1) else 0)
df.groupby(['cn']).mean().loc[:, ['fare', 'distance', 'duration', 'by_com']].reset_index()

Unnamed: 0,cn,fare,distance,duration,by_com
0,COM(0),1152.416664,9.548029,975.137113,0.933482
1,COM(2),1241.423677,9.910518,974.602929,0.92667
2,COM(4),1212.603596,9.962663,988.377353,0.708552
3,COM(5),1396.37604,13.060772,1097.991081,0.834182
4,COM(7),1378.040938,12.517273,1068.784925,0.823668


#### Service location

In [15]:
map_osm = draw_service_locations(df)
map_osm