In [6]:
import __init__
from community_analysis import com_trip_dir, com_dir
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file
from taxi_common.sg_grid_zone import get_sg_grid_xy_points, get_sg_zones

#
import pandas as pd
import folium
from temp import generate_3D_graph
import plotly.plotly as py


Matplotlib is building the font cache using fc-list. This may take a moment.



# Datasets for analysis 
* Considered time frames
    * Monday to Thursday
    * **2:00PM to 11:00PM**
* **Only full-time drivers** (single-shift drivers who do not share their vehicle with others)
* **Ignore last-mile trips** (less than 2km)


# Data preprocessing (filtering)
* Grid specification
    * Area of a grid is 0.5km X 0.5km

In [7]:
# Load the grid coordinates and the zone lookup, then centre the base map on the grid's midpoint.
x_points, y_points = get_sg_grid_xy_points()
zones = get_sg_zones()
xc = (x_points[0] + x_points[-1]) / 2.0
yc = (y_points[0] + y_points[-1]) / 2.0

map_osm = folium.Map(location=[yc, xc], zoom_start=12)

# Vertical grid lines: one per x value, spanning the full y range (points are passed as (y, x)).
for x in x_points:
    line_ends = [(y_points[0], x), (y_points[-1], x)]
    map_osm.add_children(folium.PolyLine(locations=line_ends, weight=0.5))
# Horizontal grid lines: one per y value, spanning the full x range.
for y in y_points:
    line_ends = [(y, x_points[0]), (y, x_points[-1])]
    map_osm.add_children(folium.PolyLine(locations=line_ends, weight=0.5))
map_osm

* Daily link process
    * A link can be generated if two drivers pick up passengers at the same location (zone) within 30 minutes
    * A link's weight increases whenever the two drivers who form the link pick up passengers at the same location within 30 minutes
![link_weight_increment](report_source/link_weight_increment.png)
        * A driver can pick up two passengers in the same zone within 30 minutes
            * In that case the link weight could increase by more than one
        * Ensure the link weight increases only once each time a driver picks up a passenger

    * After counting all link's weight, **ignore links whose weight is less than two**



* Annual link process
    * Aggregate daily links
        * For each link, count the number of days on which the link appeared in a year
        * Current dataset is Y2009 (11 months considered, except December)
        * 184 days considered for the analysis
    * Filter out links whose weight is less than a threshold value
        * The results below use a threshold of **92 days** (184 days X 0.5)
            * This is somewhat strict
            * But if the threshold is small, only a few (two or three) communities are detected
            * However, the evolution of communities cannot be checked properly
        * Other threshold values have already been checked

# Community summary

In [12]:
# Use the last entry that get_all_files lists under com_dir as the community result directory.
# NOTE(review): this presumably selects the most recent result; confirm how get_all_files orders names.
target_dir = '%s/%s' % (com_dir, get_all_files(com_dir, '', '')[-1])

# First file in target_dir whose name ends with 'summary.csv' (None when there is no such file).
summary_fn = next(
    (fn for fn in get_all_files(target_dir, '', '') if fn.endswith('summary.csv')),
    None)
if summary_fn is None:
    # Fail here with a clear message instead of handing None to pd.read_csv.
    raise IOError('No *summary.csv file found in %s' % target_dir)
summary_fpath = '%s/%s' % (target_dir, summary_fn)

# Per-community summary table; displayed as the cell output.
df = pd.read_csv(summary_fpath)
df

Unnamed: 0,com-name,num-nodes,num-edges,tie-strength(# of days encounter / # of drivers)
0,COM(0),94,329,373.914894
1,COM(1),92,520,599.543478
2,COM(2),2,1,47.0
3,COM(3),45,309,797.133333
4,COM(4),40,85,246.15
5,COM(5),13,28,230.076923
6,COM(6),4,4,120.75
7,COM(7),3,2,66.0
8,COM(8),3,2,79.333333
9,COM(9),2,1,62.5


In [5]:
# Locate the pickled graph layout inside target_dir (first file whose name ends with 'glayout.pkl').
glayout_fn = next(
    (fn for fn in get_all_files(target_dir, '', '') if fn.endswith('glayout.pkl')),
    None)
if glayout_fn is None:
    # Without this guard the path would become '<target_dir>/None' and fail later, less clearly.
    raise IOError('No *glayout.pkl file found in %s' % target_dir)
glayout_fpath = '%s/%s' % (target_dir, glayout_fn)

# NOTE(review): load_pickle_file presumably unpickles the file; only load pickles produced by this project.
labels, group, layt, Edges = load_pickle_file(glayout_fpath)
fig = generate_3D_graph(labels, group, layt, Edges)
# NOTE(review): 'Sinapore' looks like a typo for 'Singapore'. It is left unchanged because the
# filename presumably identifies an existing plot on the plotly account; confirm before renaming.
py.iplot(fig, filename='Taxi full time driver network in Sinapore')

# Community service location
## Data loading and preprocessing

In [16]:
# Split one grid cell's x-width into five equal slots and record, for each slot,
# the offset of the slot's centre from the cell's centre. These offsets are later
# added to a marker's x coordinate so markers sharing a zone do not overlap.
xaxis_unit = x_points[1] - x_points[0]
xmid = xaxis_unit / 2.0
dx_unit = xaxis_unit / 5.0
adjusts = []
for i in range(5):
    adjusts.append(dx_unit / 2.0 + dx_unit * i - xmid)

In [17]:
# Path of the last '.csv' file that get_all_files lists under com_trip_dir.
trip_csv_names = get_all_files(com_trip_dir, '', '.csv')
target_fpath = '%s/%s' % (com_trip_dir, trip_csv_names[-1])

## Datasets

In [18]:
# Load the community trip records and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=target_fpath)
df.head(n=5)

Unnamed: 0,time,yy,mm,did,cnum,start-long,start-lat,end-long,end-lat,distance,duration,fare,si,sj,ei,ej
0,1231135200,9,1,2499,1,103.85666,1.36743,103.6969,1.34543,24.6,1860,1820,55,28,20,23
1,1231135200,9,1,18580,0,103.83354,1.30369,103.85904,1.32377,5.0,1020,780,50,14,56,19
2,1231135260,9,1,11389,3,103.98984,1.36108,103.93015,1.30685,9.7,600,1060,85,27,71,15
3,1231135260,9,1,34620,1,103.84363,1.28687,103.75095,1.35746,16.8,1260,1280,52,11,32,26
4,1231135260,9,1,17678,0,103.85625,1.30017,103.88064,1.31202,4.8,780,640,55,13,60,16


## Some statistics for each community

In [12]:
# Mean fare per community: pick the column first so only 'fare' is aggregated.
df.groupby(['cnum'])['fare'].mean()

cnum
0    1152.416664
1    1241.423677
3    1396.376040
4    1378.040938
5    1212.603596
Name: fare, dtype: float64

In [13]:
# Mean trip distance per community: pick the column first so only 'distance' is aggregated.
df.groupby(['cnum'])['distance'].mean()

cnum
0     9.548029
1     9.910518
3    13.060772
4    12.517273
5     9.962663
Name: distance, dtype: float64

## Visualization
### Firstly, group by community and start location

In [24]:
# Count trips per (community, start zone); 'did' only serves as the column being counted.
trips_per_start_zone = df.groupby(['cnum', 'si', 'sj'])['did'].count()
sloc = trips_per_start_zone.to_frame('total-num-trip').reset_index()
sloc.head(n=5)

Unnamed: 0,cnum,si,sj,total-num-trip
0,0,3,10,1
1,0,3,13,1
2,0,4,7,1
3,0,4,10,1
4,0,4,12,1


### Choose the top five locations where drivers of a community pick up most often
* For the top-ranked pick-up location of each community, check the top five GPS coordinates
    * The fewer sides a shape has, the more pick-ups occurred there

In [20]:
# One marker colour per community. Both color_map and adjusts (computed in an earlier cell)
# are indexed by the community's position, so each needs at least one entry per community.
color_map = ['red', 'green', 'blue', 'orange', 'black']
top_n = 5  # how many zones / GPS points to highlight per community

com_indices = set(df['cnum'])
map_osm = folium.Map(location=[yc, xc], zoom_start=12)

# cid -> [colour index, (si, sj) of the community's busiest start zone]
top_locations = {}
# Iterate in sorted order: a bare set has no guaranteed order, which would make the
# community -> colour/offset assignment unstable between runs and between maps.
for i, cid in enumerate(sorted(com_indices)):
    com_df = sloc[sloc['cnum'] == cid]
    # sort_values replaces the deprecated DataFrame.sort; keep only the top_n busiest zones.
    ranked_zones = com_df.sort_values(by='total-num-trip', ascending=False).values[:top_n]
    for j, (_, si, sj, trip_num) in enumerate(ranked_zones):
        if cid not in top_locations:
            # First row of the ranking is the busiest zone for this community.
            top_locations[cid] = [i, (si, sj)]
        y, x = zones[(si, sj)].cCoor_gps
        # Shift each community's marker along x so markers sharing a zone stay visible.
        folium.Marker((y, x + adjusts[i]),
                      popup='COM(%d) %d' % (cid, j + 1),
                      icon=folium.Icon(color=color_map[i])
                      ).add_to(map_osm)

# Inside each community's busiest zone, mark the top_n most frequent exact pick-up coordinates.
for cid, (color_i, (si, sj)) in top_locations.items():
    top_loc_df = df[(df['cnum'] == cid) & (df['si'] == si) & (df['sj'] == sj)]
    gps_loc_df = (top_loc_df.groupby(['start-long', 'start-lat'])['did'].count()
                  .to_frame('total-num-trip').reset_index())
    ranked_points = gps_loc_df.sort_values(by='total-num-trip', ascending=False).values[:top_n]
    for j, (gps_long, gps_lat, trip_num) in enumerate(ranked_points):
        # Rank is encoded in the shape: 3 sides for the busiest point, one more side per rank.
        folium.RegularPolygonMarker(
            [gps_lat, gps_long],
            color=color_map[color_i],
            fill_color=color_map[color_i],
            number_of_sides=3 + j,
            radius=5
        ).add_to(map_osm)

# Vertical grid lines: one per x value, spanning the full y range (points are passed as (y, x)).
for x in x_points:
    sx, sy, ex, ey = x, y_points[0], x, y_points[-1]
    map_osm.add_children(folium.PolyLine(locations=[(sy, sx), (ey, ex)], weight=0.5))
# Horizontal grid lines: one per y value, spanning the full x range.
for y in y_points:
    sx, sy, ex, ey = x_points[0], y, x_points[-1], y
    map_osm.add_children(folium.PolyLine(locations=[(sy, sx), (ey, ex)], weight=0.5))
map_osm


sort(columns=....) is deprecated, use sort_values(by=.....)


sort(columns=....) is deprecated, use sort_values(by=.....)



### Results for end locations

In [22]:
# Count trips per (community, end zone); 'did' only serves as the column being counted.
eloc = df.groupby(['cnum', 'ei', 'ej'])['did'].count().to_frame('total-num-trip').reset_index()
com_indices = set(eloc['cnum'])
top_n = 5  # how many end zones to highlight per community

map_osm1 = folium.Map(location=[yc, xc], zoom_start=12)
# Iterate in sorted order: a bare set has no guaranteed order, which would make the
# community -> colour/offset assignment unstable between runs and between maps.
for i, cid in enumerate(sorted(com_indices)):
    com_df = eloc[eloc['cnum'] == cid]
    # sort_values replaces the deprecated DataFrame.sort; keep only the top_n busiest zones.
    ranked_zones = com_df.sort_values(by='total-num-trip', ascending=False).values[:top_n]
    for j, (_, ei, ej, trip_num) in enumerate(ranked_zones):
        y, x = zones[(ei, ej)].cCoor_gps
        # color_map and adjusts come from earlier cells; both are indexed by community position.
        folium.Marker((y, x + adjusts[i]),
                      popup='COM(%d) %d' % (cid, j + 1),
                      icon=folium.Icon(color=color_map[i])
                      ).add_to(map_osm1)

# Vertical grid lines: one per x value, spanning the full y range (points are passed as (y, x)).
for x in x_points:
    sx, sy, ex, ey = x, y_points[0], x, y_points[-1]
    map_osm1.add_children(folium.PolyLine(locations=[(sy, sx), (ey, ex)], weight=0.5))
# Horizontal grid lines: one per y value, spanning the full x range.
for y in y_points:
    sx, sy, ex, ey = x_points[0], y, x_points[-1], y
    map_osm1.add_children(folium.PolyLine(locations=[(sy, sx), (ey, ex)], weight=0.5))
map_osm1


sort(columns=....) is deprecated, use sort_values(by=.....)

