## **Bus Network Graph: Experiments on real data**
-----------------------

In [37]:
from network import *
from stop import Stop
from variant import Variant
from path import Path
from queries import StopQuery, VariantQuery, PathQuery

from ipyleaflet import Map, GeoJSON
import pandas as pd
import plotly.express as px
from tqdm import tqdm

import timeit


In [128]:
net_analysis = NetworkAnalysisBetweenness()
dijkstra = BusNetworkDijkstra()
net = BusNetwork.from_ndjsons(sides_set_type='spatial')

sides_set_type =  spatial


100%|██████████| 297/297 [00:10<00:00, 28.92it/s]


In [127]:
stops = StopQuery.from_ndjson()
vars = VariantQuery.from_ndjson()

### Degree distribution of *Stop*s

In [152]:
def mod_row(row):
    if row['StopType'] in [
        'Ô sơn',
        'Biển treo',
        'Trạm tạm',
        'Bến bãi QH chung QH',
        'Ga Metro Số 2',
        'Bến Bãi QH 568'
    ]:
        row['StopType'] = 'Others'

    return row

df_nodes = pd.DataFrame([
    stop.to_dict()
    for stop in net.nodes.values()
]).apply(mod_row, axis=1)

df_nodes['Degree'] = df_nodes['Routes'].apply(lambda s: len(s.split()))
df_nodes = df_nodes[['StopId', 'StopType', 'Degree']]
df_nodes


Unnamed: 0,StopId,StopType,Degree
0,35,Bến xe,26
1,7276,Trụ dừng,21
2,7277,Trụ dừng,19
3,7278,Nhà chờ,14
4,7265,Nhà chờ,5
...,...,...,...
4392,7682,Others,1
4393,7683,Others,1
4394,7684,Others,1
4395,7685,Others,1


In [153]:
bins = [*range(15), 15, 50]
lens = df_nodes.groupby(pd.cut(df_nodes['Degree'], bins=bins)).apply(len)

bins[-1] = '15+'
fig = px.bar(y=bins[1:], x=lens, height=800, width=1400, log_x=True, text_auto=True, color=lens, orientation='h')
fig.update_yaxes(type='category')

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(
    title="Degree distribution of bus stops",
    xaxis_title="Count",
    yaxis_title="Degree (# of routes passing)",
    # legend_title="Legend Title",
    font=dict(
        size=18,
        # color="RebeccaPurple"
    )
)
# fig.update_traces(textposition='outside')

fig.show()





In [154]:
bins = [*range(15), 15, 50]
lens_obj = df_nodes.groupby('StopType').apply(
    # # print
    # axis='index',
    lambda g: (g.groupby(pd.cut(df_nodes['Degree'], bins=bins)).apply(len))
    # lambda g: g.value_counts()
).stack().to_dict()

lens_obj

def get_label(label):
    if label.right > 15:
        return '15+'
    return str(label.right)

lens_df = pd.DataFrame([
    {
        'StopType': stop_type,
        'Label': get_label(label),
        'Count': count
    }
    for (stop_type, label), count in lens_obj.items()
])
lens_df

fig = px.bar(
    lens_df, x='Count', y='Label', facet_col='StopType', facet_col_wrap=2,
    height=1350, width=1800, color='Count',
    log_x=True, text_auto=True, orientation='h'
)
fig.update_yaxes(title="Degree (# of routes passing)", type='category')
fig.update_xaxes(title="Count")

fig.update_layout(
    title="Degree distribution",
    # legend_title="Legend Title",
    font=dict(
        size=16,
    )
)

fig.show()







### Relationship between Route distance and running time

In [163]:
df_vars = pd.DataFrame([var.to_dict() for var in vars._objs.values()])

df_vars['RouteVarName'] = df_vars['StartStop'] + ' - ' + df_vars['EndStop']
df_vars['Speed'] = df_vars['Distance'] / df_vars['RunningTime'] * 0.06
df_vars['RouteId'] = df_vars['RouteIds'].apply(lambda ids: ids[0])
df_vars['RouteVarId'] = df_vars['RouteIds'].apply(lambda ids: ids[1])
df_vars = df_vars[['RouteId', 'RouteVarId', 'RouteNo', 'RouteVarName', 'Distance', 'RunningTime', 'Speed']]
df_vars

Unnamed: 0,RouteId,RouteVarId,RouteNo,RouteVarName,Distance,RunningTime,Speed
0,3,5,03,Bến xe buýt Sài Gòn - THẠNH LỘC,21456.0,70,18.390857
1,3,6,03,THẠNH LỘC - Bến xe buýt Sài Gòn,21704.0,70,18.603429
2,1,1,01,Công Trường Mê Linh - Bến xe Chợ Lớn,8381.0,35,14.367429
3,1,2,01,Bến xe Chợ Lớn - Công Trường Mê Linh,9458.0,35,16.213714
4,7,13,07,Bến xe Chợ Lớn - BÃI HẬU CẦN SỐ 1,15907.0,70,13.634571
...,...,...,...,...,...,...,...
292,314,2,HS-94,Nguyen Van Huong - TH Nguyen Bỉnh Khiem,11142.0,80,8.356500
293,313,1,HS-93,Thao Dien - TH Nguyen Bỉnh Khiem,12986.0,90,8.657333
294,313,2,HS-93,TH Nguyen Bỉnh Khiem - Thao Dien,7277.0,80,5.457750
295,337,1,SWB1,Ga tàu thuỷ Bạch Đằng - Ga tàu thuỷ Linh Đông,11680.0,52,13.476923


In [165]:
# px.histogram(filtered_vars_df['Distance'], nbins=100)
fig = px.scatter(
    df_vars, y='Distance', x='RunningTime', marginal_x='box', marginal_y='box', width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": False},
    hover_data=['RouteVarName', 'RouteId', 'RouteVarId'],
    color='Speed', color_continuous_scale=px.colors.sequential.Agsunset, range_color=[5, 40]
)
fig.update_layout(
    title="Relationship between Distance and RunningTime",
    # legend_title="Legend Title",
    font=dict(
        size=20,
    )
)
fig.show()

### Running time of construction algorithms

In [63]:
def run_gen(sides_set_type: str):
    BusNetwork.from_ndjsons(sides_set_type=sides_set_type)

NO_RUNS = 5

times = {}
for mode in ['default', 'spatial']:
    print('Running mode', mode)
    times[mode] = [
        timeit.timeit("run_gen(sides_set_type=mode)", globals=globals(), number=1)
        for _ in range(NO_RUNS)
    ]

times

Running mode default
sides_set_type =  default


100%|██████████| 297/297 [00:42<00:00,  6.92it/s]


sides_set_type =  default


100%|██████████| 297/297 [00:42<00:00,  7.00it/s]


sides_set_type =  default


100%|██████████| 297/297 [00:39<00:00,  7.54it/s]


sides_set_type =  default


100%|██████████| 297/297 [00:39<00:00,  7.60it/s]


sides_set_type =  default


100%|██████████| 297/297 [00:38<00:00,  7.71it/s]


Running mode spatial
sides_set_type =  spatial


100%|██████████| 297/297 [00:07<00:00, 38.92it/s]


sides_set_type =  spatial


100%|██████████| 297/297 [00:07<00:00, 38.38it/s]


sides_set_type =  spatial


100%|██████████| 297/297 [00:07<00:00, 38.87it/s] 


sides_set_type =  spatial


100%|██████████| 297/297 [00:07<00:00, 39.59it/s]


sides_set_type =  spatial


100%|██████████| 297/297 [00:07<00:00, 39.40it/s]


{'default': [43.38411490013823,
  42.890710399951786,
  39.85574440006167,
  39.51483920007013,
  38.97113410010934],
 'spatial': [8.083511600038037,
  8.161453299922869,
  8.059146600076929,
  7.91556299990043,
  7.955796599853784]}

In [99]:
df_times = pd.DataFrame(times, index=['Run #{}'.format(x) for x in range(1, 1 + NO_RUNS)])

NO_ITERS = 297
for mode in ['default', 'spatial']:
    df_times[mode + '_it_per_sec'] = NO_ITERS / df_times[mode]

df_times = df_times[['default', 'default_it_per_sec', 'spatial', 'spatial_it_per_sec']]
df_times

Unnamed: 0,default,default_it_per_sec,spatial,spatial_it_per_sec
Run #1,43.384115,6.845824,8.083512,36.741458
Run #2,42.89071,6.924576,8.161453,36.390578
Run #3,39.855744,7.451874,8.059147,36.852537
Run #4,39.514839,7.516164,7.915563,37.52102
Run #5,38.971134,7.621025,7.955797,37.331271


In [101]:
fin = df_times.sum() / len(df_times)
fin

default               40.923309
default_it_per_sec     7.271893
spatial                8.035094
spatial_it_per_sec    36.967373
dtype: float64

In [102]:
fin['default'] / fin['spatial']

5.0930714039938225

### Top $k = 10$ most influential *Stop*s

In [34]:
net_analysis.from_net(net=net, dijkstra_engine=dijkstra, alg='tree')

100%|██████████| 4397/4397 [02:51<00:00, 25.58it/s] 


In [40]:
top_stops_id = net_analysis.top_scores(10)
for stop_id in top_stops_id:
    print(stop_id, net_analysis.scores[stop_id])

268 2625614
267 2625614
1239 2614406
1115 2613916
1393 2607219
1152 2604321
270 2311589
271 2310622
174 2294866
510 2261665


In [41]:
for stop_id in net_analysis.top_scores():
    print(stops[stop_id])

| StopID:              268                       |
| Code:                QTB 075                   |
| Name:                Mũi tàu Cộng Hòa          |
| Type:                Trụ dừng                  |
| Zone:                Quận Tân Bình             |
| Ward:                                          |
| Address No.:         19B1                      |
| Street:              Trường Chinh              |
| Support Disability?: False                     |
| Status:              Đang khai thác            |
| Lng:                 106.634696                |
| Lat:                 10.808078                 |
| Search tokens:       MtCH, 19B1, TC            |
| Routes:              04 -> 104 -> 145 -> 16 -> 23 -> 27 -> 28 -> 62 -> 65
| StopID:              267                       |
| Code:                QTB 076                   |
| Name:                Ngã ba Chế Lan Viên       |
| Type:                Nhà chờ                   |
| Zone:                Quận Tân Bình             |
| Ward

In [98]:
im_stops_df = pd.DataFrame(
    {
        **stops[stop_id].to_dict(),
        "Score": net_analysis.scores[stop_id]
    }
    for stop_id in net_analysis.top_scores(4397)
)
im_stops_df = im_stops_df[['StopId', 'Code', 'Name', 'StopType', 'Zone', 'Street', 'Lng', 'Lat', 'Score']].sort_values(by='Score')
im_stops_df

Unnamed: 0,StopId,Code,Name,StopType,Zone,Street,Lng,Lat,Score
4396,7529,BX_bautram,Duong Bau Tram,Bến xe,Học Sinh,duong bau tram,106.577607,10.992852,2
4395,7526,BX_THCStrungan,THCS Trung An,Bến xe,Học Sinh,trung hoc co so trung an,106.591863,11.000770,2
4393,7499,BX_tamthonhiep4,Tam Thon Hiep,Bến xe,Học Sinh,tam thon hiep,106.856676,10.594148,3
4394,7498,BX_THPTannghia,THPT An Nghia,Bến xe,Học Sinh,tam thon hiep,106.830448,10.588990,3
4391,7510,BX_THCSbachdang,THCS Bach Dang,Bến xe,Học Sinh,THCS bach dang,106.677433,10.789388,5
...,...,...,...,...,...,...,...,...,...
4,1393,HHM 056,Ngã tư Trung Chánh,Nhà chờ,Huyện Hóc Môn,Quốc lộ 22,106.608152,10.853595,2607219
3,1115,Q12 122,Bến xe An Sương,Trụ dừng,Quận 12,Quốc lộ 22,106.613517,10.845803,2613916
2,1239,HHM 058,Bến xe An Sương,Trụ dừng,Huyện Hóc Môn,Quốc lộ 22,106.613522,10.845187,2614406
1,267,QTB 076,Ngã ba Chế Lan Viên,Nhà chờ,Quận Tân Bình,Trường Chinh,106.633349,10.812204,2625614


In [126]:
fig = px.scatter_geo(im_stops_df, lat='Lat', lon='Lng', color='Score', width=1400, height=1000, color_continuous_scale='rdpu', range_color=[0, 2000000])
fig.update_layout(
    title='Spatial distribution of Betweenness score',
    font=dict(
        size=17
    ),
    geo=dict(
        scope = 'asia',
        resolution = 110,
        lonaxis_range= [106.4, 106.9],
        lataxis_range= [10.6, 11],
        landcolor = 'rgb(240, 240, 240)',
    )
)
fig.show()

##### Table of Top $k=10$ *Stop*s

In [39]:
stop_list = [
    {
        **stops[stop_id].to_dict(),
        "Score": net_analysis.scores[stop_id]
    }
    for stop_id in net_analysis.top_scores()
]

df = pd.DataFrame(stop_list, index=pd.RangeIndex(1, 11))
df['Routes'] = df['Routes'].apply(lambda routes: len(routes.split()))
df['Location'] = df['Street'] + ', ' + df['Zone']

df = df[['StopId', 'Name', 'Location', 'Street', 'Zone', 'StopType', 'Score']]

df

Unnamed: 0,StopId,Name,Location,Street,Zone,StopType,Score
1,268,Mũi tàu Cộng Hòa,"Trường Chinh, Quận Tân Bình",Trường Chinh,Quận Tân Bình,Trụ dừng,2625614
2,267,Ngã ba Chế Lan Viên,"Trường Chinh, Quận Tân Bình",Trường Chinh,Quận Tân Bình,Nhà chờ,2625614
3,1239,Bến xe An Sương,"Quốc lộ 22, Huyện Hóc Môn",Quốc lộ 22,Huyện Hóc Môn,Trụ dừng,2614406
4,1115,Bến xe An Sương,"Quốc lộ 22, Quận 12",Quốc lộ 22,Quận 12,Trụ dừng,2613916
5,1393,Ngã tư Trung Chánh,"Quốc lộ 22, Huyện Hóc Môn",Quốc lộ 22,Huyện Hóc Môn,Nhà chờ,2607219
6,1152,Trung tâm Văn hóa Quận 12,"Quốc lộ 22, Quận 12",Quốc lộ 22,Quận 12,Nhà chờ,2604321
7,270,UBND Phường 15,"Trường Chinh, Quận Tân Bình",Trường Chinh,Quận Tân Bình,Nhà chờ,2311589
8,271,Khu Công Nghiệp Tân Bình,"Trường Chinh, Quận Tân Bình",Trường Chinh,Quận Tân Bình,Nhà chờ,2310622
9,174,Trạm Dệt Thành Công,"Trường Chinh, Quận Tân Phú",Trường Chinh,Quận Tân Phú,Nhà chờ,2294866
10,510,Bệnh viện Quận Tân Bình,"Hoàng Văn Thụ, Quận Tân Bình",Hoàng Văn Thụ,Quận Tân Bình,Nhà chờ,2261665


### # of edge candidates $|E|$ per *Stop*

In [6]:
def get_df(mode):
    return pd.DataFrame(
        {
            "RouteId": route_id,
            "RouteVarId": route_var_id,
            "PathLength": path_len,
            **pd.Series(per_stop).describe()
        }
        for (route_id, route_var_id), path_len, per_stop in BusNetwork._analyse_sides_set(sides_set_type=mode)
    )

In [7]:
df_spatial = get_df('spatial')
df_spatial['Type'] = 'spatial'

df_default = get_df('default')
df_default['Type'] = 'default'

sides_set_type =  spatial


100%|██████████| 297/297 [00:05<00:00, 53.04it/s]


sides_set_type =  default


100%|██████████| 297/297 [00:00<00:00, 622.27it/s]


In [19]:
df_all = pd.concat([df_default, df_spatial]).reset_index().drop(columns='index')
df_all

Unnamed: 0,RouteId,RouteVarId,PathLength,count,mean,std,min,25%,50%,75%,max,Type
0,3,5,483,53.0,483.0,0.000000,483.0,483.00,483.0,483.00,483.0,default
1,7,14,222,50.0,222.0,0.000000,222.0,222.00,222.0,222.00,222.0,default
2,1,1,249,29.0,249.0,0.000000,249.0,249.00,249.0,249.00,249.0,default
3,3,6,327,56.0,327.0,0.000000,327.0,327.00,327.0,327.00,327.0,default
4,7,13,235,44.0,235.0,0.000000,235.0,235.00,235.0,235.00,235.0,default
...,...,...,...,...,...,...,...,...,...,...,...,...
589,313,1,99,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0,spatial
590,314,2,73,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,spatial
591,313,2,30,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,spatial
592,337,1,54,5.0,6.8,3.898718,3.0,4.00,5.0,11.00,11.0,spatial


In [31]:
df_default['mean'].describe()

count    297.000000
mean     247.265993
std      155.000806
min        6.000000
25%      154.000000
50%      240.000000
75%      333.000000
max      781.000000
Name: mean, dtype: float64

In [32]:
df_spatial['mean'].describe()

count    297.000000
mean       6.320350
std        2.569525
min        1.000000
25%        4.650000
50%        6.275000
75%        7.660000
max       14.040000
Name: mean, dtype: float64

In [25]:
fig = px.imshow(df_default.drop(columns=['Type', 'RouteId', 'RouteVarId']).corr())
fig.show()

In [27]:
fig = px.scatter(
    df_all, x='PathLength', y='count', marginal_x='box', marginal_y='box', width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": False},
    hover_data=['RouteId', 'RouteVarId', 'count', 'mean'],
    color='Type'
)
fig.show()

In [28]:
fig = px.scatter(
    df_all, x='PathLength', y='mean', marginal_x='box', marginal_y='box', log_y=True, width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": True},
    hover_data=['RouteId', 'RouteVarId', 'count', 'mean'],
    color='Type'
)

fig.update_xaxes(title="# of Path sides")
fig.update_yaxes(title="# of candidate edges |E|")

fig.update_layout(
    font=dict(
        size=20,
    )
)
fig.show()

### Running time between Betweenness Analysis algorithms

In [88]:
def run_analysis(alg: str):
    _net_analysis = NetworkAnalysisBetweenness()
    _net_analysis.from_net(net=net, dijkstra_engine=None, alg=alg)

NO_RUNS = 5

times = {}
for mode in ['default', 'tree']:
    print('Running mode', mode)
    times[mode] = [
        timeit.timeit("run_analysis(alg=mode)", globals=globals(), number=1)
        for _ in range(NO_RUNS)
    ]

times

Running mode default


100%|██████████| 4397/4397 [09:28<00:00,  7.74it/s]
100%|██████████| 4397/4397 [08:54<00:00,  8.22it/s]
100%|██████████| 4397/4397 [07:40<00:00,  9.55it/s]
100%|██████████| 4397/4397 [07:30<00:00,  9.76it/s]
100%|██████████| 4397/4397 [07:16<00:00, 10.06it/s]


Running mode tree


100%|██████████| 4397/4397 [02:28<00:00, 29.56it/s]
100%|██████████| 4397/4397 [02:21<00:00, 31.05it/s]
100%|██████████| 4397/4397 [02:23<00:00, 30.59it/s]
100%|██████████| 4397/4397 [02:17<00:00, 32.01it/s]
100%|██████████| 4397/4397 [02:17<00:00, 31.92it/s]


{'default': [568.2074843998998,
  534.7812418001704,
  460.3806853001006,
  450.74478500010446,
  436.9792862001341],
 'tree': [148.74347339989617,
  141.63267030008137,
  143.76318740006536,
  137.3843427998945,
  137.74046280002221]}

In [122]:
df_times = pd.DataFrame(times, index=['Run #{}'.format(x) for x in range(1, 1 + NO_RUNS)])

NO_ITERS = 4397
for mode in ['default', 'tree']:
    df_times[mode + '_it_per_sec'] = NO_ITERS / df_times[mode]

df_times = df_times[['default', 'default_it_per_sec', 'tree', 'tree_it_per_sec']]
df_times

Unnamed: 0,default,default_it_per_sec,tree,tree_it_per_sec
Run #1,568.207484,7.73837,148.743473,29.560961
Run #2,534.781242,8.222054,141.63267,31.045097
Run #3,460.380685,9.550792,143.763187,30.58502
Run #4,450.744785,9.754966,137.384343,32.005103
Run #5,436.979286,10.062262,137.740463,31.922355


In [123]:
fin = df_times.sum() / len(df_times)
fin

default               490.218697
default_it_per_sec      9.065689
tree                  141.852827
tree_it_per_sec        31.023707
dtype: float64

In [125]:
print('Improvement: {}%'.format((fin['default'] / fin['tree'] * 100).round(2)))

Improvement: 345.58%
