In [2]:
import os
import re
import folium
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [3]:
LAT_DEFINE = 13.741163338261524
LON_DEFINE = 100.64205151814542

ZOOM_START = 10

In [4]:
gps_data_dir = 'E:/data/gps'
road_data_dir = 'E:/data/road'

In [5]:
tqdm.pandas(desc='working : ')

# Explore gps data

## find interesting road for use in project

### load GPS data
- **Columns**
  - time_stamp => เป็นวันเวลา
  - unit_id => เป็น id ของรถ
  - lat => เป็นค่า latitude
  - lon => เป็นค่า longitude
  - speed => เป็นค่าความเร็ว หน่วยเป็น km/h
  - unit_type => เป็นชนิดของรถ
    - 1 = รถโดยสารประจำทาง
    - 3 = รถโดยสารไม่ประจำทาง
    - 4 = รถโดยสารส่วนบุคคล
    - 5,8,9  = รถบรรทุกส่วนบุคคล
    - 6,7 = รถบรรทุกไม่ประจำทาง

In [18]:
# df_gps = pd.read_csv(gps_data_dir + '/2019/2019-03/2019-03-01_00.zip')
# df_gps.head()

Unnamed: 0,time_stamp,unit_id,lat,lon,speed,unit_type
0,2019-03-01 00:00:35,0390002000000000000EE085491,13.702763,100.581581,2,7.0
1,2019-03-01 00:00:26,015000500000863835028323652,9.970495,98.642845,0,3.0
2,2019-03-01 00:00:32,0390002000000000000EE107549,19.950153,99.236827,3,8.0
3,2019-03-01 00:00:28,005000600000864507030191018,14.07636,100.52067,0,8.0
4,2019-03-01 00:00:33,025000600000359857080832934,12.51695,99.97837,0,7.0


### load road data
- **Columns**
  - rid => หมายเลขสายถนน
  - rd => เลขถนน
  - km
  - ptype => ชนิดหลัก
  - lat, lon => ข้อมูลพิกัดแบบละเอียด
  - latx, lonx => ข้อมูลพิกัด ทศนิยม 3 ตำแหน่ง
  - angle => องศาของถนน 

In [4]:
# df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
# df_road.head()

Unnamed: 0,rid,rd,km,ptype,lat,lon,latx,lonx,angle
0,1,3003,0.0,100m,14.492856,100.066108,14.493,100.066,180
1,1,3003,0.1,100m,14.493738,100.06629,14.494,100.066,180
2,1,3003,0.2,100m,14.494619,100.066476,14.495,100.066,179
3,1,3003,0.3,100m,14.495501,100.066658,14.496,100.067,179
4,1,3003,0.4,100m,14.496383,100.066839,14.496,100.067,179


### find interesting road

In [4]:
def get_road_dict(df_road):
    road_list = []
    df_road['coor'] = df_road.progress_apply(lambda row: f"{row['latx']:.3f}, {row['lonx']:.3f}", axis='columns')
    for rd in df_road['rd'].unique():
        coors = df_road[df_road['rd'] == rd]['coor'].to_list()
        road_dict = {
            'rd': rd,
            'coor_list': coors,
        }
        road_list.append(road_dict)
    return road_list

def get_road_interest(df_gps, road_list):
    df_gps = df_gps[df_gps['unit_type'].isin([6, 7])].reset_index(drop=True)
    df_gps = df_gps.drop_duplicates(subset=['unit_id']).reset_index(drop=True)
    df_gps['coor'] = df_gps.progress_apply(lambda row: f"{row['lat']:.3f}, {row['lon']:.3f}", axis='columns')
    for idx in range(len(road_list)):
        count_car = len(df_gps[df_gps['coor'].isin(road_list[idx]['coor_list'])])
        road_list[idx]['count_car'] = count_car

    return road_list

def find_road_interest():
    df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
    df_gps = pd.read_csv(gps_data_dir + '/2019/2019-03/2019-03-08_21.zip', compression='zip')

    try:
        road_list = get_road_dict(df_road)
        road_list = get_road_interest(df_gps, road_list)

        # sort 'count_car' from highest to lowest.
        road_list.sort(key=sort_count, reverse=True)
    except:
        print(f"Error for find road interest")
    finally:
        # clear data in memory
        del df_road
        del df_gps

    return road_list

def sort_count(d):
    """
    function get value for sort list
    """
    return d['count_car']

In [27]:
road_list = find_road_interest()

working : 100%|██████████| 1093655/1093655 [00:11<00:00, 93055.49it/s]
working : 100%|██████████| 100106/100106 [00:01<00:00, 94106.82it/s]


In [28]:
# show top 10 road number have trucks
for idx in range(10):
    print(f"{road_list[idx]['rd']} : {road_list[idx]['count_car']}")

1 : 2483
4 : 2424
2 : 2247
9 : 1456
41 : 1185
7 : 1134
3901 : 1049
3902 : 1037
304 : 750
32 : 723


## find support point in data

In [7]:
def clean_df_tmp(df_tmp):
    """
    Function for clean dataframe by cleaning condition.
    """
    unit_type = df_tmp['unit_type'].isin([6, 7])
    speed = df_tmp['speed'] == 0

    df_tmp = df_tmp[unit_type & speed].reset_index(drop=True)
    return df_tmp

def load_gps_data():
    """
    Function for load and merge dataframes.
    """
    years_list = ['2019']
    months_list = ['03']
    days_list = ['01']
    df_list = []

    for year in years_list:
        for month in months_list:
            # get list filename in the folder path.
            file_names = os.listdir(gps_data_dir + f"/{year}/{year}-{month}/")
            for day in days_list:
                amount_day = 0
                for file_name in file_names:
                    file_name_part = re.split('-|_', file_name)
                    if len(file_name_part) > 2:
                        # select the day we chose from filename.
                        if file_name_part[2] == day:
                            _tmp = pd.read_csv(gps_data_dir + f"/{year}/{year}-{month}/" + file_name, compression='zip', parse_dates=['time_stamp'])
                            _tmp = clean_df_tmp(_tmp)
                            df_list.append(_tmp)
                            amount_day += 1
                            del _tmp

                    # the data collected was split into 8 sessions per day.
                    if amount_day > 7:
                        break
    
    df_gps = pd.concat(df_list, axis='rows', ignore_index=True)
    return df_gps

def get_road_dict_select(df_road):
    road_list_select = []
    df_road['coor'] = df_road.progress_apply(lambda row: f"{row['latx']:.3f}, {row['lonx']:.3f}", axis='columns')
    for rd in [1, 2, 4, 7, 9, 32, 41, 304, 3901, 3902]:
        coors = df_road[df_road['rd'] == rd]['coor'].to_list()
        road_dict_select = {
            'rd': rd,
            'coor_list': coors,
        }
        road_list_select.append(road_dict_select)
    return road_list_select

def get_point_amount(df_gps, road_list_select):
    point_list = []
    df_gps['coor'] = df_gps.progress_apply(lambda row: f"{row['lat']:.3f}, {row['lon']:.3f}", axis='columns')
    for idx in tqdm(range(len(road_list_select))):
        _tmp = df_gps[df_gps['coor'].isin(road_list_select[idx]['coor_list'])].reset_index(drop=True)
        for coor in road_list_select[idx]['coor_list']:
            _vehicle = _tmp[_tmp['coor'] == coor].reset_index(drop=True)
            _vehicle = _vehicle.drop_duplicates(subset=['unit_id']).reset_index(drop=True)
            point_dict = {
                'rd': road_list_select[idx]['rd'],
                'coor': coor,
                'amount': len(_vehicle)
            }
            point_list.append(point_dict)
            del _vehicle
        del _tmp
    return point_list
        

def find_point():
    df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
    df_gps = load_gps_data()

    try:
        road_list_select = get_road_dict_select(df_road)
        point_list = get_point_amount(df_gps, road_list_select)
        df_point = pd.DataFrame(point_list)
    except:
        print(f"Error for find point")
    finally:
        # clear data in memory
        del df_road
        del df_gps
        del road_list_select
        del point_list

    return df_point

In [12]:
def plot_gps(lat, lon, popup, tooltip, mean, amount):
    if popup is None:
        popup = ['Some detail'] * len(lat)

    map = folium.Map(location=[LAT_DEFINE, LON_DEFINE], zoom_start=ZOOM_START)
    for _lat, _lon, _popup, _tooltip, _amount in zip(lat, lon, popup, tooltip, amount):
        if _amount > mean:
            folium.CircleMarker(
                location = [_lat, _lon],
                radius = 6,
                popup = _popup,
                color = 'blue',
                fill = True,
                fill_color = 'blue',
                fill_opacity = 0.7,
                tooltip = _tooltip
            ).add_to(map)

    display(map)

def visualize_point(df_point):
    mean_list = df_point.groupby(['rd']).mean()['amount'].to_list()
    idx = 0

    for rd in [1, 2, 4, 7, 9, 32, 41, 304, 3901, 3902]:
        _tmp = df_point[df_point['rd'] == rd].reset_index(drop=True)

        lat = _tmp['coor'].apply(lambda row: float(row.split(',')[0])).to_list()
        lon = _tmp['coor'].apply(lambda row: float(row.split(',')[1])).to_list()
        popup = _tmp['rd'].to_list()
        amount = _tmp['amount'].to_list()
        tooltip = _tmp['amount'].apply(lambda row: f"amount: {row}").to_list()
        plot_gps(lat, lon, popup, tooltip, mean_list[idx], amount)
        idx += 1
    

In [8]:
df_point = find_point()

working : 100%|██████████| 1093655/1093655 [00:11<00:00, 92107.01it/s]
working : 100%|██████████| 17196985/17196985 [03:51<00:00, 74124.96it/s]
100%|██████████| 10/10 [13:55<00:00, 83.56s/it]


In [10]:
df_point[df_point['amount'] > 0]

Unnamed: 0,rd,coor,amount
0,1,"14.168, 100.618",15
1,1,"14.169, 100.618",6
2,1,"14.169, 100.618",6
3,1,"14.170, 100.618",2
4,1,"14.171, 100.618",2
...,...,...,...
46336,3902,"13.801, 100.681",1
46337,3902,"13.800, 100.681",2
46338,3902,"13.799, 100.681",4
46340,3902,"13.797, 100.682",1


In [56]:
df_point.groupby(['rd']).mean()

Unnamed: 0_level_0,amount
rd,Unnamed: 1_level_1
1,49.893519
2,79.626994
4,33.984465
7,163.693315
9,194.241942
32,111.243168
41,62.100673
304,44.715179
3901,197.695183
3902,198.516584


In [None]:
# visualize_point(point_list)

## DBscan