In [1]:
import os
import re
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from folium import plugins

In [4]:
gps_data_dir = 'E:/data/gps'
road_data_dir = 'E:/data/road'

In [5]:
tqdm.pandas(desc='working : ')

# Explore gps data

## find interesting road for use in project

### load GPS data
- **Columns**
  - time_stamp => เป็นวันเวลา
  - unit_id => เป็น id ของรถ
  - lat => เป็นค่า latitude
  - lon => เป็นค่า longitude
  - speed => เป็นค่าความเร็ว หน่วยเป็น km/h
  - unit_type => เป็นชนิดของรถ
    - 1 = รถโดยสารประจำทาง
    - 3 = รถโดยสารไม่ประจำทาง
    - 4 = รถโดยสารส่วนบุคคล
    - 5,8,9  = รถบรรทุกส่วนบุคคล
    - 6,7 = รถบรรทุกไม่ประจำทาง

In [18]:
# df_gps = pd.read_csv(gps_data_dir + '/2019/2019-03/2019-03-01_00.zip')
# df_gps.head()

Unnamed: 0,time_stamp,unit_id,lat,lon,speed,unit_type
0,2019-03-01 00:00:35,0390002000000000000EE085491,13.702763,100.581581,2,7.0
1,2019-03-01 00:00:26,015000500000863835028323652,9.970495,98.642845,0,3.0
2,2019-03-01 00:00:32,0390002000000000000EE107549,19.950153,99.236827,3,8.0
3,2019-03-01 00:00:28,005000600000864507030191018,14.07636,100.52067,0,8.0
4,2019-03-01 00:00:33,025000600000359857080832934,12.51695,99.97837,0,7.0


### load road data
- **Columns**
  - rid => หมายเลขสายถนน
  - rd => เลขถนน
  - km
  - ptype => ชนิดหลัก
  - lat, lon => ข้อมูลพิกัดแบบละเอียด
  - latx, lonx => ข้อมูลพิกัด ทศนิยม 3 ตำแหน่ง
  - angle => องศาของถนน 

In [4]:
# df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
# df_road.head()

Unnamed: 0,rid,rd,km,ptype,lat,lon,latx,lonx,angle
0,1,3003,0.0,100m,14.492856,100.066108,14.493,100.066,180
1,1,3003,0.1,100m,14.493738,100.06629,14.494,100.066,180
2,1,3003,0.2,100m,14.494619,100.066476,14.495,100.066,179
3,1,3003,0.3,100m,14.495501,100.066658,14.496,100.067,179
4,1,3003,0.4,100m,14.496383,100.066839,14.496,100.067,179


### find interesting road

In [26]:
def get_road_dict(df_road):
    road_list = []
    df_road['coor'] = df_road.progress_apply(lambda row: f"{row['latx']:.3f}, {row['lonx']:.3f}", axis='columns')
    for rd in df_road['rd'].unique():
        coors = df_road[df_road['rd'] == rd]['coor'].to_list()
        road_dict = {
            'rd': rd,
            'coor_list': coors,
        }
        road_list.append(road_dict)
    return road_list

def get_road_interest(df_gps, road_list):
    df_gps = df_gps[df_gps['unit_type'].isin([6, 7])].reset_index(drop=True)
    df_gps = df_gps.drop_duplicates(subset=['unit_id']).reset_index(drop=True)
    df_gps['coor'] = df_gps.progress_apply(lambda row: f"{row['lat']:.3f}, {row['lon']:.3f}", axis='columns')
    for idx in range(len(road_list)):
        count_car = len(df_gps[df_gps['coor'].isin(road_list[idx]['coor_list'])])
        road_list[idx]['count_car'] = count_car

    return road_list

def find_road_interest():
    df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
    df_gps = pd.read_csv(gps_data_dir + '/2019/2019-03/2019-03-08_21.zip', compression='zip')

    road_list = get_road_dict(df_road)
    road_list = get_road_interest(df_gps, road_list)

    # sort 'count_car' from highest to lowest.
    road_list.sort(key=sort_count, reverse=True)

    # clear data in memory
    del df_road
    del df_gps

    return road_list

def sort_count(d):
    """
    function get value for sort list
    """
    return d['count_car']

In [27]:
road_list = find_road_interest()

working : 100%|██████████| 1093655/1093655 [00:11<00:00, 93055.49it/s]
working : 100%|██████████| 100106/100106 [00:01<00:00, 94106.82it/s]


In [28]:
# show top 10 road number have trucks
for idx in range(10):
    print(f"{road_list[idx]['rd']} : {road_list[idx]['count_car']}")

1 : 2483
4 : 2424
2 : 2247
9 : 1456
41 : 1185
7 : 1134
3901 : 1049
3902 : 1037
304 : 750
32 : 723


## find support point in data

In [7]:
def clean_df_tmp(df_tmp):
    """
    Function for clean dataframe by cleaning condition.
    """
    df_tmp = df_tmp[df_tmp['unit_type'].isin([6, 7])].reset_index(drop=True)
    return df_tmp

def load_gps_data():
    """
    Function for load and merge dataframes.
    """
    list_years = ['2019']
    list_months = ['03']
    list_days = ['01']
    list_df = []

    for year in list_years:
        for month in list_months:
            # get list filename in the folder path.
            file_names = os.listdir(gps_data_dir + f"/{year}/{year}-{month}/")
            for day in list_days:
                amount_day = 0
                for file_name in file_names:
                    file_name_part = re.split('-|_', file_name)
                    if len(file_name_part) > 2:
                        # select the day we chose from filename.
                        if file_name_part[2] == day:
                            _tmp = pd.read_csv(gps_data_dir + f"/{year}/{year}-{month}/" + file_name, compression='zip', parse_dates=['time_stamp'])
                            _tmp = clean_df_tmp(_tmp)
                            list_df.append(_tmp)
                            amount_day += 1
                            del _tmp

                    # the data collected was split into 8 sessions per day.
                    if amount_day > 7:
                        break
    
    df_gps = pd.concat(list_df, axis='rows', ignore_index=True)
    return df_gps

def find_point():
    df_road = pd.read_csv(road_data_dir + '/roaddb.csv')
    df_gps = load_gps_data()

    del df_road
    del df_gps

In [8]:
find_point()

In [65]:
df_gps

Unnamed: 0,time_stamp,unit_id,lat,lon,speed,unit_type
0,2019-03-01 00:00:35,0390002000000000000EE085491,13.702763,100.581581,2,7.0
1,2019-03-01 00:00:33,025000600000359857080832934,12.516950,99.978370,0,7.0
2,2019-03-01 00:00:57,0390002000000000000EE089999,13.128385,100.901567,0,7.0
3,2019-03-01 00:00:33,0390002000000000000EE000223,14.142565,100.636618,0,6.0
4,2019-03-01 00:00:09,005000800000868998032600856,13.704470,100.291100,0,6.0
...,...,...,...,...,...,...
24006469,2019-03-01 23:59:00,025000200000863835024634110,14.380935,100.954948,0,6.0
24006470,2019-03-01 23:59:09,025000400000864507037937207,17.385570,103.289258,65,7.0
24006471,2019-03-01 23:59:00,025000300000864507032321514,13.089945,100.972131,0,7.0
24006472,2019-03-01 23:59:00,025000300000864507032354671,19.492980,100.270191,0,7.0
