Preprocessing to extract trips from GNSS records

In [1]:
import os
import pandas as pd
import multiprocessing as mp
from glob import glob
import numpy as np

In [2]:
# Folder scanned for per-user GPS files; each `<ID>.csv` file name stem is
# matched against the list of valid user IDs.
FOLDER_PATH = 'gps_dataset/'
# Name of the timestamp column in every GPS file (parsed with pd.to_datetime).
TIME_COL = 'UTC DATETIME'
# A gap between two consecutive records strictly greater than this value
# marks the start of a new trip.
TIME_GAP_THRESHOLD = 180     # seconds

In [3]:
def count_trips(file_path):
    """Count the number of trips recorded in one user's GPS file.

    Consecutive records separated by more than ``TIME_GAP_THRESHOLD`` seconds
    are considered to belong to different trips, so the trip count is the
    number of such gaps plus one.

    Parameters
    ----------
    file_path : str
        Path to a CSV file containing a ``TIME_COL`` timestamp column.

    Returns
    -------
    int
        Number of trips. Files with fewer than two records contain no trip
        and yield 0, matching ``get_OD_trips``, which produces no OD pair for
        such files.
    """
    df = pd.read_csv(file_path)
    # Without this guard an empty file would be reported as one trip
    # (0 gaps + 1), which is wrong: there is nothing to travel between.
    if len(df) < 2:
        return 0

    timestamps = pd.to_datetime(df[TIME_COL])
    # The first record has no predecessor; its NaN gap is treated as 0 s so
    # it never counts as a trip boundary.
    time_diff = timestamps.diff().dt.total_seconds().fillna(0)
    n_gaps = int((time_diff > TIME_GAP_THRESHOLD).sum())
    return n_gaps + 1

In [4]:
# Load the per-individual table and keep the IDs of the rows whose
# GPS_RECORD flag equals True.
ind_df = pd.read_csv('individuals_dataset.csv')
id_with_gps = ind_df.loc[ind_df['GPS_RECORD'] == True, 'ID'].values

In [5]:
def get_OD_trips(df_path):
    """Build the origin-destination (OD) table for a single user's GPS file.

    The records are split into trips wherever the time elapsed since the
    previous record exceeds ``TIME_GAP_THRESHOLD`` seconds. For each trip the
    first record supplies the origin and the last record the destination.

    Parameters
    ----------
    df_path : str
        Path to the user's CSV file. The user ID is taken from the file name:
        the text after the last ``/`` and before the first ``.``.

    Returns
    -------
    pandas.DataFrame or None
        One row per trip with the columns ``start_time``, ``ori_lat``,
        ``ori_lon``, ``end_time``, ``dst_lat``, ``dst_lon`` and ``ID``.
        ``None`` when the file holds fewer than two records.
    """
    records = pd.read_csv(df_path)
    if len(records) < 2:
        return None

    timestamps = pd.to_datetime(records[TIME_COL])
    records[TIME_COL] = timestamps
    # Seconds since the previous record; the first record has none, so use 0.
    gap_seconds = timestamps.diff().dt.total_seconds().fillna(0)
    records['time_diff'] = gap_seconds

    # Rows that open a new trip because the preceding gap is too long.
    is_break = (gap_seconds > TIME_GAP_THRESHOLD).to_numpy()
    break_rows = np.asarray(records.index[is_break])

    # Every trip starts at row 0 or at a break row, and ends on the row just
    # before the next break (or on the very last row of the file).
    first_rows = np.concatenate(([0], break_rows))
    last_rows = np.concatenate((break_rows - 1, [records.index[-1]]))

    point_cols = [TIME_COL, 'LATITUDE', 'LONGITUDE']

    def _endpoints(rows, new_names):
        """Pick the given rows and relabel time/lat/lon with `new_names`."""
        picked = records.iloc[rows][point_cols]
        picked = picked.rename(columns=dict(zip(point_cols, new_names)))
        return picked.reset_index(drop=True)

    origins = _endpoints(first_rows, ['start_time', 'ori_lat', 'ori_lon'])
    destinations = _endpoints(last_rows, ['end_time', 'dst_lat', 'dst_lon'])

    # Both frames are indexed 0..n_trips-1, so an index join pairs each
    # origin with the destination of the same trip.
    od_pairs = origins.merge(destinations, left_index=True, right_index=True)
    od_pairs['ID'] = df_path.split('/')[-1].split('.')[0]
    return od_pairs

In [6]:
def get_all_OD(valid_ID_list):
    """Extract the OD trips of every valid user and stack them in one table.

    Every ``*.csv`` file in ``FOLDER_PATH`` whose name stem (the text before
    the first ``.``) is one of the valid IDs is processed by ``get_OD_trips``
    in a multiprocessing pool; the per-user results are then concatenated.

    Parameters
    ----------
    valid_ID_list : iterable
        IDs of the users to process. The IDs are compared as strings, so
        integer-like IDs (e.g. a numpy integer array) match file names such
        as ``123.csv``.

    Returns
    -------
    pandas.DataFrame
        All trips of all selected users with a fresh 0..n-1 index. When no
        file matches, or no file yields a trip, a warning is emitted and an
        empty frame with the OD columns is returned.
    """
    import warnings

    od_columns = ['start_time', 'ori_lat', 'ori_lon',
                  'end_time', 'dst_lat', 'dst_lon', 'ID']

    # File stems are strings, so normalise the IDs to strings as well;
    # testing a str against a numeric numpy array never matches. A set also
    # makes each lookup O(1).
    valid_ids = {str(user_id) for user_id in valid_ID_list}

    # os.listdir order is arbitrary; sort so the output row order is stable.
    valid_files = [
        os.path.join(FOLDER_PATH, file_name)
        for file_name in sorted(os.listdir(FOLDER_PATH))
        if file_name.endswith('.csv') and file_name.split('.')[0] in valid_ids
    ]
    if not valid_files:
        warnings.warn(
            f"No GPS file in {FOLDER_PATH!r} matches the given IDs.",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.DataFrame(columns=od_columns)

    # Never start more workers than there are files to process.
    cores = min(32, mp.cpu_count(), len(valid_files))

    with mp.Pool(cores) as pool:
        dataframes = pool.map(get_OD_trips, valid_files)

    # get_OD_trips returns None for files with fewer than two records;
    # pd.concat raises if every entry is None, so filter them out explicitly.
    dataframes = [od_df for od_df in dataframes if od_df is not None]
    if not dataframes:
        warnings.warn(
            "None of the matching GPS files contains a trip.",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.DataFrame(columns=od_columns)

    result_df = pd.concat(dataframes, ignore_index=True)
    return result_df

In [7]:
# Extract the OD trips of every individual flagged as having GPS records;
# get_all_OD distributes the per-user files over a multiprocessing pool.
all_trips_df = get_all_OD(id_with_gps)

In [9]:
# Save the combined OD table as CSV without writing the row index.
all_trips_df.to_csv("processed_trips.csv", index=False)