In [1]:
import os
import pandas as pd
import multiprocessing as mp
from glob import glob
import numpy as np

In [2]:
FOLDER_PATH = 'gps_dataset/'
TIME_COL = 'UTC DATETIME'
TIME_GAP_THRESHOLD = 180     # seconds

In [3]:
def count_trips(file_path):
    """count number of trips of a user"""
    df = pd.read_csv(file_path)
    df[TIME_COL] = pd.to_datetime(df[TIME_COL])
    time_diff = df[TIME_COL].diff().dt.total_seconds().fillna(0)
    trip_starts = (time_diff > TIME_GAP_THRESHOLD).sum() + 1
    return trip_starts

In [4]:
ind_df = pd.read_csv('individuals_dataset.csv')
id_with_gps = ind_df[ind_df.GPS_RECORD == True].ID.values

In [5]:
def get_OD_trips(df_path):
    """get OD pairs for all trips of a single user, 
    return one dataframe contains start and time, start and end latitude and longitude, user ID"""
    trips_df = pd.read_csv(df_path)
    if len(trips_df) < 2:
        return None
    else:
        trips_df[TIME_COL] = pd.to_datetime(trips_df[TIME_COL])
        trips_df['time_diff'] = trips_df[TIME_COL].diff().dt.total_seconds().fillna(0)
        start_points = np.array(trips_df[trips_df['time_diff']>TIME_GAP_THRESHOLD].index)
        end_points = start_points -1

        start_points = np.concatenate(([0],start_points))
        end_points = np.concatenate((end_points, [trips_df.index[-1]]))

        origin_df = trips_df.iloc[start_points][[TIME_COL,'LATITUDE','LONGITUDE']].rename(columns={TIME_COL: 'start_time', 'LATITUDE': 'ori_lat', 'LONGITUDE': 'ori_lon'})
        origin_df = origin_df.reset_index(drop=True)

        destination_df = trips_df.iloc[end_points][[TIME_COL,'LATITUDE','LONGITUDE']].rename(columns={TIME_COL: 'end_time', 'LATITUDE': 'dst_lat', 'LONGITUDE': 'dst_lon'})
        destination_df = destination_df.reset_index(drop=True)
        
        od_df = pd.merge(origin_df, destination_df,left_index=True, right_index=True)
        ID = df_path.split('/')[-1].split('.')[0]
        od_df['ID'] = ID
        return od_df

In [6]:
def get_all_OD(valid_ID_list):
    files = [os.path.join(FOLDER_PATH, f) for f in os.listdir(FOLDER_PATH) if f.endswith('.csv')]
    valid_files = [path for path in files if path.split('/')[-1].split('.')[0] in valid_ID_list]
    cores = min(32, mp.cpu_count())

    with mp.Pool(cores) as pool:
        dataframes = pool.map(get_OD_trips, valid_files)

    result_df = pd.concat(dataframes, ignore_index=True)
    return result_df

In [7]:
all_trips_df = get_all_OD(id_with_gps)

In [9]:
all_trips_df.to_csv("processed_trips.csv", index=False)

In [10]:
pd.read_csv("processed_trips.csv")

Unnamed: 0,start_time,ori_lat,ori_lon,end_time,dst_lat,dst_lon,ID
0,2022-12-12 06:55:01,48.607980,2.304657,2022-12-12 06:59:59,48.607980,2.304657,50_1587
1,2022-12-12 07:03:01,48.607980,2.304657,2022-12-12 07:27:00,48.612165,2.304178,50_1587
2,2022-12-12 07:56:01,48.610726,2.306309,2022-12-12 08:24:59,48.631089,2.437251,50_1587
3,2022-12-12 17:10:00,48.630281,2.436163,2022-12-12 17:30:00,48.612855,2.347746,50_1587
4,2022-12-12 17:35:00,48.609469,2.312745,2022-12-12 17:40:00,48.609469,2.312745,50_1587
...,...,...,...,...,...,...,...
81286,2023-03-31 16:06:35,48.945900,2.364043,2023-03-31 16:27:00,48.942793,2.446667,13_3571
81287,2023-03-31 17:30:00,48.941988,2.445574,2023-03-31 17:49:00,48.926344,2.493898,13_3571
81288,2023-04-01 09:31:04,48.927149,2.494991,2023-04-01 09:44:59,48.881830,2.476410,13_3571
81289,2023-04-01 15:16:02,48.927149,2.494991,2023-04-01 15:39:58,48.949349,2.527224,13_3571
