In [7]:
import pandas as pd
import glob
import os
import h3

# Data Processing

labels:
| Label ID | Transportation Mode |
|----------|---------------------|
| **1**    | Walk               |
| **2**    | Bike               |
| **3**    | Bus                |
| **4**    | Car                |
| **5**    | Subway             |
| **6**    | Train              |
| **7**    | Airplane           |
| **8**    | Boat               |
| **9**    | Run                |
| **10**   | Motorcycle         |
| **11**   | Taxi               |

In [8]:


mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)}
def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None)

    # Converting the timestamp columns (5 and 6) into a single datetime column
    points['time'] = pd.to_datetime(points[5] + ' ' + points[6], format='%Y-%m-%d %H:%M:%S') 
    points.drop(inplace=True, columns=[5, 6])
    # Renaming the columns for clarity
    points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4])

    return points


def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None, sep='\s+')

    # Convert the timestamp columns into datetime format
    labels['start_time'] = pd.to_datetime(labels[0] + ' ' + labels[1], format='%Y/%m/%d %H:%M:%S')
    labels['end_time'] = pd.to_datetime(labels[2] + ' ' + labels[3], format='%Y/%m/%d %H:%M:%S')


    # Rename columns
    labels.rename(columns={4: 'label'}, inplace=True)

    # Map text labels to integer encodings
    labels['label'] = labels['label'].map(mode_ids)

    # Drop original time columns
    labels.drop(columns=[0, 1, 2, 3], inplace=True)

    return labels

def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1
    no_label = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points.loc[no_label, 'label'] = 0 

def read_user(user_folder):
    labels = None

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    # Already labeled data  is separated into a different file but just in case
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)
    else:
        df['label'] = 0

    return df

def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder,sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

  labels = pd.read_csv(labels_file, skiprows=1, header=None, sep='\s+')


In [None]:
df = read_all_users(os.getenv('LABELED_DATA_PATH'))

In [None]:
df.head(10)

Unnamed: 0,lat,lon,alt,time,label,user
0,39.921712,116.472343,13.0,2007-08-04 03:30:32,0,0
1,39.921705,116.472343,13.0,2007-08-04 03:30:33,0,0
2,39.921695,116.472345,13.0,2007-08-04 03:30:34,0,0
3,39.921683,116.472342,13.0,2007-08-04 03:30:35,0,0
4,39.921672,116.472342,13.0,2007-08-04 03:30:36,0,0
5,39.921583,116.472315,13.0,2007-08-04 03:30:38,0,0
6,39.921572,116.472315,13.0,2007-08-04 03:30:39,0,0
7,39.92156,116.47229,13.0,2007-08-04 03:30:40,0,0
8,39.921565,116.47229,13.0,2007-08-04 03:30:41,0,0
9,39.92157,116.472288,13.0,2007-08-04 03:30:42,0,0


In [None]:
len(df) - (df['label'] == 0).sum() # Checking how many labeled data points will remain if I drop the unlabeled ones

np.int64(5427117)

In [None]:
df.index[df['label'] == 0].tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
df = df[df['label'] != 0]

In [None]:
df.head(10)

Unnamed: 0,lat,lon,alt,time,label,user
1,39.894178,116.3182,-777.0,2008-03-28 14:54:40,6,0
2,39.894505,116.321132,-777.0,2008-03-28 14:55:14,6,0
3,39.894953,116.326452,-777.0,2008-03-28 14:56:13,6,0
4,39.8946,116.332542,-777.0,2008-03-28 14:57:12,6,0
5,39.889622,116.33704,-777.0,2008-03-28 14:58:11,6,0
6,39.88209,116.338353,-777.0,2008-03-28 14:59:10,6,0
7,39.873873,116.338455,-777.0,2008-03-28 15:00:10,6,0
8,39.865182,116.338058,-777.0,2008-03-28 15:01:09,6,0
9,39.855403,116.33771,-777.0,2008-03-28 15:02:08,6,0
10,39.844532,116.334362,-777.0,2008-03-28 15:03:07,6,0


In [None]:
# Convert the time column to datetime format
df['time'] = pd.to_datetime(df['time'])

# Calculate time difference between rows
df['time_diff'] = df['time'].diff().dt.total_seconds()
df.loc[df.index[0], 'time_diff'] = 0

In [None]:
df.head(10)

Unnamed: 0,lat,lon,alt,time,label,user,time_diff
1,39.894178,116.3182,-777.0,2008-03-28 14:54:40,6,0,0.0
2,39.894505,116.321132,-777.0,2008-03-28 14:55:14,6,0,34.0
3,39.894953,116.326452,-777.0,2008-03-28 14:56:13,6,0,59.0
4,39.8946,116.332542,-777.0,2008-03-28 14:57:12,6,0,59.0
5,39.889622,116.33704,-777.0,2008-03-28 14:58:11,6,0,59.0
6,39.88209,116.338353,-777.0,2008-03-28 14:59:10,6,0,59.0
7,39.873873,116.338455,-777.0,2008-03-28 15:00:10,6,0,60.0
8,39.865182,116.338058,-777.0,2008-03-28 15:01:09,6,0,59.0
9,39.855403,116.33771,-777.0,2008-03-28 15:02:08,6,0,59.0
10,39.844532,116.334362,-777.0,2008-03-28 15:03:07,6,0,59.0


In [None]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
file_path = os.path.join(processed_data_path,"processed_data.csv")
df.to_csv(file_path,index=False)

## Convert the coordinates to Trajectories

In [None]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
file_path = os.path.join(processed_data_path,"processed_data.csv")
df_traj = pd.read_csv(file_path)

In [None]:
df_traj["trajectory_break"] = (df_traj["user"].ne(df["user"].shift())) | (df_traj["label"].ne(df_traj["label"].shift())) # checking if the current row is not equal to the prev one (False : 0 , True : 1)

df_traj["trajectory_id"] = df_traj["trajectory_break"].cumsum() # cummulative sum to set the traj ID

# Reset time_diff for the first row of each new trajectory
df_traj.loc[df_traj["trajectory_break"], "time_diff"] = 0

# Drop unnecessary column
df_traj.drop(columns=["trajectory_break"], inplace=True)
# converting lat long to a cell. used to be geo_to_h3, but according to the new naming scheme of h3 it was changed (https://h3geo.org/docs/api/indexing/#latlngtocell)
df_traj["h3_index"] = df.apply(lambda row: h3.latlng_to_cell(row["lat"], row["lon"], 9), axis=1)


In [None]:
df_traj.head()

Unnamed: 0,lat,lon,alt,time,label,user,time_diff,trajectory_id,h3_index
0,39.894178,116.3182,-777.0,2008-03-28 14:54:40,6,0,0.0,1,8931aa43117ffff
1,39.894505,116.321132,-777.0,2008-03-28 14:55:14,6,0,34.0,1,8931aa431abffff
2,39.894953,116.326452,-777.0,2008-03-28 14:56:13,6,0,59.0,1,8931aa404dbffff
3,39.8946,116.332542,-777.0,2008-03-28 14:57:12,6,0,59.0,1,8931aa404c7ffff
4,39.889622,116.33704,-777.0,2008-03-28 14:58:11,6,0,59.0,1,8931aa4040bffff


In [None]:
df_traj.tail()

Unnamed: 0,lat,lon,alt,time,label,user,time_diff,trajectory_id,h3_index
5427112,39.95458,116.452703,721.8,2008-12-15 04:12:59,1,9,2.0,5977,8931aa55d5bffff
5427113,39.954585,116.452703,721.8,2008-12-15 04:13:01,1,9,2.0,5977,8931aa55d5bffff
5427114,39.95459,116.452692,721.8,2008-12-15 04:13:03,1,9,2.0,5977,8931aa55d5bffff
5427115,39.954597,116.452643,721.8,2008-12-15 04:13:05,1,9,2.0,5977,8931aa55d5bffff
5427116,39.954563,116.452622,721.8,2008-12-15 04:13:11,1,9,6.0,5977,8931aa55d5bffff


In [None]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
traj_file_path = os.path.join(processed_data_path,"separate_traj_data.csv")
df_traj.to_csv(traj_file_path,index=False)

NameError: name 'df_traj' is not defined

## Tokenize Trajectories

In [None]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
traj_file_path = os.path.join(processed_data_path,"separate_traj_data.csv")
tokenized_trajectories_df = pd.read_csv(traj_file_path)

In [None]:
# Grouped each trajectory in one row together.
tokenized_trajectories_df = tokenized_trajectories_df.groupby("trajectory_id").agg({
    "h3_index": list,
    "time_diff": list,
    "label": "first"
}).reset_index()

tokenized_trajectories_df.rename(columns={"h3_index": "traj", "time_diff": "time_diffs","label":"label"}, inplace=True)

In [None]:
tokenized_trajectories_df.head(10)

Unnamed: 0,trajectory_id,traj,time_diffs,label
0,1,"[8931aa43117ffff, 8931aa431abffff, 8931aa404db...","[0.0, 34.0, 59.0, 59.0, 59.0, 59.0, 60.0, 59.0...",6
1,2,"[8920c91701bffff, 8920c917053ffff, 8920c917623...","[0.0, 0.0, 60.0, 59.0, 59.0, 59.0, 59.0, 59.0,...",11
2,3,"[8920c917017ffff, 8920c917017ffff, 8920c91708f...","[0.0, 59.0, 60.0, 59.0, 59.0, 59.0, 59.0, 60.0...",6
3,4,"[8920c0cecc7ffff, 8920c0cecdbffff, 8920c0c5273...","[0.0, 59.0, 60.0, 59.0, 59.0, 59.0, 59.0, 60.0...",11
4,5,"[8920c0c5677ffff, 8920c0c5677ffff, 8920c0c5677...","[0.0, 60.0, 57.0, 2.0, 59.0, 61.0, 61.0, 57.0]",1
5,6,"[8920c0c562bffff, 8920c0c5627ffff, 8920c0c5287...","[0.0, 59.0, 59.0, 59.0, 59.0, 60.0, 59.0, 59.0]",11
6,7,"[8920c0ceccfffff, 8920c0ce137ffff, 8920c0ce10f...","[0.0, 59.0, 59.0, 60.0, 59.0, 59.0, 59.0, 59.0...",6
7,8,"[8920f251e27ffff, 8920f251e33ffff, 8920f251e0f...","[0.0, 56.0, 59.0, 59.0, 59.0, 60.0]",11
8,9,"[8920f251347ffff, 8920f251347ffff, 8920f251347...","[0.0, 61.0, 59.0, 59.0, 59.0, 66891.0, 0.0, 1....",1
9,10,"[8920f251227ffff, 8920f251227ffff, 8920f251227...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",11


In [None]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
traj_file_path = os.path.join(processed_data_path,"tokenized_trajectories.csv")
tokenized_trajectories_df.to_csv(traj_file_path,index=False)

Will save trajectories of cars and walking in a separate file to reduce it's size for uploading it on colab for training (the number of car to walk is too small so I will add taxi as car)

In [10]:
processed_data_path = os.getenv('PROCESSED_DATA_PATH')
traj_file_path = os.path.join(processed_data_path,"tokenized_trajectories.csv")

In [11]:
car_taxi_walk_df = pd.read_csv(traj_file_path)
car_taxi_walk_df = car_taxi_walk_df[car_taxi_walk_df["label"].isin([1, 4, 11])]

In [12]:
len(car_taxi_walk_df[car_taxi_walk_df["label"].isin([4,11])])

761

In [13]:
car_taxi_walk_df["label"] = car_taxi_walk_df["label"].replace({4: 0, 11: 0})

In [14]:
car_taxi_walk_df.head()

Unnamed: 0,trajectory_id,traj,time_diffs,label
1,2,"['8920c91701bffff', '8920c917053ffff', '8920c9...","[0.0, 0.0, 60.0, 59.0, 59.0, 59.0, 59.0, 59.0,...",0
3,4,"['8920c0cecc7ffff', '8920c0cecdbffff', '8920c0...","[0.0, 59.0, 60.0, 59.0, 59.0, 59.0, 59.0, 60.0...",0
4,5,"['8920c0c5677ffff', '8920c0c5677ffff', '8920c0...","[0.0, 60.0, 57.0, 2.0, 59.0, 61.0, 61.0, 57.0]",1
5,6,"['8920c0c562bffff', '8920c0c5627ffff', '8920c0...","[0.0, 59.0, 59.0, 59.0, 59.0, 60.0, 59.0, 59.0]",0
7,8,"['8920f251e27ffff', '8920f251e33ffff', '8920f2...","[0.0, 56.0, 59.0, 59.0, 59.0, 60.0]",0


In [15]:
car_taxi_walk_df["label"].value_counts()

label
1    2584
0     761
Name: count, dtype: int64

In [16]:
# Example of downsampling the majority class:
label_1_df = car_taxi_walk_df[car_taxi_walk_df["label"] == 1]
label_0_df = car_taxi_walk_df[car_taxi_walk_df["label"] == 0]

# Downsample label 1 to the same count as label 0:
label_1_downsampled = label_1_df.sample(n=len(label_0_df), random_state=42)
car_taxi_balanced_df = pd.concat([label_1_downsampled, label_0_df])


In [17]:
car_taxi_balanced_df["label"].value_counts()

label
1    761
0    761
Name: count, dtype: int64

In [18]:
car_taxi_balanced_df.to_csv(os.path.join(processed_data_path,"balanced_car_taxi_walk_tokenized_trajectories.csv"),index=False)

In [None]:
car_walk_df = pd.read_csv(traj_file_path)
car_walk_df = car_walk_df[car_walk_df["label"].isin([1, 4])]
car_walk_df["label"] = car_walk_df["label"].replace({4: 0})

In [None]:
car_walk_df["label"].value_counts()

label
1    2584
0     291
Name: count, dtype: int64

In [None]:
# Example of downsampling the majority class:
label_1_df = car_walk_df[car_walk_df["label"] == 1]
label_0_df = car_walk_df[car_walk_df["label"] == 0]

# Downsample label 1 to the same count as label 0:
label_1_downsampled = label_1_df.sample(n=len(label_0_df), random_state=42)
balanced_df = pd.concat([label_1_downsampled, label_0_df])


In [None]:
balanced_df["label"].value_counts()

label
1    291
0    291
Name: count, dtype: int64

In [None]:
balanced_df.to_csv(os.path.join(processed_data_path,"balanced_car_walk_tokenized_traj.csv"),index=False)
car_walk_df.to_csv(os.path.join(processed_data_path,"car_walk_tokenized_traj.csv"),index=False)