In [21]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler


In [22]:
ROOT = r"D:\Randome shit\TourX\ml-services\Geolife Trajectories 1.3\Data"

def load_all_geolife(root_path):
    all_data = []

    for user in os.listdir(root_path):
        user_path = os.path.join(root_path, user, "Trajectory")

        if os.path.exists(user_path):

            for file in os.listdir(user_path):
                if file.endswith(".plt"):
                    file_path = os.path.join(user_path, file)

                    df = pd.read_csv(
                        file_path,
                        skiprows=6,
                        header=None,
                        names=["lat","lon","unused","alt","date","date2","time"]
                    )

                    df["user_id"] = user
                    df["trajectory_id"] = file

                    all_data.append(df)

    return pd.concat(all_data, ignore_index=True)

df = load_all_geolife(ROOT)

print("Loaded shape:", df.shape)
df.head()


KeyboardInterrupt: 

In [None]:
# Convert Excel serial to numeric
df["date"] = pd.to_numeric(df["date"], errors="coerce")

# Convert to timestamp
df["timestamp"] = pd.to_datetime(
    df["date"],
    unit="D",
    origin="1899-12-30"
)

# Clean precision noise
df["timestamp"] = df["timestamp"].dt.round("1s")

# Drop bad rows
df = df.dropna(subset=["timestamp"])

# Sort correctly
df = df.sort_values(["user_id", "trajectory_id", "timestamp"])

print("Timestamp fixed")
df.head()


Timestamp fixed


Unnamed: 0,lat,lon,unused,alt,date,date2,time,user_id,trajectory_id,timestamp
0,39.984702,116.318417,0,492.0,39744.120185,2008-10-23,02:53:04,0,20081023025304.plt,2008-10-23 02:53:04
1,39.984683,116.31845,0,492.0,39744.120255,2008-10-23,02:53:10,0,20081023025304.plt,2008-10-23 02:53:10
2,39.984686,116.318417,0,492.0,39744.120313,2008-10-23,02:53:15,0,20081023025304.plt,2008-10-23 02:53:15
3,39.984688,116.318385,0,492.0,39744.12037,2008-10-23,02:53:20,0,20081023025304.plt,2008-10-23 02:53:20
4,39.984655,116.318263,0,492.0,39744.120428,2008-10-23,02:53:25,0,20081023025304.plt,2008-10-23 02:53:25


In [None]:
df["time_gap"] = (
    df.groupby(["user_id","trajectory_id"])["timestamp"]
      .diff()
      .dt.total_seconds()
)


In [None]:
df["prev_lat"] = df.groupby(["user_id","trajectory_id"])["lat"].shift()
df["prev_lon"] = df.groupby(["user_id","trajectory_id"])["lon"].shift()


In [None]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi/2)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2

    return 2 * R * np.arcsin(np.sqrt(a))

df["distance_delta"] = haversine(
    df["prev_lat"], df["prev_lon"],
    df["lat"], df["lon"]
)


In [None]:
df["speed"] = df["distance_delta"] / df["time_gap"]


In [None]:
df["prev_speed"] = df.groupby(["user_id","trajectory_id"])["speed"].shift()

df["acceleration"] = (
    (df["speed"] - df["prev_speed"]) / df["time_gap"]
)


In [None]:
def compute_bearing(lat1, lon1, lat2, lon2):
    lat1 = np.radians(lat1)
    lat2 = np.radians(lat2)
    delta_lon = np.radians(lon2 - lon1)

    x = np.sin(delta_lon) * np.cos(lat2)
    y = (
        np.cos(lat1) * np.sin(lat2)
        - np.sin(lat1) * np.cos(lat2) * np.cos(delta_lon)
    )

    bearing = np.degrees(np.arctan2(x, y))
    bearing = (bearing + 360) % 360
    return bearing

df["bearing"] = compute_bearing(
    df["prev_lat"],
    df["prev_lon"],
    df["lat"],
    df["lon"]
)


In [None]:
df["prev_bearing"] = df.groupby(
    ["user_id","trajectory_id"]
)["bearing"].shift()

df["direction_change"] = abs(df["bearing"] - df["prev_bearing"])

df["direction_change"] = np.minimum(
    df["direction_change"],
    360 - df["direction_change"]
)


In [None]:
df = df.dropna()

print("After cleaning:", df.shape)


After cleaning: (24213481, 20)


In [23]:
df.head()

Unnamed: 0,lat,lon,unused,alt,date,date2,time,user_id,trajectory_id,timestamp,time_gap,prev_lat,prev_lon,distance_delta,speed,prev_speed,acceleration,bearing,prev_bearing,direction_change
2,39.984686,116.318417,0,492.0,39744.120313,2008-10-23,02:53:15,0,20081023025304.plt,2008-10-23 02:53:15,5.0,39.984683,116.31845,2.831299,0.56626,0.586148,-0.003978,276.766339,126.922277,149.844061
3,39.984688,116.318385,0,492.0,39744.12037,2008-10-23,02:53:20,0,20081023025304.plt,2008-10-23 02:53:20,5.0,39.984686,116.318417,2.735434,0.547087,0.56626,-0.003835,274.663284,276.766339,2.103055
4,39.984655,116.318263,0,492.0,39744.120428,2008-10-23,02:53:25,0,20081023025304.plt,2008-10-23 02:53:25,5.0,39.984688,116.318385,11.023008,2.204602,0.547087,0.331503,250.555849,274.663284,24.107435
5,39.984611,116.318026,0,493.0,39744.120486,2008-10-23,02:53:30,0,20081023025304.plt,2008-10-23 02:53:30,5.0,39.984655,116.318263,20.776544,4.155309,2.204602,0.390141,256.379828,250.555849,5.823979
6,39.984608,116.317761,0,493.0,39744.120544,2008-10-23,02:53:35,0,20081023025304.plt,2008-10-23 02:53:35,5.0,39.984611,116.318026,22.580319,4.516064,4.155309,0.072151,269.153609,256.379828,12.773781


In [None]:
import os

BASE_PATH = r"D:\Randome shit\TourX\ml-services"

csv_path = os.path.join(BASE_PATH, "geolife_processed.csv")

print("Saving CSV...")
df.to_csv(csv_path, index=False)


print("All files saved successfully!")


Saving CSV...
All files saved successfully!


In [26]:
# import os
# import pyarrow
# BASE_PATH = r"D:\Randome shit\TourX\ml-services"
# parquet_path = os.path.join(BASE_PATH, "geolife_processed.parquet")

# # df.to_parqueta
#     parquet_path,
#     engine="pyarrow",
#     compression="snappy",
#     index=False
# )

# print("Saved in fastest & smallest format (Parquet + Snappy)")


In [27]:
features = [
    "distance_delta",
    "speed",
    "acceleration",
    "time_gap",
    "direction_change"
]

df_model = df[["user_id", "trajectory_id"] + features].copy()

df_model.head()


Unnamed: 0,user_id,trajectory_id,distance_delta,speed,acceleration,time_gap,direction_change
2,0,20081023025304.plt,2.831299,0.56626,-0.003978,5.0,149.844061
3,0,20081023025304.plt,2.735434,0.547087,-0.003835,5.0,2.103055
4,0,20081023025304.plt,11.023008,2.204602,0.331503,5.0,24.107435
5,0,20081023025304.plt,20.776544,4.155309,0.390141,5.0,5.823979
6,0,20081023025304.plt,22.580319,4.516064,0.072151,5.0,12.773781


In [29]:
df = df[df["time_gap"] > 0]


In [30]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)


Unnamed: 0,lat,lon,unused,alt,date,date2,time,user_id,trajectory_id,timestamp,time_gap,prev_lat,prev_lon,distance_delta,speed,prev_speed,acceleration,bearing,prev_bearing,direction_change
2,39.984686,116.318417,0,492.000000,39744.120313,2008-10-23,02:53:15,000,20081023025304.plt,2008-10-23 02:53:15,5.0,39.984683,116.318450,2.831299,0.566260,0.586148,-0.003978,276.766339,126.922277,149.844061
3,39.984688,116.318385,0,492.000000,39744.120370,2008-10-23,02:53:20,000,20081023025304.plt,2008-10-23 02:53:20,5.0,39.984686,116.318417,2.735434,0.547087,0.566260,-0.003835,274.663284,276.766339,2.103055
4,39.984655,116.318263,0,492.000000,39744.120428,2008-10-23,02:53:25,000,20081023025304.plt,2008-10-23 02:53:25,5.0,39.984688,116.318385,11.023008,2.204602,0.547087,0.331503,250.555849,274.663284,24.107435
5,39.984611,116.318026,0,493.000000,39744.120486,2008-10-23,02:53:30,000,20081023025304.plt,2008-10-23 02:53:30,5.0,39.984655,116.318263,20.776544,4.155309,2.204602,0.390141,256.379828,250.555849,5.823979
6,39.984608,116.317761,0,493.000000,39744.120544,2008-10-23,02:53:35,000,20081023025304.plt,2008-10-23 02:53:35,5.0,39.984611,116.318026,22.580319,4.516064,4.155309,0.072151,269.153609,256.379828,12.773781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24876973,40.914867,111.710500,0,3802.493438,39521.152731,2008-03-14,03:39:56,181,20080314025755.plt,2008-03-14 03:39:56,112.0,40.915433,111.711450,101.698458,0.908022,0.849449,0.000523,231.714749,234.423756,2.709007
24876974,40.914267,111.710333,0,3795.931759,39521.153669,2008-03-14,03:41:17,181,20080314025755.plt,2008-03-14 03:41:17,81.0,40.914867,111.710500,68.171002,0.841617,0.908022,-0.000820,191.855062,231.714749,39.859687
24876975,40.912467,111.710667,0,3795.931759,39521.154884,2008-03-14,03:43:02,181,20080314025755.plt,2008-03-14 03:43:02,105.0,40.914267,111.710333,202.101290,1.924774,0.841617,0.010316,172.033395,191.855062,19.821667
24876976,40.911517,111.711317,0,3779.527559,39521.155185,2008-03-14,03:43:28,181,20080314025755.plt,2008-03-14 03:43:28,26.0,40.912467,111.710667,118.921029,4.573886,1.924774,0.101889,152.657698,172.033395,19.375697


In [31]:
df = df.dropna()


In [32]:
print("Any inf left:", np.isinf(df[features]).sum().sum())
print("Any NaN left:", df[features].isna().sum().sum())


Any inf left: 0
Any NaN left: 0


In [33]:
scaler = MinMaxScaler()

df_model = df[["user_id", "trajectory_id"] + features].copy()
df_model[features] = scaler.fit_transform(df_model[features])

print("Scaling complete")


Scaling complete


In [34]:
def create_sequences_limited(df, features, seq_len, max_per_traj=2000):
    sequences = []

    grouped = df.groupby(["user_id", "trajectory_id"])

    for _, group in grouped:
        data = group[features].values

        if len(data) < seq_len:
            continue

        count = 0
        for i in range(len(data) - seq_len):
            sequences.append(data[i:i+seq_len])
            count += 1

            if count >= max_per_traj:
                break

    return np.array(sequences)


In [35]:
SEQ_LEN = 20

X_sequences = create_sequences_limited(
    df_model,
    features,
    SEQ_LEN,
    max_per_traj=1500
)

print("Shape:", X_sequences.shape)


Shape: (12332162, 20, 5)
