# Loading_Data
1. Loading a single trajectory for a single user.
2. Loading all trajectories for a single user.
3. Loading all users.
4. Keeping a sample of 10 users for analysis.
5. Calculating the difference in timestamp between consecutive locations for each user.

In [1]:
import pandas as pd

def load_single_trajectory(file_path):
    """Read one trajectory file and return its usable GPS points.

    The first six lines of the file are skipped and the remaining rows are
    parsed as seven header-less columns. The ``date`` and ``time`` columns are
    joined into a single ``timestamp``; rows whose timestamp cannot be parsed
    are discarded.

    Parameters
    ----------
    file_path : str or path-like
        Location of the trajectory file.

    Returns
    -------
    pandas.DataFrame
        Columns ``lat``, ``lon``, ``altitude`` and ``timestamp``.
    """
    raw_columns = ['lat', 'lon', 'unused', 'altitude', 'days', 'date', 'time']
    kept_columns = ['lat', 'lon', 'altitude', 'timestamp']

    points = pd.read_csv(file_path, skiprows=6, header=None)
    points.columns = raw_columns

    # Unparseable date/time strings become NaT and are filtered out below.
    stamp_text = points['date'] + ' ' + points['time']
    points['timestamp'] = pd.to_datetime(stamp_text, errors='coerce')

    parsed = points[points['timestamp'].notna()]
    return parsed[kept_columns]


In [2]:
from pathlib import Path

def load_user_trajectories(user_path):
    """Load and concatenate every ``.plt`` trajectory of a single user.

    Parameters
    ----------
    user_path : str or path-like
        The user's folder; trajectories are read from its ``Trajectory``
        sub-folder.

    Returns
    -------
    pandas.DataFrame
        All points of all trajectories, with a ``source_file`` column holding
        the name of the file each row came from.

    Raises
    ------
    ValueError
        If no ``.plt`` file is found under ``<user_path>/Trajectory``.
    """
    traj_path = Path(user_path) / "Trajectory"

    # Path.glob yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the row order reproducible between runs and machines.
    plt_files = sorted(traj_path.glob("*.plt"))
    if not plt_files:
        # pd.concat([]) would raise a ValueError anyway; fail with a message
        # that points at the offending folder instead.
        raise ValueError(f"No .plt trajectory files found in {traj_path}")

    all_trips = []
    for file in plt_files:
        df = load_single_trajectory(file)
        df['source_file'] = file.name
        all_trips.append(df)

    return pd.concat(all_trips, ignore_index=True)


In [3]:
# Smoke-test the per-user loader on user "000" and preview the first rows.
user_000 = load_user_trajectories(user_path="Data/raw/000")
print(user_000.head(n=5))


         lat         lon  altitude           timestamp         source_file
0  39.984702  116.318417       492 2008-10-23 02:53:04  20081023025304.plt
1  39.984683  116.318450       492 2008-10-23 02:53:10  20081023025304.plt
2  39.984686  116.318417       492 2008-10-23 02:53:15  20081023025304.plt
3  39.984688  116.318385       492 2008-10-23 02:53:20  20081023025304.plt
4  39.984655  116.318263       492 2008-10-23 02:53:25  20081023025304.plt


In [4]:
def load_all_users(base_path, max_users=None):
    """Load the trajectories of every user folder under ``base_path``.

    Parameters
    ----------
    base_path : str or path-like
        Folder containing one sub-folder per user.
    max_users : int, optional
        If given (and non-zero), only the first ``max_users`` user folders in
        sorted order are loaded. ``None`` or ``0`` loads all of them.

    Returns
    -------
    pandas.DataFrame
        The concatenated points of all loaded users, with a ``user_id`` column
        set to the name of the user's folder.
    """
    base = Path(base_path)

    # Only directories are user folders; stray files in the data root
    # (e.g. OS metadata files) are not users and would fail to load.
    user_dirs = sorted(entry for entry in base.iterdir() if entry.is_dir())
    if max_users:
        user_dirs = user_dirs[:max_users]

    users = []
    for user_dir in user_dirs:
        df = load_user_trajectories(user_dir)
        df['user_id'] = user_dir.name
        users.append(df)

    return pd.concat(users, ignore_index=True)


In [5]:
# Load only the first 10 user folders to keep the analysis sample small.
gps_raw = load_all_users(base_path="Data/raw", max_users=10)

In [6]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
gps_raw = gps_raw.reset_index(drop=True, inplace=False)

In [7]:
# Column dtypes and memory footprint, then summary statistics.
gps_raw.info()
gps_raw.describe(percentiles=[0.25, 0.5, 0.75])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845936 entries, 0 to 1845935
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   lat          float64       
 1   lon          float64       
 2   altitude     int64         
 3   timestamp    datetime64[ns]
 4   source_file  object        
 5   user_id      object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 84.5+ MB


Unnamed: 0,lat,lon,altitude,timestamp
count,1845936.0,1845936.0,1845936.0,1845936
mean,39.50336,116.5659,233.4352,2009-02-01 03:20:04.200627456
min,22.14758,110.7199,-4532.0,2008-10-23 02:53:04
25%,39.92709,116.3196,113.0,2008-11-17 16:00:06.750000128
50%,39.98919,116.3271,160.0,2009-01-18 10:08:20
75%,40.00258,116.3458,233.0,2009-04-06 05:13:05
max,41.36709,122.6515,28876.0,2009-07-29 06:16:11
std,1.951175,1.36335,391.2023,


In [8]:
# Order points chronologically within each user and renumber the rows.
gps_raw = gps_raw.sort_values(['user_id', 'timestamp'], ignore_index=True)


In [9]:
# Preview the sorted frame.
gps_raw.head(5)

Unnamed: 0,lat,lon,altitude,timestamp,source_file,user_id
0,39.984702,116.318417,492,2008-10-23 02:53:04,20081023025304.plt,0
1,39.984683,116.31845,492,2008-10-23 02:53:10,20081023025304.plt,0
2,39.984686,116.318417,492,2008-10-23 02:53:15,20081023025304.plt,0
3,39.984688,116.318385,492,2008-10-23 02:53:20,20081023025304.plt,0
4,39.984655,116.318263,492,2008-10-23 02:53:25,20081023025304.plt,0


In [10]:
# Seconds elapsed since the previous point of the same user
# (NaN for each user's first point, which has no predecessor).
gps_raw['delta_time_s'] = (
    gps_raw.groupby('user_id')['timestamp']
    .diff(periods=1)
    .dt.total_seconds()
    .reset_index(drop=True)
)


In [11]:
# Preview the frame with the new delta_time_s column.
gps_raw.head(5)

Unnamed: 0,lat,lon,altitude,timestamp,source_file,user_id,delta_time_s
0,39.984702,116.318417,492,2008-10-23 02:53:04,20081023025304.plt,0,
1,39.984683,116.31845,492,2008-10-23 02:53:10,20081023025304.plt,0,6.0
2,39.984686,116.318417,492,2008-10-23 02:53:15,20081023025304.plt,0,5.0
3,39.984688,116.318385,492,2008-10-23 02:53:20,20081023025304.plt,0,5.0
4,39.984655,116.318263,492,2008-10-23 02:53:25,20081023025304.plt,0,5.0


In [12]:
# The first row of each user has no previous timestamp to compute a time
# difference from, so its missing value is filled with 0.
gps_raw['delta_time_s'] = gps_raw['delta_time_s'].fillna(value=0)

In [13]:
# Flag gaps of more than 30 minutes (1800 s) between consecutive points.
gps_raw['large_gap'] = gps_raw['delta_time_s'].gt(1800)

In [14]:
# Share of points that follow a large gap.
gps_raw.loc[:, 'large_gap'].mean()

np.float64(0.0019496883965641279)

In [15]:
# Distribution of the time differences between consecutive points.
gps_raw.loc[:, 'delta_time_s'].describe()


count    1.845936e+06
mean     6.257101e+01
std      7.793395e+03
min      0.000000e+00
25%      5.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      9.156457e+06
Name: delta_time_s, dtype: float64

## Distance Computation

Distance between consecutive GPS points is computed using a custom implementation
of the Haversine formula. This avoids reliance on external geospatial libraries
and ensures portability across environments.


In [16]:
import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2, radius=6371000):
    """Great-circle distance between two points using the Haversine formula.

    Works element-wise on scalars as well as NumPy arrays / pandas Series.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. Defaults to
        6371000, the Earth radius in meters.

    Returns
    -------
    float or array-like
        Distance(s) along the sphere surface, in the unit of ``radius``.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)

    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    a = (
        np.sin(dphi / 2)**2 +
        np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    )

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return 2 * radius * np.arcsin(np.sqrt(a))



In [17]:
def compute_distance(df):
    """Add the distance from the previous row as a ``delta_dist_m`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Points in travel order, with ``lat`` and ``lon`` columns.

    Returns
    -------
    pandas.DataFrame
        A re-indexed (0..n-1) copy of ``df`` with ``delta_dist_m`` holding the
        Haversine distance between each row and the row before it. The first
        row has no predecessor and gets 0.0. An empty frame is returned with
        an empty ``delta_dist_m`` column.
    """
    df = df.reset_index(drop=True)

    lat = df['lat'].to_numpy()
    lon = df['lon'].to_numpy()

    # Pair every point with its predecessor in one vectorised call instead of
    # a Python-level loop over rows.
    distances = np.zeros(len(df), dtype=float)
    if len(df) > 1:
        distances[1:] = haversine_distance(
            lat[:-1], lon[:-1],
            lat[1:], lon[1:]
        )

    df['delta_dist_m'] = distances
    return df


In [18]:
# Compute step distances separately for each user so that no distance is
# measured across two different users' points.
# Groups are processed explicitly rather than through groupby.apply: applying
# a function that needs the grouping column is deprecated in pandas, and this
# form keeps `user_id` in the result on every pandas version.
gps_raw = pd.concat(
    [
        compute_distance(user_points)
        for _, user_points in gps_raw.groupby('user_id')
    ],
    ignore_index=True,
)

  .apply(compute_distance).reset_index(drop=True)


In [19]:
# Preview the frame with the new delta_dist_m column.
gps_raw.head(5)

Unnamed: 0,lat,lon,altitude,timestamp,source_file,user_id,delta_time_s,large_gap,delta_dist_m
0,39.984702,116.318417,492,2008-10-23 02:53:04,20081023025304.plt,0,0.0,False,0.0
1,39.984683,116.31845,492,2008-10-23 02:53:10,20081023025304.plt,0,6.0,False,3.516886
2,39.984686,116.318417,492,2008-10-23 02:53:15,20081023025304.plt,0,5.0,False,2.831299
3,39.984688,116.318385,492,2008-10-23 02:53:20,20081023025304.plt,0,5.0,False,2.735434
4,39.984655,116.318263,492,2008-10-23 02:53:25,20081023025304.plt,0,5.0,False,11.023008


## Speed Estimation

Instantaneous speed is calculated as distance divided by time difference between
consecutive GPS points. Observations with zero time difference are assigned zero
speed to avoid division errors.

Unrealistic speeds (>60 m/s) are treated as noise and excluded from analysis.


In [20]:
# Speed = distance / elapsed time; rows with zero elapsed time are assigned
# speed 0 instead of the inf/NaN the division produces.
gps_raw['speed_mps'] = (
    gps_raw['delta_dist_m']
    .div(gps_raw['delta_time_s'])
    .mask(gps_raw['delta_time_s'].eq(0), 0)
)


In [21]:
# Speeds above 60 m/s are treated as noise and blanked out (set to missing).
gps_raw['speed_mps'] = gps_raw['speed_mps'].mask(gps_raw['speed_mps'].gt(60))


## Movement Detection

A point is classified as movement if instantaneous speed exceeds 0.5 m/s.
This threshold filters GPS jitter while preserving slow human motion such as walking.


In [22]:
# A point counts as movement when its speed exceeds 0.5 m/s
# (missing speeds compare as False).
gps_raw['is_moving'] = gps_raw['speed_mps'].gt(0.5)


## Data Quality Checks

Summary statistics and movement proportions were inspected to validate
distance and speed calculations and identify anomalies.


In [23]:
# Summary statistics of step distance and speed.
gps_raw.loc[:, ['delta_dist_m', 'speed_mps']].describe()


Unnamed: 0,delta_dist_m,speed_mps
count,1845936.0,1845227.0
mean,28.38073,3.862326
std,3383.306,6.130569
min,0.0,0.0
25%,2.447771,0.4770108
50%,6.742943,1.550514
75%,15.47954,4.072176
max,2001218.0,59.98481


In [24]:
# Proportion of moving vs. non-moving points.
gps_raw.loc[:, 'is_moving'].value_counts(normalize=True)


is_moving
True     0.743395
False    0.256605
Name: proportion, dtype: float64

## Trip Boundary Logic

Trip boundaries are identified using a rule-based approach that combines
temporal gaps and motion characteristics. A new trip is initiated when
either a large time gap (>30 minutes) is observed or when prolonged
stationarity (>10 minutes with speed <0.5 m/s) occurs.


In [25]:
# A point opens a new trip when it follows a gap of more than 30 minutes, or
# a gap of more than 10 minutes during which the speed stayed below 0.5 m/s.
gps_raw['trip_break'] = (
    gps_raw['delta_time_s'].gt(1800)
    | (gps_raw['speed_mps'].lt(0.5) & gps_raw['delta_time_s'].gt(600))
)


## Trip Identifier Construction

Trip identifiers are generated per user using a cumulative sum over detected
trip boundaries. This ensures sequential, interpretable trip IDs without
reliance on file structure or external metadata.



In [27]:
# Running count of trip breaks within each user: every break bumps the id,
# so consecutive points between two breaks share one trip_id.
gps_raw['trip_id'] = (
    gps_raw.groupby('user_id')['trip_break'].transform('cumsum')
)


## Trip Validation

Very short or noisy trips are filtered out using minimum thresholds on
number of points and total distance. This removes GPS artifacts while
preserving meaningful mobility behavior.


In [29]:
# Per-trip summary: number of points, travelled distance and duration.
#
# The row that opens a trip (trip_break == True) stores the time gap and the
# jump distance measured from the *previous* trip's last point. Those deltas
# belong to the pause between trips, not to the trip itself, so they are
# zeroed before summing; otherwise every trip would be inflated by the gap
# that preceded it.
trip_stats = (
    gps_raw
    .assign(
        trip_dist_m=lambda d: d['delta_dist_m'].where(~d['trip_break'], 0.0),
        trip_time_s=lambda d: d['delta_time_s'].where(~d['trip_break'], 0.0),
    )
    .groupby(['user_id', 'trip_id'])
    .agg(
        points=('timestamp', 'count'),
        total_dist=('trip_dist_m', 'sum'),
        duration=('trip_time_s', 'sum')
    )
    .reset_index()
)

# Keep trips with at least 10 points and at least 100 m travelled.
valid_trips = trip_stats[
    (trip_stats['points'] >= 10) &
    (trip_stats['total_dist'] >= 100)
]


In [31]:
# Persist the enriched point-level table.
# The staging folder is created first so the export also works on a fresh
# checkout where it does not exist yet.
# NOTE(review): the full gps_raw frame is written; the valid_trips selection
# computed earlier is not applied here — confirm this is intended.
output_path = Path("Data/staging/gps_enriched.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
gps_raw.to_csv(output_path, index=False)
