In [2]:
import pandas as pd
import numpy as np

# Load the data
# parse_dates tells pandas to treat the 'timestamp' column as a proper date/time object
df = pd.read_csv('../data/raw/sample_data.csv', parse_dates=['timestamp'])

# IMPORTANT: Always sort your data by time. Calculations depend on the correct order.
df = df.sort_values(by='timestamp').reset_index(drop=True)

print("Original Data:")
df

Original Data:


Unnamed: 0,timestamp,latitude,longitude
0,2025-09-05 10:00:00+00:00,13.0827,80.2707
1,2025-09-05 10:00:10+00:00,13.083,80.271
2,2025-09-05 10:00:20+00:00,13.0833,80.2713
3,2025-09-05 10:05:30+00:00,13.0833,80.2713
4,2025-09-05 10:05:40+00:00,13.0833,80.2713
5,2025-09-05 10:05:50+00:00,13.087,80.275
6,2025-09-05 10:06:00+00:00,13.287,80.375


In [3]:
# The .diff() method calculates the difference between an element and the one before it.
df['time_diff_seconds'] = df['timestamp'].diff().dt.total_seconds()

# The first row will have no difference, so we fill it with 0.
df['time_diff_seconds'] = df['time_diff_seconds'].fillna(0)

print("Data with Time Difference:")
df

Data with Time Difference:


Unnamed: 0,timestamp,latitude,longitude,time_diff_seconds
0,2025-09-05 10:00:00+00:00,13.0827,80.2707,0.0
1,2025-09-05 10:00:10+00:00,13.083,80.271,10.0
2,2025-09-05 10:00:20+00:00,13.0833,80.2713,10.0
3,2025-09-05 10:05:30+00:00,13.0833,80.2713,310.0
4,2025-09-05 10:05:40+00:00,13.0833,80.2713,10.0
5,2025-09-05 10:05:50+00:00,13.087,80.275,10.0
6,2025-09-05 10:06:00+00:00,13.287,80.375,10.0


In [9]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points
    on the earth (specified in decimal degrees) in meters.
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371000 # Radius of earth in meters.
    return c * r

# To calculate distance, we need the previous point's coordinates.
# The .shift(1) method creates a new column with the values from the row above.
df['prev_lat'] = df['latitude'].shift(1)
df['prev_lon'] = df['longitude'].shift(1)

# Apply the haversine function row by row
df['distance_meters'] = df.apply(
    lambda row: haversine(row['prev_lat'], row['prev_lon'], row['latitude'], row['longitude'])
    if pd.notnull(row['prev_lat'])
    else 0,
    axis=1
)

print("Data with Distance:")
# We can drop the temporary 'prev' columns now
df[['timestamp', 'latitude', 'longitude', 'time_diff_seconds', 'distance_meters']]

Data with Distance:


Unnamed: 0,timestamp,latitude,longitude,time_diff_seconds,distance_meters
0,2025-09-05 10:00:00+00:00,13.0827,80.2707,0.0,0.0
1,2025-09-05 10:00:10+00:00,13.083,80.271,10.0,46.567781
2,2025-09-05 10:00:20+00:00,13.0833,80.2713,10.0,46.567754
3,2025-09-05 10:05:30+00:00,13.0833,80.2713,310.0,0.0
4,2025-09-05 10:05:40+00:00,13.0833,80.2713,10.0,0.0
5,2025-09-05 10:05:50+00:00,13.087,80.275,10.0,574.33336
6,2025-09-05 10:06:00+00:00,13.287,80.375,10.0,24734.200045


In [5]:
# Calculate speed in meters per second (m/s)
# We must handle the case where time_diff is 0 to avoid division by zero errors.
df['speed_mps'] = df['distance_meters'] / df['time_diff_seconds']

# Replace any 'inf' values (from division by zero) with 0 and fill any NaN with 0
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)


# Let's clean up the final DataFrame
final_df = df[['timestamp', 'latitude', 'longitude', 'time_diff_seconds', 'distance_meters', 'speed_mps']]

print("Final DataFrame with All Features:")
final_df

Final DataFrame with All Features:


Unnamed: 0,timestamp,latitude,longitude,time_diff_seconds,distance_meters,speed_mps
0,2025-09-05 10:00:00+00:00,13.0827,80.2707,0.0,0.0,0.0
1,2025-09-05 10:00:10+00:00,13.083,80.271,10.0,46.567781,4.656778
2,2025-09-05 10:00:20+00:00,13.0833,80.2713,10.0,46.567754,4.656775
3,2025-09-05 10:05:30+00:00,13.0833,80.2713,310.0,0.0,0.0
4,2025-09-05 10:05:40+00:00,13.0833,80.2713,10.0,0.0,0.0
5,2025-09-05 10:05:50+00:00,13.087,80.275,10.0,574.33336,57.433336
6,2025-09-05 10:06:00+00:00,13.287,80.375,10.0,24734.200045,2473.420004


In [6]:
# Rule 1: Flag any point where the speed is over 100 m/s (360 km/h)
# This creates a new column with True/False values.
# This is a more robust way to assign a new column
final_df.loc[:, 'anomaly_teleport'] = final_df['speed_mps'] > 100

print("DataFrame with Teleport Anomaly Flag:")
# Show only the rows where the new flag is True
final_df[final_df['anomaly_teleport'] == True]

DataFrame with Teleport Anomaly Flag:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, 'anomaly_teleport'] = final_df['speed_mps'] > 100


Unnamed: 0,timestamp,latitude,longitude,time_diff_seconds,distance_meters,speed_mps,anomaly_teleport
6,2025-09-05 10:06:00+00:00,13.287,80.375,10.0,24734.200045,2473.420004,True


In [7]:
# Rule 2: Flag any point where the time since the last update is over 10 minutes (600 seconds)
final_df['anomaly_missing'] = final_df['time_diff_seconds'] > 600

# Let's see if any were flagged in our sample
final_df[final_df['anomaly_missing'] == True]

Unnamed: 0,timestamp,latitude,longitude,time_diff_seconds,distance_meters,speed_mps,anomaly_teleport,anomaly_missing


In [8]:
# THE FIX: Set the 'timestamp' column as the index of the DataFrame.
# This is required for time-based rolling windows.
final_df.set_index('timestamp', inplace=True)

# Rule 3: Flag prolonged inactivity.
# Now the .rolling() function will work correctly on the new index.
final_df['distance_in_5min'] = final_df.rolling(window='300s')['distance_meters'].sum()

# Now, we flag as an anomaly if that sum is less than our threshold (20 meters)
final_df['anomaly_inactive'] = final_df['distance_in_5min'] < 20

# Let's turn the index back into a regular column so our table looks the same as before.
final_df.reset_index(inplace=True)

print("DataFrame with All Anomaly Flags:")
# Let's look at all the anomaly columns together and see the result
print(final_df[final_df['anomaly_inactive'] == True])

DataFrame with All Anomaly Flags:
                  timestamp  latitude  longitude  time_diff_seconds  \
0 2025-09-05 10:00:00+00:00   13.0827    80.2707                0.0   
3 2025-09-05 10:05:30+00:00   13.0833    80.2713              310.0   
4 2025-09-05 10:05:40+00:00   13.0833    80.2713               10.0   

   distance_meters  speed_mps  anomaly_teleport  anomaly_missing  \
0              0.0        0.0             False            False   
3              0.0        0.0             False            False   
4              0.0        0.0             False            False   

   distance_in_5min  anomaly_inactive  
0               0.0              True  
3               0.0              True  
4               0.0              True  
