In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from math import radians, sin, cos, sqrt, atan2


In [4]:
# Path of the raw train-rides CSV; point this at your local copy.
file_path = 'DBtrainrides.csv'

# Load the whole file into memory as the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)


In [5]:
# Parse the 'ID' column to extract 'route_id', 'departure_time_str', and 'station_number'
def parse_id(id_str):
    """Split a ride ID into ``(route_id, departure_time_str, station_number)``.

    Two layouts are accepted:

    * ``<route>-<departure>-<station>``
    * ``-<route>-<departure>-<station>``, where the leading ``-`` is the sign
      of the route id and is kept as part of ``route_id``.

    Parameters
    ----------
    id_str : object
        The raw ID value. Anything that is not a ``str`` (NaN, ``None``, or an
        ID that was read as a number) is treated as unparseable.

    Returns
    -------
    tuple
        Three strings, or ``(None, None, None)`` when the value is not a
        string or does not match either layout.
    """
    # Guard on the type rather than on "is null": a numeric ID would otherwise
    # reach .split() and raise AttributeError in the middle of a column-wide apply.
    if not isinstance(id_str, str):
        return None, None, None

    parts = id_str.split('-')

    if len(parts) == 4 and parts[0] == '':
        # A leading '-' produces an empty first piece; fold the sign back
        # into the route id so the remaining logic sees three pieces.
        parts = ['-' + parts[1], parts[2], parts[3]]

    if len(parts) != 3:
        # ID does not conform to either expected pattern
        return None, None, None

    route_id, departure_time_str, station_number = parts
    return route_id, departure_time_str, station_number


In [6]:
def parse_departure_time(departure_time_str):
    """Turn a 10-character ``YYMMDDhhmm`` string into a ``datetime``.

    The two-digit year is prefixed with ``'20'`` (years are assumed to be
    2000 or later). Returns ``None`` when the value is not a string, is not
    exactly 10 characters long, or does not describe a valid date/time.
    """
    well_formed = isinstance(departure_time_str, str) and len(departure_time_str) == 10
    if not well_formed:
        return None

    # Cut the string into its five 2-character fields: YY, MM, DD, hh, mm.
    fields = [departure_time_str[start:start + 2] for start in range(0, 10, 2)]

    try:
        year = int('20' + fields[0])  # Assuming years are 2020+
        month, day, hour, minute = (int(field) for field in fields[1:])
        return datetime(year, month, day, hour, minute)
    except ValueError:
        # Non-numeric field or an out-of-range calendar value.
        return None

In [None]:
# Apply parsing functions
# Build the three ID-derived columns in a single DataFrame construction.
# Creating a pd.Series for every row (apply + pd.Series) is far slower on a
# large frame than collecting plain tuples and converting them once.
parsed_ids = pd.DataFrame(
    df['ID'].map(parse_id).tolist(),
    index=df.index,  # keep row alignment with df
    columns=['route_id', 'departure_time_str', 'station_number'],
)
df[['route_id', 'departure_time_str', 'station_number']] = parsed_ids

# Convert 'departure_time_str' to datetime (None where the string is malformed)
df['departure_time'] = df['departure_time_str'].apply(parse_departure_time)

# Convert 'station_number' to numeric; unparseable values become NaN
df['station_number'] = pd.to_numeric(df['station_number'], errors='coerce')

# Convert 'arrival_plan' and 'departure_plan' to datetime
df['arrival_plan'] = pd.to_datetime(df['arrival_plan'])
df['departure_plan'] = pd.to_datetime(df['departure_plan'])

# Ensure 'arrival_delay_m' and 'departure_delay_m' are numeric
df['arrival_delay_m'] = pd.to_numeric(df['arrival_delay_m'], errors='coerce')
df['departure_delay_m'] = pd.to_numeric(df['departure_delay_m'], errors='coerce')

# Sort so that each journey (route_id + departure_time) is contiguous and its
# stops are in station order, then renumber the rows. The later shift/diff/
# cumsum features rely on this ordering. Reassigning instead of using
# inplace=True keeps the cell safe to re-run.
df = (
    df.sort_values(by=['route_id', 'departure_time', 'station_number'])
      .reset_index(drop=True)
)

# Display the sorted data
print("\nData after parsing and sorting:")
df[['ID', 'route_id', 'departure_time', 'station_number']].head()

In [None]:
# Renumber the rows 0..n-1, discarding whatever index the frame carried before.
df = df.reset_index(drop=True)

# Plain-text preview of the ID-derived columns after parsing and sorting.
print("\nData after parsing and sorting:")
print(df.loc[:, ['ID', 'route_id', 'departure_time', 'station_number']].head())

In [None]:
# ## 1. Incorporate Delay from Previous Stations

# %% [markdown]
# ### 1.1 Add Previous Arrival and Departure Delays

# %%
# A journey is identified by (route_id, departure_time). Within each journey,
# move both delay columns down one row so every stop sees the delay recorded
# at the stop before it.
lagged_delays = (
    df.groupby(['route_id', 'departure_time'])[['arrival_delay_m', 'departure_delay_m']]
      .shift(1)
      .fillna(0)  # first stop of a journey has no predecessor -> 0
)
df['prev_arrival_delay_m'] = lagged_delays['arrival_delay_m']
df['prev_departure_delay_m'] = lagged_delays['departure_delay_m']

# Show the lagged arrival delay next to the original
print("\nData with previous delays:")
print(df[['route_id', 'departure_time', 'station_number', 'arrival_delay_m', 'prev_arrival_delay_m']].head(10))




In [None]:
# ### 1.2 Add Weighted Average Delay of All Previous Stops (Optimized)

# %%
# Optimized function to calculate weighted average delay without for loops
def calculate_weighted_avg_delay_vectorized(group):
    """Add ``weighted_avg_prev_delay``: a position-weighted mean of earlier arrival delays.

    Rows are weighted by their position in ``group`` (1 for the first row,
    2 for the second, ...), so later stops count more. The value at row *i* is
    the weighted mean over rows ``0..i-1``; the first row has no predecessors
    and gets 0. Missing ``arrival_delay_m`` values are treated as 0.

    The rows are used in the order given, so the caller is responsible for
    passing the stops of one journey already sorted in travel order.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one journey; must contain an ``arrival_delay_m`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the extra
        ``weighted_avg_prev_delay`` column. An empty input yields an empty
        frame that still carries the new column.
    """
    delays = group['arrival_delay_m'].fillna(0).to_numpy(dtype=float)
    n_stops = len(delays)
    weights = np.arange(1, n_stops + 1)

    # Running weighted mean that includes the current row ...
    running_avg = np.cumsum(delays * weights) / np.cumsum(weights)

    # ... shifted down by one so each row only sees the rows before it.
    # Pre-allocating by length (instead of np.insert) keeps the result the same
    # length as the group even when the group is empty.
    weighted_avg_prev = np.zeros(n_stops, dtype=float)
    weighted_avg_prev[1:] = running_avg[:-1]

    # assign() returns a new frame, so the object handed in by groupby.apply
    # is never modified in place.
    return group.assign(weighted_avg_prev_delay=weighted_avg_prev)

In [None]:
# Run the weighted-average helper once per journey. group_keys=False keeps the
# grouping keys out of the result's index.
journey_groups = df.groupby(['route_id', 'departure_time'], group_keys=False)
df = journey_groups.apply(calculate_weighted_avg_delay_vectorized)

# Preview the freshly added column
print("\nData with weighted average previous delay (Optimized):")
df[['route_id', 'departure_time', 'station_number', 'weighted_avg_prev_delay']].head(10)

In [None]:
# ### 1.3 Add Feature Representing Gain in Delay Over Stations (Optimized)

# %%
# Arrival delays grouped by journey (route_id + departure_time)
journey_arrival_delay = df.groupby(['route_id', 'departure_time'])['arrival_delay_m']

# Running sum of the arrival delays along each journey
cumulative_delay = journey_arrival_delay.cumsum()

# Gain in delay between consecutive stations: how much the arrival delay
# changed compared with the previous stop of the same journey (0 at the first
# stop). This is taken directly from arrival_delay_m: differencing the running
# sum above would simply give back arrival_delay_m itself, not a gain.
delay_gain = journey_arrival_delay.diff().fillna(0)

df['cumulative_delay'] = cumulative_delay
df['delay_gain'] = delay_gain

# Display the new feature
print("\nData with delay gain (Optimized):")
print(df[['route_id', 'departure_time', 'station_number', 'arrival_delay_m', 'cumulative_delay', 'delay_gain']].head(10))

In [None]:
# ## 2. Represent Interconnectedness of Stops within a Route

# %% [markdown]
# ### 2.1 Add Station Number Features

# %%
# Largest station number seen on each journey, broadcast back onto every stop
station_numbers_by_journey = df.groupby(['route_id', 'departure_time'])['station_number']
df['max_station_number'] = station_numbers_by_journey.transform('max')

# Fraction of the journey's stops reached so far (current / max station number)
df['station_progress'] = df['station_number'].div(df['max_station_number'])

# Preview the station-number features
print("\nData with station number features:")
df[['route_id', 'departure_time', 'station_number', 'max_station_number', 'station_progress']].head(10)

In [None]:
# ### 2.2 Add Time-Based Features

# %%
# Per-journey views of the planned times. All grouped values are computed up
# front, before any new column is written back to df.
journey_plans = df.groupby(['route_id', 'departure_time'])
origin_departure_plan = journey_plans['departure_plan'].transform('first')
final_arrival_plan = journey_plans['arrival_plan'].transform('last')
next_arrival_plan = journey_plans['arrival_plan'].shift(-1)

# Planned departure time from the origin station for each journey
df['origin_departure_plan'] = origin_departure_plan

# Planned elapsed time since departure from origin station, in minutes
df['planned_elapsed_time'] = (df['arrival_plan'] - df['origin_departure_plan']).dt.total_seconds() / 60

# Total planned time for the journey, in minutes
df['total_planned_time'] = (final_arrival_plan - df['origin_departure_plan']).dt.total_seconds() / 60

# Ratio of elapsed time to total time. A journey whose total planned time is 0
# would otherwise produce +/-inf here, so a zero denominator is turned into NaN
# first (the same guard the other progress ratios in this notebook use).
df['time_progress'] = df['planned_elapsed_time'] / df['total_planned_time'].replace({0: np.nan})

# Planned travel time to the next stop, in minutes (NaN at the last stop)
df['next_arrival_plan'] = next_arrival_plan
df['planned_travel_time_to_next_stop'] = (df['next_arrival_plan'] - df['departure_plan']).dt.total_seconds() / 60

# Display the new features. This is not the last statement of the cell, so a
# bare expression would be silently discarded; print it explicitly instead.
print("\nData with time-based features:")
print(df[['route_id', 'departure_time', 'station_number', 'planned_elapsed_time', 'total_planned_time', 'time_progress', 'planned_travel_time_to_next_stop']].head(10))

# %% [markdown]
# ### 2.25 Combine Progress Ratios

# %%
# progress_ratio = station_progress / time_progress.
# A time_progress of exactly 0 is swapped for NaN so it cannot act as a divisor.
safe_time_progress = df['time_progress'].replace(0, np.nan)
raw_progress_ratio = df['station_progress'].div(safe_time_progress)

# Any infinite or missing ratio is reported as 0
df['progress_ratio'] = raw_progress_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Preview the combined ratio next to its two inputs
print("\nData with progress ratio:")
df[['route_id', 'departure_time', 'station_number', 'station_progress', 'time_progress', 'progress_ratio']].head(10)


In [None]:
# ### 2.3 Add Distance-Based Features (Optimized)

# %%
# Coerce both coordinate columns to numbers; anything unparseable becomes NaN
for coord_col in ('long', 'lat'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

# Keep only the rows where both coordinates are present
df = df.dropna(subset=['long', 'lat'])

In [None]:
# Optimized function to calculate distance features without for loops
def calculate_distance_features_vectorized(group, earth_radius_km=6371):
    """Add great-circle distance features for the stops of one journey.

    The rows are sorted by ``station_number`` and the haversine distance
    between each pair of consecutive stops is computed from ``lat``/``long``
    (given in degrees).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one journey with ``station_number``, ``lat`` and ``long`` columns.
    earth_radius_km : float, optional
        Sphere radius used by the haversine formula. The default of 6371 gives
        distances in kilometres.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by ``station_number``, with four added columns:

        * ``distance_to_prev_stop`` - distance from the previous stop (0 at the first stop)
        * ``distance_from_origin`` - running sum of ``distance_to_prev_stop``
        * ``total_distance`` - journey length, identical on every row
        * ``distance_to_next_stop`` - distance to the following stop (0 at the last stop)

        An empty input yields an empty frame that still carries these columns.
    """
    ordered = group.sort_values('station_number')
    n_stops = len(ordered)

    # Convert degrees to radians
    lat_rad = np.radians(ordered['lat'].to_numpy(dtype=float))
    lon_rad = np.radians(ordered['long'].to_numpy(dtype=float))

    # Differences between consecutive coordinates (one fewer than the number of stops)
    delta_phi = np.diff(lat_rad)
    delta_lambda = np.diff(lon_rad)

    # Haversine formula
    a = (
        np.sin(delta_phi / 2.0) ** 2
        + np.cos(lat_rad[:-1]) * np.cos(lat_rad[1:]) * np.sin(delta_lambda / 2.0) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt(1 - a) NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    leg_distances = earth_radius_km * c  # distances between consecutive stops

    # Pre-allocate by group length so that empty and single-stop groups get
    # arrays of the right size (np.insert / np.append would always add an element).
    distances_to_prev = np.zeros(n_stops, dtype=float)
    distances_to_prev[1:] = leg_distances  # first stop keeps 0

    distances_to_next = np.zeros(n_stops, dtype=float)
    distances_to_next[:-1] = leg_distances  # last stop keeps 0

    distances_from_origin = np.cumsum(distances_to_prev)
    total_distance = distances_from_origin[-1] if n_stops > 0 else 0.0

    # assign() builds a new frame, leaving the caller's object untouched.
    return ordered.assign(
        distance_to_prev_stop=distances_to_prev,
        distance_from_origin=distances_from_origin,
        total_distance=total_distance,  # same for all rows in the group
        distance_to_next_stop=distances_to_next,
    )


In [None]:
# Run the distance helper once per journey. group_keys=False keeps the
# grouping keys out of the result's index.
journeys_for_distance = df.groupby(['route_id', 'departure_time'], group_keys=False)
df = journeys_for_distance.apply(calculate_distance_features_vectorized)

# Preview the distance columns
distance_preview_cols = [
    'route_id', 'departure_time', 'station_number',
    'distance_to_prev_stop', 'distance_to_next_stop',
    'distance_from_origin', 'total_distance',
]
print("\nData with distance-based features (Optimized):")
print(df[distance_preview_cols].head(10))


In [None]:
# %% [markdown]
# ### 2.35 Link Distance Features to Progress

# %%
# Share of the journey's total distance covered so far.
# A total distance of exactly 0 is swapped for NaN so it cannot act as a divisor.
safe_total_distance = df['total_distance'].replace(0, np.nan)
raw_distance_progress = df['distance_from_origin'].div(safe_total_distance)

# Any infinite or missing ratio is reported as 0
df['distance_progress'] = raw_distance_progress.replace([np.inf, -np.inf], np.nan).fillna(0)

# Preview the new column
print("\nData with distance progress:")
df[['route_id', 'departure_time', 'station_number', 'distance_progress']].head(10)


In [None]:
# ### 2.4 Compute Average Delay of a Region Using City

# %%
# Mean arrival delay of each city, broadcast back so every row carries the
# average of the city it belongs to.
arrival_delay_by_city = df.groupby('city')['arrival_delay_m']
city_avg_delay = arrival_delay_by_city.transform('mean')

# Store it as a feature column
df['avg_city_delay'] = city_avg_delay

# Preview the city average next to the raw delay
print("\nData with average city delay:")
df[['city', 'arrival_delay_m', 'avg_city_delay']].head(10)


In [None]:
# Lift the column-count limit so the wide feature table is shown in full,
# then preview the first 20 rows.
pd.options.display.max_columns = None
print("\nFinal Data with New Features:")
df.head(20)

In [None]:
# Write the feature-enriched frame to a new CSV; index=False leaves the row
# index out of the file.
df.to_csv(path_or_buf='DBtrainrides_processed_optimized_2.csv', index=False)