In [1]:
import os
import pandas as pd

# Manual download

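We only need disruptions that involve the ASD station, so the combined data is filtered on that station code further below.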

In [2]:
def process_traffic_file(file_path):
    """
    Load one disruptions CSV and reduce it to the columns used downstream.

    Steps:
      1. Read the CSV, skipping lines pandas cannot parse.
      2. Strip every character except digits, ``.`` and ``-`` from
         ``duration_minutes`` and convert it to a number. Values that are
         missing or still unparseable after stripping become 0.
      3. Parse ``start_time`` (unparseable values become NaT) and derive
         ``start_time_date``, which holds only the calendar date.
      4. Keep whichever of the important columns are present, in a fixed order.

    Every step is skipped silently when its input column is absent.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the available important columns, so the
        caller can add or modify columns without touching (or being warned
        about) the intermediate frame built here.
    """
    # Read the CSV; malformed lines are dropped rather than aborting the load.
    df = pd.read_csv(file_path, on_bad_lines='skip')

    # Clean numeric columns
    numeric_columns = ["duration_minutes"]
    for col in numeric_columns:
        if col in df.columns:
            # Remove units, thousands separators and any other stray text
            # before the numeric conversion.
            df[col] = df[col].astype(str).str.replace(r'[^\d.-]', '', regex=True)
            # Anything that is still not a number (including missing values)
            # is coerced to NaN and then replaced by 0.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Parse datetime
    if "start_time" in df.columns:
        df["start_time"] = pd.to_datetime(df["start_time"], errors='coerce')
        df["start_time_date"] = df["start_time"].dt.date

    # Select important columns
    important_columns = [
        "rdt_station_codes", "cause_en", "cause_group", "start_time_date", "duration_minutes"
    ]
    available_columns = [col for col in important_columns if col in df.columns]
    # Return an explicit copy: callers add columns to the result, and a bare
    # column selection may still be tied to the local frame it came from.
    return df[available_columns].copy()

In [3]:
# Load every yearly disruptions CSV from the raw-data folder and stack them
# into one frame, tagging each row with the file it came from.
directory_path = "../Data_Raw/Traffic/"
# os.listdir returns names in arbitrary order; sort so the row order of the
# combined frame is the same on every run and machine.
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

# Collect the per-file frames and concatenate once at the end. Concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for file in csv_files:
    file_path = os.path.join(directory_path, file)
    print(f"Processing: {file}")
    # .assign returns a new frame, so the frame returned by the processing
    # function is not modified in place.
    df = process_traffic_file(file_path).assign(source_file=file)
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no CSV files.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Processing: disruptions-2022.csv
Processing: disruptions-2023.csv
Processing: disruptions-2024.csv


In [4]:
# Filter for ASD station.
# Match "ASD" only as a whole code: a plain substring test would also keep
# rows whose codes merely contain "ASD" as part of a longer code.
# NOTE(review): assumes rdt_station_codes holds one or more station codes
# separated by non-word characters (e.g. commas/spaces) - confirm against the data.
# Rows with a missing station code are excluded (na=False).
asd_mask = combined_df['rdt_station_codes'].str.contains(r'\bASD\b', na=False, regex=True)
filtered_df = combined_df[asd_mask]

# Save cleaned data; the target folder is created if it does not exist yet.
cleaned_data_path = "../../Data_Sources/Data_Modelling/Traffic/"
os.makedirs(cleaned_data_path, exist_ok=True)
output_file = os.path.join(cleaned_data_path, "disruptions_data_historical.csv")
filtered_df.to_csv(output_file, index=False)