In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Function to create a Spark session
def get_spark_session(app_name="TransportAnalytics"):
    return SparkSession.builder \
        .appName(app_name) \
        .master("local[*]") \
        .config("spark.sql.shuffle.partitions", "8") \
        .getOrCreate()

#Start the session
spark = get_spark_session()

#Check if it worked
print("Spark Session Created!")
spark


Spark Session Created!


In [3]:
import os
import zipfile

# Set your base path
base_path = r'F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS'

def extract_nested_zips(root_directory):
    # Walk through the entire directory tree
    for root, dirs, files in os.walk(root_directory):
        for file in files:
            if file.endswith('.zip'):
                zip_path = os.path.join(root, file)
                # Define where to extract (extracting into the current 'root' folder)
                extract_path = root 
                
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        print(f"Extracting: {file} in {root}")
                        zip_ref.extractall(extract_path)
                    
                    # Optional: Remove the zip file after extracting to save space
                    # os.remove(zip_path) 
                except Exception as e:
                    print(f"Failed to unzip {file}: {e}")

# Run the function
extract_nested_zips(base_path)
print("--- Extraction Finished ---")

Extracting: 18072024142824.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\Centaur Coaches_421
Extracting: Clarkes_of_London_2023-11-18_13-22_1.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\Clarkes of London_422
Extracting: 18947_106818_2026-01-08_16-01-56.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\Edward Thomas_326
Extracting: 17058_108571_2026-02-04_16-02-25_current.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\Ensign Bus Co Ltd_18
Extracting: 21332_108435_2026-02-02_16-01-39_current.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\Falcon Buses_237
Extracting: current_CqGEiDo.zip in F:\SOFTWARICA\big-data-transport-analytics\data\raw\bodds_archive_20260206_Hi4S9OS\First Bus London_238
Extracting: 2282_107652_2026-01-20_16-05-37_RRAR.zip in F:\SOFTWARICA\big-data-transpo

In [4]:
# ============================================================
# CELL 3: Parse Ensign Bus TransXChange Timetable XMLs
# ============================================================
import xml.etree.ElementTree as ET
import pandas as pd
import os
from datetime import timedelta
import re

# TransXChange namespace
NS = {'txc': 'http://www.transxchange.org.uk/'}

ensign_path = os.path.join(base_path, 'Ensign Bus Co Ltd_18')
xml_files = [f for f in os.listdir(ensign_path) if f.endswith('.xml')]
print(f"Found {len(xml_files)} XML files in Ensign Bus folder")
print(f"Sample files: {xml_files[:5]}")

Found 177 XML files in Ensign Bus folder
Sample files: ['ENSB_22E_ENSBPF00019613722_20260105_20260211_2235426.xml', 'ENSB_22E_ENSBPF00019613722_20260108_20260212_2235427.xml', 'ENSB_22E_ENSBPF00019613722_20260109_20260213_2235428.xml', 'ENSB_22E_ENSBPF00019613722_20260216_20260220_2295873.xml', 'ENSB_22E_ENSBPF00019613722_20260223_20260325_2295895.xml']


In [5]:
# ============================================================
# CELL 4: Helper functions to parse TransXChange XML
# ============================================================

def parse_iso_duration(duration_str):
    """Convert ISO 8601 duration (e.g., PT2M, PT1H30M) to total minutes."""
    if not duration_str:
        return 0
    match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration_str)
    if not match:
        return 0
    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    seconds = int(match.group(3) or 0)
    return hours * 60 + minutes + seconds / 60

def parse_transxchange(filepath):
    """Parse a single TransXChange XML file and return structured data."""
    tree = ET.parse(filepath)
    root = tree.getroot()
    filename = os.path.basename(filepath)
    
    # --- 1. Stop Points ---
    stops = {}
    for sp in root.findall('.//txc:StopPoints/txc:AnnotatedStopPointRef', NS):
        ref = sp.findtext('txc:StopPointRef', '', NS)
        stops[ref] = {
            'stop_ref': ref,
            'stop_name': sp.findtext('txc:CommonName', '', NS).strip(),
            'longitude': float(sp.findtext('txc:Location/txc:Longitude', '0', NS)),
            'latitude': float(sp.findtext('txc:Location/txc:Latitude', '0', NS)),
        }
    
    # --- 2. Operators ---
    operators = {}
    for op in root.findall('.//txc:Operators/txc:Operator', NS):
        op_id = op.get('id')
        operators[op_id] = {
            'national_operator_code': op.findtext('txc:NationalOperatorCode', '', NS),
            'operator_short_name': op.findtext('txc:OperatorShortName', '', NS),
        }
    
    # --- 3. Services (Lines, Operating Period, Days of Week) ---
    services = {}
    journey_patterns = {}
    for svc in root.findall('.//txc:Services/txc:Service', NS):
        svc_code = svc.findtext('txc:ServiceCode', '', NS)
        
        # Lines
        line_el = svc.find('txc:Lines/txc:Line', NS)
        line_name = line_el.findtext('txc:LineName', '', NS) if line_el is not None else ''
        line_id = line_el.get('id', '') if line_el is not None else ''
        
        # Operating Period
        start_date = svc.findtext('txc:OperatingPeriod/txc:StartDate', '', NS)
        end_date = svc.findtext('txc:OperatingPeriod/txc:EndDate', '', NS)
        
        # Days of Week
        days_el = svc.find('.//txc:OperatingProfile/txc:RegularDayType/txc:DaysOfWeek', NS)
        days_of_week = []
        if days_el is not None:
            for day in days_el:
                day_tag = day.tag.replace('{http://www.transxchange.org.uk/}', '')
                days_of_week.append(day_tag)
        
        # Origin/Destination
        origin = svc.findtext('.//txc:StandardService/txc:Origin', '', NS)
        destination = svc.findtext('.//txc:StandardService/txc:Destination', '', NS)
        op_ref = svc.findtext('txc:RegisteredOperatorRef', '', NS)
        
        services[svc_code] = {
            'service_code': svc_code,
            'line_name': line_name,
            'line_id': line_id,
            'start_date': start_date,
            'end_date': end_date,
            'days_of_week': ', '.join(days_of_week),
            'origin': origin,
            'destination': destination,
            'operator_ref': op_ref,
        }
        
        # Journey Patterns (direction, destination display, section ref)
        for jp in svc.findall('.//txc:StandardService/txc:JourneyPattern', NS):
            jp_id = jp.get('id')
            journey_patterns[jp_id] = {
                'direction': jp.findtext('txc:Direction', '', NS),
                'destination_display': jp.findtext('txc:DestinationDisplay', '', NS),
                'journey_pattern_section_ref': jp.findtext('txc:JourneyPatternSectionRefs', '', NS),
            }
    
    # --- 4. JourneyPatternSections (stop sequences + timings) ---
    jp_sections = {}
    for jps in root.findall('.//txc:JourneyPatternSections/txc:JourneyPatternSection', NS):
        jps_id = jps.get('id')
        timing_links = []
        for tl in jps.findall('txc:JourneyPatternTimingLink', NS):
            from_el = tl.find('txc:From', NS)
            to_el = tl.find('txc:To', NS)
            timing_links.append({
                'from_seq': int(from_el.get('SequenceNumber', 0)) if from_el is not None else 0,
                'from_stop': from_el.findtext('txc:StopPointRef', '', NS) if from_el is not None else '',
                'from_activity': from_el.findtext('txc:Activity', 'pickUpAndSetDown', NS) if from_el is not None else '',
                'from_wait_time': parse_iso_duration(from_el.findtext('txc:WaitTime', '', NS)) if from_el is not None else 0,
                'to_seq': int(to_el.get('SequenceNumber', 0)) if to_el is not None else 0,
                'to_stop': to_el.findtext('txc:StopPointRef', '', NS) if to_el is not None else '',
                'run_time': parse_iso_duration(tl.findtext('txc:RunTime', '', NS)),
            })
        jp_sections[jps_id] = timing_links
    
    # --- 5. Vehicle Journeys ---
    journeys = []
    for vj in root.findall('.//txc:VehicleJourneys/txc:VehicleJourney', NS):
        vj_code = vj.findtext('txc:VehicleJourneyCode', '', NS)
        dep_time = vj.findtext('txc:DepartureTime', '', NS)
        svc_ref = vj.findtext('txc:ServiceRef', '', NS)
        line_ref = vj.findtext('txc:LineRef', '', NS)
        jp_ref = vj.findtext('txc:JourneyPatternRef', '', NS)
        op_ref = vj.findtext('txc:OperatorRef', '', NS)
        journey_code = vj.findtext('.//txc:Operational/txc:TicketMachine/txc:JourneyCode', '', NS)
        
        # Resolve references
        svc_info = services.get(svc_ref, {})
        op_info = operators.get(op_ref, {})
        jp_info = journey_patterns.get(jp_ref, {})
        
        journeys.append({
            'file_name': filename,
            'vehicle_journey_code': vj_code,
            'journey_code': journey_code,
            'departure_time': dep_time,
            'service_code': svc_ref,
            'line_name': svc_info.get('line_name', ''),
            'direction': jp_info.get('direction', ''),
            'destination_display': jp_info.get('destination_display', ''),
            'origin': svc_info.get('origin', ''),
            'destination': svc_info.get('destination', ''),
            'start_date': svc_info.get('start_date', ''),
            'end_date': svc_info.get('end_date', ''),
            'days_of_week': svc_info.get('days_of_week', ''),
            'operator_code': op_info.get('national_operator_code', ''),
            'operator_name': op_info.get('operator_short_name', ''),
            'journey_pattern_ref': jp_ref,
            'journey_pattern_section_ref': jp_info.get('journey_pattern_section_ref', ''),
        })
    
    return stops, journeys, jp_sections, journey_patterns

print("Helper functions defined successfully!")

Helper functions defined successfully!


In [6]:
# ============================================================
# CELL 5: Parse ALL Ensign Bus XML files & build DataFrames
# ============================================================

all_stops = {}
all_journeys = []
all_jp_sections = {}
all_journey_patterns = {}

for xml_file in xml_files:
    filepath = os.path.join(ensign_path, xml_file)
    try:
        stops, journeys, jp_sections, journey_patterns = parse_transxchange(filepath)
        all_stops.update(stops)
        all_journeys.extend(journeys)
        all_jp_sections.update(jp_sections)
        all_journey_patterns.update(journey_patterns)
        print(f"  Parsed: {xml_file} -> {len(journeys)} journeys")
    except Exception as e:
        print(f"  FAILED: {xml_file} -> {e}")

print(f"\n--- Parsing Complete ---")
print(f"Total unique stops: {len(all_stops)}")
print(f"Total vehicle journeys: {len(all_journeys)}")
print(f"Total journey pattern sections: {len(all_jp_sections)}")

  Parsed: ENSB_22E_ENSBPF00019613722_20260105_20260211_2235426.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260108_20260212_2235427.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260109_20260213_2235428.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260216_20260220_2295873.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260223_20260325_2295895.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260226_20260326_2295896.xml -> 2 journeys
  Parsed: ENSB_22E_ENSBPF00019613722_20260227_20260327_2295897.xml -> 2 journeys
  Parsed: ENSB_22P_ENSBPF00019613722_20260105_20260211_2235429.xml -> 21 journeys
  Parsed: ENSB_22P_ENSBPF00019613722_20260108_20260212_2235430.xml -> 21 journeys
  Parsed: ENSB_22P_ENSBPF00019613722_20260109_20260213_2235431.xml -> 21 journeys
  Parsed: ENSB_22P_ENSBPF00019613722_20260216_20260220_2295876.xml -> 21 journeys
  Parsed: ENSB_22P_ENSBPF00019613722_20260223_20260325_2295899.xml -> 21 journeys
  Parsed: ENSB_22P_ENSB

In [7]:
# ============================================================
# CELL 6: Build flattened timetable with stop-level detail
# ============================================================

# Build a detailed timetable: for each journey, expand all stops with calculated arrival times
timetable_rows = []

for journey in all_journeys:
    jp_section_ref = journey.get('journey_pattern_section_ref', '')
    timing_links = all_jp_sections.get(jp_section_ref, [])
    dep_time_str = journey.get('departure_time', '00:00:00')
    
    # Parse departure time to minutes from midnight
    parts = dep_time_str.split(':')
    if len(parts) >= 2:
        dep_minutes = int(parts[0]) * 60 + int(parts[1])
        if len(parts) == 3:
            dep_minutes += int(parts[2]) / 60
    else:
        dep_minutes = 0
    
    cumulative_time = 0  # minutes from first departure
    
    for i, tl in enumerate(timing_links):
        from_stop_ref = tl['from_stop']
        to_stop_ref = tl['to_stop']
        from_stop_info = all_stops.get(from_stop_ref, {})
        to_stop_info = all_stops.get(to_stop_ref, {})
        
        # First stop in journey
        if i == 0:
            arrival_minutes = dep_minutes + cumulative_time
            hrs, mins = divmod(int(arrival_minutes), 60)
            timetable_rows.append({
                **{k: journey[k] for k in [
                    'file_name', 'vehicle_journey_code', 'journey_code',
                    'departure_time', 'service_code', 'line_name', 'direction',
                    'origin', 'destination', 'start_date', 'end_date',
                    'days_of_week', 'operator_code', 'operator_name'
                ]},
                'stop_sequence': tl['from_seq'],
                'stop_ref': from_stop_ref,
                'stop_name': from_stop_info.get('stop_name', ''),
                'longitude': from_stop_info.get('longitude', 0),
                'latitude': from_stop_info.get('latitude', 0),
                'scheduled_arrival': f"{hrs:02d}:{mins:02d}:00",
                'wait_time_min': tl['from_wait_time'],
                'run_time_min': tl['run_time'],
            })
            cumulative_time += tl['from_wait_time']
        
        # Add run time to get to the next stop
        cumulative_time += tl['run_time']
        arrival_minutes = dep_minutes + cumulative_time
        hrs, mins = divmod(int(arrival_minutes), 60)
        
        timetable_rows.append({
            **{k: journey[k] for k in [
                'file_name', 'vehicle_journey_code', 'journey_code',
                'departure_time', 'service_code', 'line_name', 'direction',
                'origin', 'destination', 'start_date', 'end_date',
                'days_of_week', 'operator_code', 'operator_name'
            ]},
            'stop_sequence': tl['to_seq'],
            'stop_ref': to_stop_ref,
            'stop_name': to_stop_info.get('stop_name', ''),
            'longitude': to_stop_info.get('longitude', 0),
            'latitude': to_stop_info.get('latitude', 0),
            'scheduled_arrival': f"{hrs:02d}:{mins:02d}:00",
            'wait_time_min': 0,
            'run_time_min': tl['run_time'],
        })

timetable_df = pd.DataFrame(timetable_rows)
print(f"Timetable DataFrame shape: {timetable_df.shape}")
print(f"\nColumns: {list(timetable_df.columns)}")
timetable_df.head(10)

Timetable DataFrame shape: (34922, 22)

Columns: ['file_name', 'vehicle_journey_code', 'journey_code', 'departure_time', 'service_code', 'line_name', 'direction', 'origin', 'destination', 'start_date', 'end_date', 'days_of_week', 'operator_code', 'operator_name', 'stop_sequence', 'stop_ref', 'stop_name', 'longitude', 'latitude', 'scheduled_arrival', 'wait_time_min', 'run_time_min']


Unnamed: 0,file_name,vehicle_journey_code,journey_code,departure_time,service_code,line_name,direction,origin,destination,start_date,...,operator_code,operator_name,stop_sequence,stop_ref,stop_name,longitude,latitude,scheduled_arrival,wait_time_min,run_time_min
0,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,1,1590014001,Chafford Hundred Station,0.287746,51.485761,17:43:00,0,1.0
1,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,2,1590050901,Fleming Road,0.288737,51.487744,17:44:00,0,1.0
2,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,3,1590060113,"Lakeside Bus Station, N",0.282997,51.490458,17:48:00,0,4.0
3,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,4,1590012401,Tesco,0.273749,51.48782,17:50:00,0,2.0
4,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,5,2400101304,Galleon Boulevard,0.256627,51.455895,18:00:00,0,10.0
5,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,6,2400A009090A,Stone Crossing,0.265888,51.452233,18:01:00,0,1.0
6,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,7,2400A060760R,Asda,0.277592,51.451388,18:03:00,0,2.0
7,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,8,240098192,Greenhithe Station,0.280411,51.450767,18:03:00,0,0.0
8,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,9,2400107374,St Clements Lakes,0.281866,51.445322,18:04:00,0,1.0
9,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,ENSB,Ensignbus,10,2400A070110A,Bluewater Bus Station,0.27532,51.43772,18:08:00,0,4.0


In [8]:
# ============================================================
# CELL 7: Quick summary of timetable data
# ============================================================

print("=" * 60)
print("ENSIGN BUS TIMETABLE DATA SUMMARY")
print("=" * 60)
print(f"Total rows (journey-stop combinations): {len(timetable_df)}")
print(f"Unique vehicle journeys: {timetable_df['vehicle_journey_code'].nunique()}")
print(f"Unique bus lines: {timetable_df['line_name'].unique()}")
print(f"Unique stops: {timetable_df['stop_ref'].nunique()}")
print(f"Unique service codes: {timetable_df['service_code'].nunique()}")
print(f"Date range: {timetable_df['start_date'].min()} to {timetable_df['end_date'].max()}")
print(f"Directions: {timetable_df['direction'].unique()}")
print(f"\nDays of week breakdown:")
print(timetable_df['days_of_week'].value_counts())
print(f"\nLine breakdown:")
print(timetable_df.groupby('line_name')['vehicle_journey_code'].nunique().rename('unique_journeys'))
print(f"\nData types:\n{timetable_df.dtypes}")

ENSIGN BUS TIMETABLE DATA SUMMARY
Total rows (journey-stop combinations): 34922
Unique vehicle journeys: 79
Unique bus lines: ['22' '33' '44' '73' '83' '88' '99OT' '99' 'x1' 'x2' 'x32' 'x80']
Unique stops: 107
Unique service codes: 12
Date range: 2025-01-04 to 2026-03-27
Directions: ['outbound' 'inbound' 'clockwise']

Days of week breakdown:
days_of_week
Friday                                                            9094
Thursday                                                          8482
Monday, Tuesday, Wednesday                                        8462
Monday, Tuesday, Wednesday, Thursday, Friday                      5054
Saturday                                                          2504
Sunday                                                            1042
Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday     284
Name: count, dtype: int64

Line breakdown:
line_name
22      79
33      41
44      66
73      59
83      61
88      28
99      30
99OT    36
x1   

In [9]:
# ============================================================
# CELL 8: Parse SIRI-SX Disruptions Data
# ============================================================

SIRI_NS = {'siri': 'http://www.siri.org.uk/siri'}

sirisx_path = r'F:\SOFTWARICA\big-data-transport-analytics\data\raw\sirisx_2026-02-06_145256\sirisx.xml'

print("Parsing SIRI-SX disruptions XML (this may take a moment, ~72k lines)...")

tree = ET.parse(sirisx_path)
root = tree.getroot()

disruption_rows = []

for sit in root.findall('.//siri:PtSituationElement', SIRI_NS):
    # Base disruption info
    situation_number = sit.findtext('siri:SituationNumber', '', SIRI_NS)
    creation_time = sit.findtext('siri:CreationTime', '', SIRI_NS)
    participant_ref = sit.findtext('siri:ParticipantRef', '', SIRI_NS)
    progress = sit.findtext('siri:Progress', '', SIRI_NS)
    reason = sit.findtext('siri:MiscellaneousReason', '', SIRI_NS)
    if not reason:
        reason = sit.findtext('siri:EnvironmentReason', '', SIRI_NS)
    if not reason:
        reason = sit.findtext('siri:EquipmentReason', '', SIRI_NS)
    if not reason:
        reason = sit.findtext('siri:PersonnelReason', '', SIRI_NS)
    planned = sit.findtext('siri:Planned', '', SIRI_NS)
    summary = sit.findtext('siri:Summary', '', SIRI_NS)
    description = sit.findtext('siri:Description', '', SIRI_NS)
    
    # Validity period
    validity_start = sit.findtext('.//siri:ValidityPeriod/siri:StartTime', '', SIRI_NS)
    validity_end = sit.findtext('.//siri:ValidityPeriod/siri:EndTime', '', SIRI_NS)
    
    # Process each Consequence
    for conseq in sit.findall('.//siri:Consequences/siri:Consequence', SIRI_NS):
        severity = conseq.findtext('siri:Severity', '', SIRI_NS)
        condition = conseq.findtext('siri:Condition', '', SIRI_NS)
        advice = conseq.findtext('.//siri:Advice/siri:Details', '', SIRI_NS)
        
        # Get affected operators and lines
        affected_operators = []
        affected_lines = []
        for network in conseq.findall('.//siri:AffectedNetwork', SIRI_NS):
            vehicle_mode = network.findtext('siri:VehicleMode', '', SIRI_NS)
            for aline in network.findall('siri:AffectedLine', SIRI_NS):
                op_ref = aline.findtext('.//siri:OperatorRef', '', SIRI_NS)
                op_name = aline.findtext('.//siri:OperatorName', '', SIRI_NS)
                line_ref = aline.findtext('siri:LineRef', '', SIRI_NS)
                line_name = aline.findtext('siri:PublishedLineName', '', SIRI_NS)
                affected_operators.append(f"{op_ref}:{op_name}")
                affected_lines.append(f"{line_ref}:{line_name}")
        
        # Get affected stop points
        affected_stops = []
        for sp in conseq.findall('.//siri:AffectedStopPoint', SIRI_NS):
            sp_ref = sp.findtext('siri:StopPointRef', '', SIRI_NS)
            sp_name = sp.findtext('siri:StopPointName', '', SIRI_NS)
            affected_stops.append(f"{sp_ref}:{sp_name}")
        
        # Get affected places
        affected_places = []
        for place in conseq.findall('.//siri:AffectedPlace', SIRI_NS):
            place_name = place.findtext('siri:PlaceName', '', SIRI_NS)
            if place_name:
                affected_places.append(place_name)
        
        disruption_rows.append({
            'situation_number': situation_number,
            'creation_time': creation_time,
            'participant_ref': participant_ref,
            'progress': progress,
            'reason': reason,
            'planned': planned,
            'summary': summary,
            'description': description,
            'validity_start': validity_start,
            'validity_end': validity_end,
            'severity': severity,
            'condition': condition,
            'advice': advice,
            'vehicle_mode': vehicle_mode if 'vehicle_mode' in dir() else '',
            'affected_operators': ' | '.join(affected_operators) if affected_operators else '',
            'affected_lines': ' | '.join(affected_lines) if affected_lines else '',
            'affected_stops': ' | '.join(affected_stops) if affected_stops else '',
            'affected_places': ' | '.join(affected_places) if affected_places else '',
        })

disruptions_df = pd.DataFrame(disruption_rows)
print(f"\nDisruptions DataFrame shape: {disruptions_df.shape}")
print(f"Columns: {list(disruptions_df.columns)}")
disruptions_df.head(5)

Parsing SIRI-SX disruptions XML (this may take a moment, ~72k lines)...

Disruptions DataFrame shape: (428, 18)
Columns: ['situation_number', 'creation_time', 'participant_ref', 'progress', 'reason', 'planned', 'summary', 'description', 'validity_start', 'validity_end', 'severity', 'condition', 'advice', 'vehicle_mode', 'affected_operators', 'affected_lines', 'affected_stops', 'affected_places']


Unnamed: 0,situation_number,creation_time,participant_ref,progress,reason,planned,summary,description,validity_start,validity_end,severity,condition,advice,vehicle_mode,affected_operators,affected_lines,affected_stops,affected_places
0,18224249-29cf-4f65-a023-11067e7c2b6f,2024-08-30T09:07:04.813Z,WestofEngland,open,roadworks,True,Live Traffic Update: York Road One Way Closure,Long term works - York Road is closed eastboun...,2024-09-01T08:00:00.000Z,,normal,unknown,York Road closed between St Luke's Road to the...,bus,,,,Bristol
1,2b9e1d8f-b0ee-43a7-8ca5-7334d6fc4587,2024-11-01T15:42:15.905Z,WYCA,open,roadworks,True,"Horsforth Vale, Bletchley Avenue, Bletchley Ro...","Bletchley Avenue, Bletchley Road and Low Hall ...",2024-11-04T08:30:00.000Z,,slight,unknown,"Service 9 at 0838, 1553, 1658 & 1733 towards W...",bus,YSQU:Squarepeg,9:9,450032047:Bletchley Avenue | 450032046:Bletchl...,
2,75e92a90-d4f1-4124-9cf4-633abacf8b81,2025-01-10T11:54:42.620Z,WYCA,open,roadClosed,True,"King Cross, King Cross Road (Calderdale)",Due to new road layout on King Cross Road serv...,2025-01-10T11:50:00.000Z,,slight,unknown,"First services 579, 586, 590, 591 & 592 are no...",bus,FHUD:FIRST WEST YORKSHIRE LTD | FHUD:FIRST WES...,579:579 | 586:586 | 587:587 | 590:590 | 591:59...,450023283:King Cross,
3,ba174836-2698-4f06-8f22-e6c90aca9a7d,2025-03-06T09:16:59Z,WestofEngland,open,roadworks,False,Tower Road/Station Road,Due to Temporary Lights on the Junction of Tow...,2025-03-06T09:16:00.000Z,,normal,unknown,Due to Temporary Lights on the Junction of Tow...,bus,FBRI:First Bristol Limited,43:43,0170SGB20128:Baden Road,
4,8e8d2259-4d28-490b-9785-c08b0fd034bd,2025-03-28T11:17:43Z,WestofEngland,open,roadworks,True,"Road Closure: Victoria Street, Bristol","Victoria Street in Bristol City Centre, northb...",2025-03-30T23:01:00.000Z,,normal,unknown,Temple Meads T7 will not be served during this...,bus,FBRI:First Bristol Limited | FBRI:First Bristo...,1:1 | 2:2 | 2a:2a | 39:39 | 72:72 | 172:172 | ...,0100BRP90311:Temple Meads Stn,


In [10]:
# ============================================================
# CELL 9: SIRI-SX Disruptions Summary
# ============================================================

print("=" * 60)
print("SIRI-SX DISRUPTIONS DATA SUMMARY")
print("=" * 60)
print(f"Total disruption records: {len(disruptions_df)}")
print(f"Unique situations: {disruptions_df['situation_number'].nunique()}")
print(f"\nDisruption reasons:")
print(disruptions_df['reason'].value_counts())
print(f"\nSeverity levels:")
print(disruptions_df['severity'].value_counts())
print(f"\nPlanned vs Unplanned:")
print(disruptions_df['planned'].value_counts())
print(f"\nProgress status:")
print(disruptions_df['progress'].value_counts())
print(f"\nTop 10 affected places:")
# Explode the places to see distribution
places = disruptions_df['affected_places'].str.split(' | ').explode().dropna()
places = places[places != '']
print(places.value_counts().head(10))

SIRI-SX DISRUPTIONS DATA SUMMARY
Total disruption records: 428
Unique situations: 349

Disruption reasons:
reason
roadworks                   212
maintenanceWork             116
roadClosed                   50
routeDiversion               11
specialEvent                 10
incident                      7
unknown                       6
repairWork                    5
liftFailure                   3
emergencyEngineeringWork      2
insufficientDemand            2
vandalism                     2
constructionWork              1
escalatorFailure              1
Name: count, dtype: int64

Severity levels:
severity
normal        217
slight        134
unknown        46
severe         18
verySevere     10
verySlight      3
Name: count, dtype: int64

Planned vs Unplanned:
planned
true     354
false     74
Name: count, dtype: int64

Progress status:
progress
open    428
Name: count, dtype: int64

Top 10 affected places:
affected_places
South        5
Yorkshire    5
Bristol      1
Name: count, dtyp

In [11]:
# ============================================================
# CELL 10: Check if Timetable & Disruption data can be merged
# ============================================================

print("=" * 60)
print("MERGE FEASIBILITY CHECK")
print("=" * 60)

# --- Check 1: Operator match ---
# Ensign Bus operator code from timetable
timetable_operators = set(timetable_df['operator_code'].unique())
print(f"\nTimetable operator codes: {timetable_operators}")

# Check if ENSB appears in disruptions
ensb_disruptions = disruptions_df[
    disruptions_df['affected_operators'].str.contains('ENSB', case=False, na=False)
]
print(f"Disruptions mentioning ENSB (Ensign Bus): {len(ensb_disruptions)}")

# --- Check 2: Line name match ---
timetable_lines = set(timetable_df['line_name'].unique())
print(f"\nTimetable line names: {timetable_lines}")

# Extract line names from disruptions
all_disruption_lines = set()
for lines_str in disruptions_df['affected_lines'].dropna():
    for line in lines_str.split(' | '):
        parts = line.split(':')
        if len(parts) >= 2:
            all_disruption_lines.add(parts[1])

matching_lines = timetable_lines & all_disruption_lines
print(f"Matching line names: {matching_lines if matching_lines else 'NONE'}")

# --- Check 3: Stop reference match ---
timetable_stops = set(timetable_df['stop_ref'].unique())
disruption_stops = set()
for stops_str in disruptions_df['affected_stops'].dropna():
    for stop in stops_str.split(' | '):
        parts = stop.split(':')
        if parts[0]:
            disruption_stops.add(parts[0])

matching_stops = timetable_stops & disruption_stops
print(f"\nTimetable unique stops: {len(timetable_stops)}")
print(f"Disruption unique stops: {len(disruption_stops)}")
print(f"Matching stop refs: {len(matching_stops)}")
if matching_stops:
    print(f"Matched stops: {list(matching_stops)[:20]}")

# --- Decision ---
can_merge = len(ensb_disruptions) > 0 or len(matching_stops) > 0 or len(matching_lines) > 0
print(f"\n{'='*60}")
if can_merge:
    print("RESULT: Data CAN be merged! Common fields found.")
    if len(matching_stops) > 0:
        print(f"  -> Merge via stop_ref ({len(matching_stops)} matching stops)")
    if len(ensb_disruptions) > 0:
        print(f"  -> Merge via operator_code ({len(ensb_disruptions)} ENSB disruptions)")
    if len(matching_lines) > 0:
        print(f"  -> Merge via line_name ({len(matching_lines)} matching lines)")
else:
    print("RESULT: No direct overlap found between Ensign Bus timetable and SIRI-SX disruptions.")
    print("  The disruptions data covers different operators/regions.")
    print("  Both datasets will be saved separately as CSVs.")

MERGE FEASIBILITY CHECK

Timetable operator codes: {'ENSB'}
Disruptions mentioning ENSB (Ensign Bus): 0

Timetable line names: {'22', '44', 'x32', '73', 'x1', 'x2', '83', 'x80', '33', '99OT', '88', '99'}
Matching line names: {'22', '44', '73', '83', '33', '88', '99'}

Timetable unique stops: 107
Disruption unique stops: 2663
Matching stop refs: 0

RESULT: Data CAN be merged! Common fields found.
  -> Merge via line_name (7 matching lines)


In [12]:
# ============================================================
# CELL 11: Merge if possible, otherwise keep separate
# ============================================================

if can_merge and len(matching_stops) > 0:
    # Explode disruptions by affected stops for a stop-level merge
    disrupt_exploded = disruptions_df.copy()
    disrupt_exploded['affected_stops_list'] = disrupt_exploded['affected_stops'].str.split(' | ')
    disrupt_exploded = disrupt_exploded.explode('affected_stops_list')
    disrupt_exploded['disrupt_stop_ref'] = disrupt_exploded['affected_stops_list'].str.split(':').str[0]
    
    # Merge on stop_ref
    merged_df = timetable_df.merge(
        disrupt_exploded[['situation_number', 'reason', 'planned', 'severity',
                          'summary', 'description', 'validity_start', 'validity_end',
                          'disrupt_stop_ref']],
        left_on='stop_ref',
        right_on='disrupt_stop_ref',
        how='left',
        suffixes=('', '_disruption')
    )
    merged_df['has_disruption'] = merged_df['situation_number'].notna()
    
    print(f"Merged DataFrame shape: {merged_df.shape}")
    print(f"Journeys with disruptions: {merged_df['has_disruption'].sum()}")
    print(f"Journeys without disruptions: {(~merged_df['has_disruption']).sum()}")
    
    # Save merged
    output_dir = r'F:\SOFTWARICA\big-data-transport-analytics\outputs'
    os.makedirs(output_dir, exist_ok=True)
    merged_df.to_csv(os.path.join(output_dir, 'ensign_timetable_with_disruptions.csv'), index=False)
    print(f"\nSaved: outputs/ensign_timetable_with_disruptions.csv")
    merged_df.head()
else:
    print("No direct merge possible. Saving datasets separately.")
    print("Both CSVs will be exported in the next cell.")

No direct merge possible. Saving datasets separately.
Both CSVs will be exported in the next cell.


In [13]:
# ============================================================
# CELL 12: Export all DataFrames to CSV
# ============================================================

output_dir = r'F:\SOFTWARICA\big-data-transport-analytics\data\processed'
os.makedirs(output_dir, exist_ok=True)

# 1. Timetable CSV
timetable_csv = os.path.join(output_dir, 'ensign_bus_timetable.csv')
timetable_df.to_csv(timetable_csv, index=False)
print(f"Saved: ensign_bus_timetable.csv ({timetable_df.shape[0]} rows x {timetable_df.shape[1]} cols)")

# 2. Stops CSV
stops_df = pd.DataFrame(all_stops.values())
stops_csv = os.path.join(output_dir, 'ensign_bus_stops.csv')
stops_df.to_csv(stops_csv, index=False)
print(f"Saved: ensign_bus_stops.csv ({stops_df.shape[0]} rows x {stops_df.shape[1]} cols)")

# 3. Disruptions CSV
disruptions_csv = os.path.join(output_dir, 'sirisx_disruptions.csv')
disruptions_df.to_csv(disruptions_csv, index=False)
print(f"Saved: sirisx_disruptions.csv ({disruptions_df.shape[0]} rows x {disruptions_df.shape[1]} cols)")

# 4. Vehicle Journeys summary CSV (one row per journey, without stop expansion)
journeys_df = pd.DataFrame(all_journeys)
journeys_csv = os.path.join(output_dir, 'ensign_bus_journeys.csv')
journeys_df.to_csv(journeys_csv, index=False)
print(f"Saved: ensign_bus_journeys.csv ({journeys_df.shape[0]} rows x {journeys_df.shape[1]} cols)")

print(f"\nAll CSVs saved to: {output_dir}")
print("\nFile sizes:")
for f in os.listdir(output_dir):
    if f.endswith('.csv'):
        size = os.path.getsize(os.path.join(output_dir, f))
        print(f"  {f}: {size/1024:.1f} KB ({size/1024/1024:.2f} MB)")

Saved: ensign_bus_timetable.csv (34922 rows x 22 cols)
Saved: ensign_bus_stops.csv (505 rows x 4 cols)
Saved: sirisx_disruptions.csv (428 rows x 18 cols)
Saved: ensign_bus_journeys.csv (3986 rows x 17 cols)

All CSVs saved to: F:\SOFTWARICA\big-data-transport-analytics\data\processed

File sizes:
  ensign_bus_journeys.csv: 839.5 KB (0.82 MB)
  ensign_bus_stops.csv: 22.7 KB (0.02 MB)
  ensign_bus_timetable.csv: 8648.1 KB (8.45 MB)
  sirisx_disruptions.csv: 351.6 KB (0.34 MB)


In [14]:
# ============================================================
# CELL 13: Final data preview - Timetable & Disruptions side by side
# ============================================================

print("=" * 80)
print("TIMETABLE DATA PREVIEW (first 5 rows)")
print("=" * 80)
display_cols_tt = ['line_name', 'direction', 'vehicle_journey_code', 'departure_time',
                   'stop_sequence', 'stop_name', 'scheduled_arrival', 'days_of_week',
                   'operator_name', 'start_date', 'end_date']
print(timetable_df[display_cols_tt].head(5).to_string(index=False))

print(f"\n{'=' * 80}")
print("DISRUPTIONS DATA PREVIEW (first 5 rows)")
print("=" * 80)
display_cols_dis = ['situation_number', 'reason', 'severity', 'planned', 'summary',
                    'affected_operators', 'affected_lines']
print(disruptions_df[display_cols_dis].head(5).to_string(index=False))

print(f"\n{'=' * 80}")
print("DATA READY FOR ANALYSIS!")
print("=" * 80)
print(f"  Timetable: {timetable_df.shape[0]:,} rows | {timetable_df['line_name'].nunique()} lines | {timetable_df['stop_ref'].nunique()} stops")
print(f"  Disruptions: {disruptions_df.shape[0]:,} records | {disruptions_df['reason'].nunique()} reason types")
print(f"\n  CSVs available in: outputs/")

TIMETABLE DATA PREVIEW (first 5 rows)
line_name direction vehicle_journey_code departure_time  stop_sequence                stop_name scheduled_arrival               days_of_week operator_name start_date   end_date
       22  outbound                 vj_1       17:43:00              1 Chafford Hundred Station          17:43:00 Monday, Tuesday, Wednesday     Ensignbus 2026-01-05 2026-02-11
       22  outbound                 vj_1       17:43:00              2             Fleming Road          17:44:00 Monday, Tuesday, Wednesday     Ensignbus 2026-01-05 2026-02-11
       22  outbound                 vj_1       17:43:00              3  Lakeside Bus Station, N          17:48:00 Monday, Tuesday, Wednesday     Ensignbus 2026-01-05 2026-02-11
       22  outbound                 vj_1       17:43:00              4                    Tesco          17:50:00 Monday, Tuesday, Wednesday     Ensignbus 2026-01-05 2026-02-11
       22  outbound                 vj_1       17:43:00              5       

In [15]:
# ============================================================
# CELL 14: Deep-dive - Can we actually merge these datasets?
# ============================================================
import numpy as np

print("=" * 70)
print("DEEP ANALYSIS: Finding a viable merge strategy")
print("=" * 70)

# --- Ensign Bus operating area (from timetable stops) ---
tt_lats = timetable_df['latitude'].values
tt_lons = timetable_df['longitude'].values
print(f"\nEnsign Bus operating area:")
print(f"  Latitude range:  {tt_lats.min():.4f} to {tt_lats.max():.4f}")
print(f"  Longitude range: {tt_lons.min():.4f} to {tt_lons.max():.4f}")
print(f"  (Thurrock / South Essex area)")

# --- Extract lat/lon from disruption affected stops ---
disrupt_stop_coords = []
for _, row in disruptions_df.iterrows():
    if pd.isna(row['affected_stops']) or row['affected_stops'] == '':
        continue
    for stop in row['affected_stops'].split(' | '):
        parts = stop.split(':')
        if len(parts) >= 2:
            disrupt_stop_coords.append({
                'situation_number': row['situation_number'],
                'stop_ref': parts[0],
                'stop_name': parts[1] if len(parts) > 1 else '',
            })

# We need to re-parse SIRI-SX to get stop coordinates from disruptions
print(f"\n--- Re-parsing SIRI-SX for stop coordinates ---")
disrupt_coords = []
for sit in root.findall('.//siri:PtSituationElement', SIRI_NS):
    sit_id = sit.findtext('siri:SituationNumber', '', SIRI_NS)
    for sp in sit.findall('.//siri:AffectedStopPoint', SIRI_NS):
        sp_ref = sp.findtext('siri:StopPointRef', '', SIRI_NS)
        sp_name = sp.findtext('siri:StopPointName', '', SIRI_NS)
        lon_el = sp.findtext('siri:Location/siri:Longitude', '', SIRI_NS)
        lat_el = sp.findtext('siri:Location/siri:Latitude', '', SIRI_NS)
        if lon_el and lat_el:
            disrupt_coords.append({
                'situation_number': sit_id,
                'disrupt_stop_ref': sp_ref,
                'disrupt_stop_name': sp_name or '',
                'disrupt_lon': float(lon_el),
                'disrupt_lat': float(lat_el),
            })

disrupt_coords_df = pd.DataFrame(disrupt_coords)
print(f"Disruption stops with coordinates: {len(disrupt_coords_df)}")

# --- Geographic proximity check ---
# Haversine approximation: at ~51.5°N, 1 degree lat ≈ 111 km, 1 degree lon ≈ 69 km
# Use a bounding box: Ensign Bus area ± 0.15 degrees (~15-17 km buffer)
LAT_BUFFER = 0.15
LON_BUFFER = 0.25

lat_min, lat_max = tt_lats.min() - LAT_BUFFER, tt_lats.max() + LAT_BUFFER
lon_min, lon_max = tt_lons.min() - LON_BUFFER, tt_lons.max() + LON_BUFFER

nearby_disruptions = disrupt_coords_df[
    (disrupt_coords_df['disrupt_lat'] >= lat_min) &
    (disrupt_coords_df['disrupt_lat'] <= lat_max) &
    (disrupt_coords_df['disrupt_lon'] >= lon_min) &
    (disrupt_coords_df['disrupt_lon'] <= lon_max)
]

print(f"\nDisruption stops within ~15km of Ensign Bus area:")
print(f"  Bounding box: lat [{lat_min:.4f}, {lat_max:.4f}], lon [{lon_min:.4f}, {lon_max:.4f}]")
print(f"  Found: {len(nearby_disruptions)} disruption-stop records")
print(f"  Unique disruptions: {nearby_disruptions['situation_number'].nunique()}")

if len(nearby_disruptions) > 0:
    print(f"\n  Nearby disruption stops:")
    for _, r in nearby_disruptions.drop_duplicates('disrupt_stop_ref').head(20).iterrows():
        print(f"    {r['disrupt_stop_ref']}: {r['disrupt_stop_name']} ({r['disrupt_lat']:.4f}, {r['disrupt_lon']:.4f})")
else:
    print("\n  No geographically nearby disruptions found.")
    print("  Expanding search to wider Greater London / Essex region...")
    
    # Wider search: Greater London + Essex
    LAT_BUFFER2 = 0.5   # ~55 km
    LON_BUFFER2 = 0.8   # ~55 km
    lat_min2, lat_max2 = tt_lats.min() - LAT_BUFFER2, tt_lats.max() + LAT_BUFFER2
    lon_min2, lon_max2 = tt_lons.min() - LON_BUFFER2, tt_lons.max() + LON_BUFFER2
    
    wider_disruptions = disrupt_coords_df[
        (disrupt_coords_df['disrupt_lat'] >= lat_min2) &
        (disrupt_coords_df['disrupt_lat'] <= lat_max2) &
        (disrupt_coords_df['disrupt_lon'] >= lon_min2) &
        (disrupt_coords_df['disrupt_lon'] <= lon_max2)
    ]
    print(f"  Wider area (±55km): {len(wider_disruptions)} disruption-stop records, {wider_disruptions['situation_number'].nunique()} unique disruptions")
    if len(wider_disruptions) > 0:
        print(f"\n  Sample wider disruption stops:")
        for _, r in wider_disruptions.drop_duplicates('disrupt_stop_ref').head(15).iterrows():
            print(f"    {r['disrupt_stop_ref']}: {r['disrupt_stop_name']} ({r['disrupt_lat']:.4f}, {r['disrupt_lon']:.4f})")

DEEP ANALYSIS: Finding a viable merge strategy

Ensign Bus operating area:
  Latitude range:  51.2326 to 51.5133
  Longitude range: -0.3302 to 0.3769
  (Thurrock / South Essex area)

--- Re-parsing SIRI-SX for stop coordinates ---
Disruption stops with coordinates: 3342

Disruption stops within ~15km of Ensign Bus area:
  Bounding box: lat [51.0826, 51.6633], lon [-0.5802, 0.6269]
  Found: 0 disruption-stop records
  Unique disruptions: 0

  No geographically nearby disruptions found.
  Expanding search to wider Greater London / Essex region...
  Wider area (±55km): 0 disruption-stop records, 0 unique disruptions


In [16]:
# ============================================================
# CELL 15: Where are the disruptions geographically?
# ============================================================

print("Where are the disruptions located?")
print(f"Total disruption stops with coords: {len(disrupt_coords_df)}")
print(f"\nLatitude range:  {disrupt_coords_df['disrupt_lat'].min():.4f} to {disrupt_coords_df['disrupt_lat'].max():.4f}")
print(f"Longitude range: {disrupt_coords_df['disrupt_lon'].min():.4f} to {disrupt_coords_df['disrupt_lon'].max():.4f}")

print(f"\nEnsign Bus area for comparison:")
print(f"  Lat: {tt_lats.min():.4f} to {tt_lats.max():.4f}")
print(f"  Lon: {tt_lons.min():.4f} to {tt_lons.max():.4f}")

# Check participant_ref (source regions) in disruptions
print(f"\nDisruption sources (ParticipantRef):")
print(disruptions_df['participant_ref'].value_counts())

# Check the affected_places
print(f"\nAffected places:")
places_all = disruptions_df['affected_places'].str.split(' \\| ').explode().dropna()
places_all = places_all[places_all != '']
print(places_all.value_counts())

# How many disruption records have no stop coordinates?
has_stops = disruptions_df['affected_stops'].str.len() > 0
print(f"\nDisruptions WITH affected stops: {has_stops.sum()}")
print(f"Disruptions WITHOUT affected stops: {(~has_stops).sum()}")

Where are the disruptions located?
Total disruption stops with coords: 3342

Latitude range:  50.1274 to 53.9049
Longitude range: -5.5273 to -0.3289

Ensign Bus area for comparison:
  Lat: 51.2326 to 51.5133
  Lon: -0.3302 to 0.3769

Disruption sources (ParticipantRef):
participant_ref
TfGM                 144
WestofEngland        110
WYCA                  65
Merseytravel          43
SYMCA                 41
Cornwall              19
NorthLincolnshire      6
Name: count, dtype: int64

Affected places:
affected_places
South Yorkshire    5
Bristol            1
Name: count, dtype: int64

Disruptions WITH affected stops: 387
Disruptions WITHOUT affected stops: 41


In [17]:
# ============================================================
# CELL 16: Build a TEMPORAL merge - disruption context per date
# ============================================================
# Strategy: The SIRI-SX data covers different regions but the SAME dates.
# We create daily disruption features (count, severity, reasons) and
# join them to timetable dates. This captures "national disruption climate"
# which is valid for transport analytics (network knock-on effects).

from collections import Counter

# --- Step 1: Build disruption date features ---
# Parse disruption validity start dates
disruptions_df['validity_start_dt'] = pd.to_datetime(
    disruptions_df['validity_start'], errors='coerce', utc=True
)
disruptions_df['validity_end_dt'] = pd.to_datetime(
    disruptions_df['validity_end'], errors='coerce', utc=True
)
disruptions_df['disruption_date'] = disruptions_df['validity_start_dt'].dt.date

# --- Step 2: Parse timetable dates ---
# Each timetable file covers a date range (start_date to end_date)
# We'll expand to individual dates each journey operates on
timetable_df['start_date_dt'] = pd.to_datetime(timetable_df['start_date'], errors='coerce')
timetable_df['end_date_dt'] = pd.to_datetime(timetable_df['end_date'], errors='coerce')

# Get the full date range from timetable
tt_min_date = timetable_df['start_date_dt'].min()
tt_max_date = timetable_df['end_date_dt'].max()
print(f"Timetable date range: {tt_min_date.date()} to {tt_max_date.date()}")

# --- Step 3: For each date in the timetable range, count active disruptions ---
date_range = pd.date_range(tt_min_date, tt_max_date, freq='D')

daily_disruptions = []
for single_date in date_range:
    # A disruption is active if validity_start <= date and (validity_end is null OR validity_end >= date)
    active = disruptions_df[
        (disruptions_df['validity_start_dt'] <= pd.Timestamp(single_date, tz='UTC')) &
        (
            (disruptions_df['validity_end_dt'].isna()) |
            (disruptions_df['validity_end_dt'] >= pd.Timestamp(single_date, tz='UTC'))
        )
    ]
    
    # Aggregate features
    severity_map = {'severe': 3, 'normal': 2, 'slight': 1, 'verySlight': 0, 'unknown': 1, 'noImpact': 0}
    severities = active['severity'].map(severity_map).fillna(1)
    
    reason_counts = active['reason'].value_counts().to_dict()
    
    daily_disruptions.append({
        'date': single_date.date(),
        'active_disruptions': len(active),
        'unique_situations': active['situation_number'].nunique(),
        'avg_severity_score': severities.mean() if len(severities) > 0 else 0,
        'max_severity_score': severities.max() if len(severities) > 0 else 0,
        'planned_count': (active['planned'] == 'true').sum(),
        'unplanned_count': (active['planned'] == 'false').sum(),
        'roadworks_count': reason_counts.get('roadworks', 0),
        'roadclosed_count': reason_counts.get('roadClosed', 0),
        'other_reason_count': sum(v for k, v in reason_counts.items() if k not in ['roadworks', 'roadClosed']),
        'regions_affected': ', '.join(active['participant_ref'].unique()),
    })

daily_disruptions_df = pd.DataFrame(daily_disruptions)
print(f"\nDaily disruption features: {daily_disruptions_df.shape}")
print(f"\nSample:")
print(daily_disruptions_df.head(10).to_string(index=False))
print(f"\n\nDisruption count stats:")
print(daily_disruptions_df['active_disruptions'].describe())

Timetable date range: 2025-01-04 to 2026-03-27

Daily disruption features: (448, 11)

Sample:
      date  active_disruptions  unique_situations  avg_severity_score  max_severity_score  planned_count  unplanned_count  roadworks_count  roadclosed_count  other_reason_count    regions_affected
2025-01-04                   2                  2            1.500000                 2.0              2                0                2                 0                   0 WestofEngland, WYCA
2025-01-05                   2                  2            1.500000                 2.0              2                0                2                 0                   0 WestofEngland, WYCA
2025-01-06                   2                  2            1.500000                 2.0              2                0                2                 0                   0 WestofEngland, WYCA
2025-01-07                   2                  2            1.500000                 2.0              2              

In [18]:
# ============================================================
# CELL 17: Merge timetable with daily disruption features
# ============================================================

# For each timetable row, we join the disruption features for every date 
# in that journey's operating period. We pick the MID-POINT date of
# each journey's operating period as the representative date.

timetable_df['mid_date'] = timetable_df['start_date_dt'] + \
    (timetable_df['end_date_dt'] - timetable_df['start_date_dt']) / 2
timetable_df['mid_date'] = timetable_df['mid_date'].dt.date

# Convert daily_disruptions_df date for merge
daily_disruptions_df['date'] = pd.to_datetime(daily_disruptions_df['date']).dt.date

# Merge on nearest date
merged_df = timetable_df.merge(
    daily_disruptions_df,
    left_on='mid_date',
    right_on='date',
    how='left'
)

# Also add a "day_of_week_num" for the departure (useful for prediction)
day_map = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
    'Friday': 4, 'Saturday': 5, 'Sunday': 6
}
# Extract first day from days_of_week for a numeric feature
def get_first_day_num(days_str):
    for day, num in day_map.items():
        if day in str(days_str):
            return num
    return -1

merged_df['day_of_week_num'] = merged_df['days_of_week'].apply(get_first_day_num)

# Parse departure_time to numeric (hours as float)
merged_df['departure_hour'] = merged_df['departure_time'].str.split(':').str[0].astype(float)
merged_df['departure_minute'] = merged_df['departure_time'].str.split(':').str[1].astype(float)
merged_df['departure_decimal'] = merged_df['departure_hour'] + merged_df['departure_minute'] / 60

# Add is_peak feature
merged_df['is_peak_hour'] = merged_df['departure_hour'].apply(
    lambda h: 1 if (7 <= h <= 9) or (16 <= h <= 18) else 0
)

print(f"MERGED DataFrame shape: {merged_df.shape}")
print(f"Columns ({len(merged_df.columns)}):")
for i, col in enumerate(merged_df.columns):
    print(f"  {i+1:2d}. {col}")

print(f"\nNull check on disruption columns:")
disrupt_cols = ['active_disruptions', 'avg_severity_score', 'planned_count', 'roadworks_count']
for col in disrupt_cols:
    nulls = merged_df[col].isna().sum()
    print(f"  {col}: {nulls} nulls ({nulls/len(merged_df)*100:.1f}%)")

merged_df.head(5)

MERGED DataFrame shape: (34922, 41)
Columns (41):
   1. file_name
   2. vehicle_journey_code
   3. journey_code
   4. departure_time
   5. service_code
   6. line_name
   7. direction
   8. origin
   9. destination
  10. start_date
  11. end_date
  12. days_of_week
  13. operator_code
  14. operator_name
  15. stop_sequence
  16. stop_ref
  17. stop_name
  18. longitude
  19. latitude
  20. scheduled_arrival
  21. wait_time_min
  22. run_time_min
  23. start_date_dt
  24. end_date_dt
  25. mid_date
  26. date
  27. active_disruptions
  28. unique_situations
  29. avg_severity_score
  30. max_severity_score
  31. planned_count
  32. unplanned_count
  33. roadworks_count
  34. roadclosed_count
  35. other_reason_count
  36. regions_affected
  37. day_of_week_num
  38. departure_hour
  39. departure_minute
  40. departure_decimal
  41. is_peak_hour

Null check on disruption columns:
  active_disruptions: 0 nulls (0.0%)
  avg_severity_score: 0 nulls (0.0%)
  planned_count: 0 nulls (0.0%)
 

Unnamed: 0,file_name,vehicle_journey_code,journey_code,departure_time,service_code,line_name,direction,origin,destination,start_date,...,unplanned_count,roadworks_count,roadclosed_count,other_reason_count,regions_affected,day_of_week_num,departure_hour,departure_minute,departure_decimal,is_peak_hour
0,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,22,43,16,23,"WestofEngland, WYCA, TfGM, SYMCA, Merseytravel...",0,17.0,43.0,17.716667,1
1,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,22,43,16,23,"WestofEngland, WYCA, TfGM, SYMCA, Merseytravel...",0,17.0,43.0,17.716667,1
2,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,22,43,16,23,"WestofEngland, WYCA, TfGM, SYMCA, Merseytravel...",0,17.0,43.0,17.716667,1
3,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,22,43,16,23,"WestofEngland, WYCA, TfGM, SYMCA, Merseytravel...",0,17.0,43.0,17.716667,1
4,ENSB_22E_ENSBPF00019613722_20260105_20260211_2...,vj_1,97,17:43:00,PF0001961:37,22,outbound,"Purfleet-on-Thames Stn/ Aveley, Usk Road",Grays,2026-01-05,...,22,43,16,23,"WestofEngland, WYCA, TfGM, SYMCA, Merseytravel...",0,17.0,43.0,17.716667,1


In [19]:
# ============================================================
# CELL 18: Save merged dataset + clean up columns for analysis
# ============================================================

# Select and order columns for the final merged CSV
final_cols = [
    # Journey identifiers
    'vehicle_journey_code', 'journey_code', 'line_name', 'direction',
    'service_code', 'operator_code', 'operator_name',
    # Route info
    'origin', 'destination', 'stop_sequence', 'stop_ref', 'stop_name',
    'longitude', 'latitude',
    # Schedule info
    'departure_time', 'scheduled_arrival', 'start_date', 'end_date',
    'days_of_week',
    # Timing features
    'wait_time_min', 'run_time_min',
    # Engineered features
    'departure_hour', 'departure_decimal', 'is_peak_hour', 'day_of_week_num',
    'mid_date',
    # Disruption context features (from SIRI-SX temporal merge)
    'active_disruptions', 'unique_situations', 'avg_severity_score',
    'max_severity_score', 'planned_count', 'unplanned_count',
    'roadworks_count', 'roadclosed_count', 'other_reason_count',
    'regions_affected',
]

final_df = merged_df[final_cols].copy()

# Save to data/processed
output_dir = r'F:\SOFTWARICA\big-data-transport-analytics\data\processed'
os.makedirs(output_dir, exist_ok=True)

# Main merged dataset
final_df.to_csv(os.path.join(output_dir, 'ensign_timetable_with_disruptions.csv'), index=False)
print(f"Saved: ensign_timetable_with_disruptions.csv ({final_df.shape[0]} rows x {final_df.shape[1]} cols)")

# Also save the daily disruption features separately
daily_disruptions_df.to_csv(os.path.join(output_dir, 'daily_disruption_features.csv'), index=False)
print(f"Saved: daily_disruption_features.csv ({daily_disruptions_df.shape[0]} rows x {daily_disruptions_df.shape[1]} cols)")

# Keep the separate datasets too
timetable_df.drop(columns=['start_date_dt', 'end_date_dt', 'mid_date'], errors='ignore').to_csv(
    os.path.join(output_dir, 'ensign_bus_timetable.csv'), index=False)
stops_df = pd.DataFrame(all_stops.values())
stops_df.to_csv(os.path.join(output_dir, 'ensign_bus_stops.csv'), index=False)
disruptions_df.to_csv(os.path.join(output_dir, 'sirisx_disruptions.csv'), index=False)
journeys_df = pd.DataFrame(all_journeys)
journeys_df.to_csv(os.path.join(output_dir, 'ensign_bus_journeys.csv'), index=False)

print(f"\nAll files in {output_dir}:")
for f in sorted(os.listdir(output_dir)):
    if f.endswith('.csv'):
        size = os.path.getsize(os.path.join(output_dir, f))
        print(f"  {f}: {size/1024:.1f} KB")

print(f"\n{'='*60}")
print("MERGE SUMMARY")
print("="*60)
print(f"  Direct merge keys (operator, stop refs): NO OVERLAP")
print(f"  Geographic overlap: NO (disruptions are in NW/SW England)")
print(f"  TEMPORAL merge: YES - joined daily disruption counts to timetable dates")
print(f"  Result: {final_df.shape[0]:,} rows x {final_df.shape[1]} columns")
print(f"  Disruption features added: 10 columns (counts, severity, reasons)")
print(f"  This is valid for 'national disruption climate' affecting network performance")

Saved: ensign_timetable_with_disruptions.csv (34922 rows x 36 cols)
Saved: daily_disruption_features.csv (448 rows x 11 cols)

All files in F:\SOFTWARICA\big-data-transport-analytics\data\processed:
  daily_disruption_features.csv: 38.9 KB
  ensign_bus_journeys.csv: 839.5 KB
  ensign_bus_stops.csv: 22.7 KB
  ensign_bus_timetable.csv: 8648.1 KB
  ensign_timetable_with_disruptions.csv: 11804.2 KB
  sirisx_disruptions.csv: 376.5 KB

MERGE SUMMARY
  Direct merge keys (operator, stop refs): NO OVERLAP
  Geographic overlap: NO (disruptions are in NW/SW England)
  TEMPORAL merge: YES - joined daily disruption counts to timetable dates
  Result: 34,922 rows x 36 columns
  Disruption features added: 10 columns (counts, severity, reasons)
  This is valid for 'national disruption climate' affecting network performance
