# Fetch Historical Data
Fetches historical static and realtime (VehiclePositions and TripUpdates) data from Trafiklab's KoDa (Kollektivtrafikens Datalabb) API.

## Imports

In [1]:
import os
import sys
import time
from pathlib import Path

# Resolve the project root: start from the current working directory and step
# out of a trailing 'pipeline' directory, then out of a trailing 'src'
# directory, in case the notebook was started inside one of them.
root_dir = Path().absolute()
for subdirectory in ('pipeline', 'src'):
    if root_dir.name == subdirectory:
        root_dir = root_dir.parent
root_dir = str(root_dir)

# Make the project root the working directory so that the relative paths
# used further down are resolved against it.
os.chdir(root_dir)
print(f"Root dir: {Path.cwd()}")

from datetime import date, timedelta
from src.data_utils.filter import *
from src.data_utils.ingest import *

Root dir: C:\Users\royli\Desktop\Courses\ID2223_Scalable_Machine_Learning_and_Deep_Learning\Project


## Fetch Data

### Decide How Many Days in the Past to Fetch

In [2]:
# Choose the most recent date to fetch and how many consecutive days to cover,
# counting backwards and including the start date itself.
# The start date must be yesterday or earlier.
number_of_days = 2
start_date = date(2025, 12, 10)

# Newest first, formatted as YYYY-MM-DD strings.
dates = [
    (start_date - timedelta(days=offset)).isoformat()
    for offset in range(number_of_days)
]

print(dates)

['2025-12-10', '2025-12-09']


### Fetch Static Data from Trafiklab's KoDa API

In [3]:
# Download, extract, convert and filter the static data for every requested
# date. Dates whose output directory already exists are skipped; a failing
# download is retried a fixed number of times before that date is given up on.
DATA_ROOT = Path('data')
max_retries = 10   # attempts per date before giving up on it
sleep_time = 5     # pause between two attempts, in seconds

for d in dates:
    date_dir = DATA_ROOT / "static" / d
    if date_dir.exists():
        print(f"{d} exists, skipping")
        continue

    for attempt in range(1, max_retries + 1):
        try:
            zip_file = fetch_static(d, str(DATA_ROOT))
            zip_dir = extract_zip(zip_file)
            output_dir = txt_to_csv(zip_dir)
            break
        except Exception as e:
            print(f"Attempt {attempt}/{max_retries} failed: {e}")
            if attempt == max_retries:
                print("Reached max retries, skipping date")
            else:
                # Only wait when another attempt will actually follow.
                print(f"Retrying in {sleep_time}s...")
                time.sleep(sleep_time)
    else:
        # Every attempt failed: skip only this date and carry on with the
        # remaining ones instead of aborting the whole run.
        print(f'Download failed for date {d}')
        continue

    # Pass the date string of this iteration (not the `datetime.date` class).
    filter_irrelevant_files(output_dir, d)
    filter_static_data_for_date(output_dir, d)
print("Finished")

KeyboardInterrupt: 

### Fetch Realtime VehiclePositions Data from Trafiklab's KoDa API

In [None]:
# Fetch and preprocess the realtime VehiclePositions feed for each date that
# does not have an output directory yet.
for d in dates:
    feed_dir = Path(f"data/realtime/{d}/VehiclePositions")
    if not feed_dir.exists():
        print(f"Fetching realtime VehiclePositions data for {d}")
        archive_path = fetch_realtime(d, "data", feed='VehiclePositions')
        if archive_path is None:
            # Nothing to process when fetch_realtime returns None.
            continue

        extracted_dir = extract_7z(archive_path, feed='VehiclePositions')
        flattened_dir = flatten_extracted_structure(extracted_dir)
        preprocess_and_aggregate_VP(flattened_dir, d)
    else:
        print(f"{d} exists, skipping")

print("Finished fetching realtime VehiclePositions data")

Fetching realtime VehiclePositions data for 2025-12-10
Saved GTFS realtime file to data\2025-12-10.7z.
Successfully extracted data\2025-12-10.7z to data\realtime\2025-12-10\VehiclePositions\raw
Removed data\2025-12-10.7z
Successfully flattened data\realtime\2025-12-10\VehiclePositions\raw
Fetching realtime VehiclePositions data for 2025-12-09
[1/65] Still processing, retrying connection in 60 seconds
[2/65] Still processing, retrying connection in 60 seconds
[3/65] Still processing, retrying connection in 60 seconds
[4/65] Still processing, retrying connection in 60 seconds
[5/65] Still processing, retrying connection in 60 seconds
[6/65] Still processing, retrying connection in 60 seconds
[7/65] Still processing, retrying connection in 60 seconds
[8/65] Still processing, retrying connection in 60 seconds
[9/65] Still processing, retrying connection in 60 seconds
[10/65] Still processing, retrying connection in 60 seconds
[11/65] Still processing, retrying connection in 60 seconds
[12/

In [None]:
# Fetch Realtime TripUpdates Data from Trafiklab's KoDa API.
# Fetch and preprocess the realtime TripUpdates feed for each date that does
# not have an output directory yet.
for d in dates:
    feed_dir = Path(f"data/realtime/{d}/TripUpdates")
    if not feed_dir.exists():
        print(f"Fetching realtime TripUpdates data for {d}")
        archive_path = fetch_realtime(d, "data", feed='TripUpdates')
        if archive_path is None:
            # Nothing to process when fetch_realtime returns None.
            continue

        extracted_dir = extract_7z(archive_path, feed='TripUpdates')
        flattened_dir = flatten_extracted_structure(extracted_dir)
        preprocess_and_aggregate_TU(flattened_dir, d)
    else:
        print(f"{d} exists, skipping")

print("Finished fetching realtime TripUpdates data")