# Fetch Historical Data
Fetches historical static and realtime data from Trafiklab's KoDa (Kollektivtrafikens Datalabb) API.

## Imports

In [None]:
import os
from pathlib import Path
import sys

# Start from the directory the notebook kernel is currently running in.
root_dir = Path.cwd()
# If that directory is a trailing 'pipeline' folder, step out of it first;
# then, if what remains ends in 'src', step out of that as well.
for nested in ('pipeline', 'src'):
    if root_dir.name == nested:
        root_dir = root_dir.parent
root_dir = str(root_dir)

# Make the resolved root the working directory so that relative paths used
# afterwards are resolved against it.
os.chdir(root_dir)
print(f"Root dir: {Path.cwd()}")

from datetime import date, timedelta
from src.data_utils.filter import *
from src.data_utils.ingest import *

## Fetch Data

### Decide How Many Days in the Past to Fetch

In [None]:
# Choose the window to fetch: `number_of_days` consecutive days counted
# backwards from `start_date`, with `start_date` itself included.
# The start date has to be yesterday or earlier.
number_of_days = 7
start_date = date(2025, 12, 23)

# Build the list newest-first, formatted as YYYY-MM-DD strings.
dates = [
    (start_date - timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range(number_of_days)
]

print(dates)

### Fetch Static Data from Trafiklab's KoDa API

In [None]:
# Download, extract, convert and filter the static data for every entry in
# `dates`, skipping dates whose output directory already exists.
import time  # used for the pause between retries; not imported explicitly elsewhere in the visible cells

DATA_ROOT = Path('data')
max_retries = 10  # download/extract/convert attempts per date
sleep_time = 5    # seconds to wait between two attempts

for d in dates:
    date_dir = DATA_ROOT / "static" / d
    if date_dir.exists():
        print(f"{d} exists, skipping")
        continue

    for attempt in range(max_retries):
        try:
            zip_file = fetch_static(d, str(DATA_ROOT))
            zip_dir = extract_zip(zip_file)
            output_dir = txt_to_csv(zip_dir)
            break
        except Exception as e:
            # Broad catch is deliberate here: any failure in the
            # fetch/extract/convert chain triggers another attempt.
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_retries - 1:
                print("Reached max retries, skipping date")
            else:
                # Only wait when another attempt will actually follow.
                print(f"Retrying in {sleep_time}s...")
                time.sleep(sleep_time)
    else:
        # Reached only when every attempt failed (the inner loop never hit
        # `break`).
        # NOTE(review): this `break` stops processing of ALL remaining dates,
        # not just this one - confirm that aborting is intended, since the
        # message above says "skipping date".
        print(f'Download failed for date {d}')
        break

    # Pass the date string of the current iteration (`d`), not the
    # `datetime.date` class imported at the top of the notebook.
    filter_irrelevant_files(output_dir, d)
    filter_static_data_for_date(output_dir, d)
print("Finished")

### Fetch Realtime VehiclePositions Data from Trafiklab's KoDa API

In [None]:
# For every entry in `dates` that has no realtime directory yet, fetch the
# VehiclePositions feed for each hour 0-23, extract it, flatten the extracted
# structure and filter the result.
feed_name = 'VehiclePositions'
for d in dates:
    date_dir = Path("data") / "realtime" / d
    if not date_dir.exists():
        for h in range(24):
            print(f"Fetching for {d} at hour {h}")
            realtime_file = fetch_realtime(d, "data", feed=feed_name, hour=h)
            extracted = extract_7z(realtime_file, feed=feed_name, hour=h)
            raw_dir = flatten_extracted_structure(extracted)
            filter_realtime_data_VP(raw_dir, d)
    else:
        # An existing directory is treated as "already fetched".
        print(f"{d} exists, skipping")

print("Finished fetching realtime VehiclePositions data")