In [8]:
import datetime
import os
import threading
import time
import warnings
import xml.etree.ElementTree as ET

import pandas as pd
import requests
import schedule


In [12]:
import pytz

# Timezone object reused by the collector for every timestamp it takes.
berlin_tz = pytz.timezone('Europe/Berlin')

# Sanity check: show the current wall-clock time in that zone.
berlin_time = datetime.datetime.now(tz=berlin_tz)
print("Current time in Berlin:", berlin_time)

Current time in Berlin: 2024-07-03 11:07:47.082382+02:00


In [None]:
# Load the station table; later cells read its eva_nr, name, state, city,
# zipcode, long, lat and category columns.
stations_df = pd.read_csv(filepath_or_buffer="data/fromAPI/StaDa.csv")
stations_df.head()



In [None]:
# Remove stations of category 6 and 7 in place - comment this cell out to keep all stations
stations_df.drop(
    index=stations_df.index[stations_df["category"].isin([6, 7])],
    inplace=True,
)

In [None]:
# Pull the per-station columns out of the frame as parallel arrays
# (position i in every array refers to the same station).
eva_nrs, names, states, cities, zipcodes, longs, lats, cats = (
    stations_df[column].values
    for column in (
        "eva_nr",
        "name",
        "state",
        "city",
        "zipcode",
        "long",
        "lat",
        "category",
    )
)




In [None]:
# API client ID and secret are read from the environment so that they never
# live in the notebook. Set DB_CLIENT_ID and DB_API_KEY before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell should be
# treated as leaked and rotated.
client_id = os.environ.get("DB_CLIENT_ID", "")
client_secret = os.environ.get("DB_API_KEY", "")
if not (client_id and client_secret):
    warnings.warn(
        "DB_CLIENT_ID and/or DB_API_KEY are not set; "
        "requests will be sent without valid API credentials."
    )

# Header for request
headers = {
    "DB-Api-Key": client_secret,
    "DB-Client-Id": client_id,
    "accept": "application/xml",
}

In [None]:
def _stop_attr(element, name):
    """Return attribute ``name`` of ``element``, or None when the element is missing."""
    return element.get(name) if element is not None else None


def _parse_plan_stops(plan_root):
    """Return one ``[id, ar.pt, dp.pt, dp.l, ar.ppth]`` row per ``<s>`` element."""
    rows = []
    for stop in plan_root.findall('.//s'):
        arrival = stop.find('ar')
        departure = stop.find('dp')
        rows.append([
            stop.get('id'),
            _stop_attr(arrival, 'pt'),
            _stop_attr(departure, 'pt'),
            _stop_attr(departure, 'l'),
            _stop_attr(arrival, 'ppth'),
        ])
    return rows


def _parse_change_stops(change_root):
    """Return one ``[id, ar.ct, dp.ct, m.cat]`` row per ``<s>`` element."""
    rows = []
    for stop in change_root.findall('.//s'):
        rows.append([
            stop.get('id'),
            # `ct` feeds the *_change columns; presumably the changed (current)
            # time in the DB Timetables schema - not the planned one.
            _stop_attr(stop.find('ar'), 'ct'),
            _stop_attr(stop.find('dp'), 'ct'),
            # Only the first <m> child of the stop is looked at.
            _stop_attr(stop.find('m'), 'cat'),
        ])
    return rows


def _build_delay_frame(plan_data, change_data):
    """Merge plan and change rows on ID and add the delay columns (whole minutes)."""
    plan_columns = ['ID', 'arrival', "departure", "train", "path", 'eva_nr',
                    "category", "name", "state", "city", "zip", "long", "lat"]
    plan_df = pd.DataFrame(plan_data, columns=plan_columns)
    change_df = pd.DataFrame(change_data, columns=['ID', 'arrival', "departure", "info"])
    for frame in (plan_df, change_df):
        for column in ('arrival', 'departure'):
            frame[column] = pd.to_datetime(frame[column], format='%y%m%d%H%M')

    delay_df = pd.merge(plan_df, change_df, how='left', on="ID",
                        suffixes=('_plan', '_change'))

    # Delay columns are filled in before the column reorder below, so they are
    # never assigned on a column-subset frame (SettingWithCopyWarning).
    # NOTE: 'depature_delay_m' (sic) is kept because it is the column name in the written CSV.
    for kind, delay_column in (('departure', 'depature_delay_m'),
                               ('arrival', 'arrival_delay_m')):
        delta = delay_df[f'{kind}_change'] - delay_df[f'{kind}_plan']
        # Stops without a changed time get a delay of 0.
        delay_df[delay_column] = (delta.dt.total_seconds() / 60).fillna(value=0).astype(int)
    delay_df["eva_nr"] = delay_df["eva_nr"].astype(int)

    # sorting columns
    return delay_df[['ID', 'train', 'path', 'eva_nr', "category", 'name', 'state',
                     'city', 'zip', 'long', 'lat', 'arrival_plan', 'departure_plan',
                     'arrival_change', 'departure_change', 'arrival_delay_m',
                     'depature_delay_m', "info"]]


def job(request_timeout=30):
    """Fetch plan and change data for every station and write one hourly CSV plus a log.

    For each entry of the module-level ``eva_nrs`` array the ``plan`` endpoint
    (current Berlin date and hour) and the ``fchg`` endpoint are requested with
    the module-level ``headers``. Plan and change rows are merged on the stop
    ID and written to ``data/fromAPI/hourly2/{date}_{hour}.csv``. Stations that
    had to be skipped are listed in ``data/fromAPI/hourly2/{date}_{hour}_log.txt``.

    A station whose plan cannot be fetched or parsed is skipped entirely; a
    station whose change data cannot be parsed keeps its plan rows.

    Args:
        request_timeout: Seconds passed as ``timeout`` to every HTTP request,
            so one stalled connection cannot block the collector indefinitely.
    """
    # A single timestamp feeds both values, so date and hour always belong together.
    now = datetime.datetime.now(berlin_tz)
    date = now.strftime("%y%m%d")
    grabtime = now.strftime("%H")  # zero-padded hour

    plan_data = []
    change_data = []
    buglog = []
    print(f"start data collection at{date} {grabtime}")

    def skip(message):
        # Record the problem, show it, and pause briefly before the next station.
        buglog.append(message)
        print(message)
        time.sleep(0.2)

    base_url = "https://apis.deutschebahn.com/db-api-marketplace/apis/timetables/v1"
    # One session for the whole run so connections are reused between requests.
    with requests.Session() as session:
        for i, eva_nr in enumerate(eva_nrs):
            url_change = f"{base_url}/fchg/{eva_nr}"
            url_plan = f"{base_url}/plan/{eva_nr}/{date}/{grabtime}"

            try:
                response_plan = session.get(url_plan, headers=headers,
                                            timeout=request_timeout)
                response_change = session.get(url_change, headers=headers,
                                              timeout=request_timeout)
            except requests.RequestException:
                skip(f"{names[i]} connection skipped")
                continue

            if not response_plan.ok:
                skip(f"{names[i]} plan skipped (HTTP {response_plan.status_code})")
                continue
            try:
                plan_root = ET.fromstring(response_plan.content)
            except ET.ParseError:
                skip(f"{names[i]} plan skipped")
                continue

            # Station columns appended to every plan row of this station.
            station_meta = [eva_nr, cats[i], names[i], states[i], cities[i],
                            zipcodes[i], longs[i], lats[i]]
            plan_data.extend(row + station_meta for row in _parse_plan_stops(plan_root))

            if not response_change.ok:
                skip(f"{names[i]} change skipped (HTTP {response_change.status_code})")
                continue
            try:
                change_root = ET.fromstring(response_change.content)
            except ET.ParseError:
                skip(f"{names[i]} change skipped")
                continue
            change_data.extend(_parse_change_stops(change_root))

            # Short pause between stations.
            time.sleep(0.1)

    delay_df = _build_delay_frame(plan_data, change_data)

    out_dir = "data/fromAPI/hourly2"
    os.makedirs(out_dir, exist_ok=True)  # first run: the folder may not exist yet
    delay_df.to_csv(f"{out_dir}/{date}_{grabtime}.csv")

    file_name = f"{out_dir}/{date}_{grabtime}_log.txt"
    # Explicit encoding so the log does not depend on the platform default.
    with open(file_name, "w", encoding="utf-8") as file:
        file.writelines(f"{item}\n" for item in buglog)

# Register the hourly collection (minute 10 of every hour). The job is tagged
# and any earlier registration with the same tag is removed first, so
# re-running this cell does not stack duplicate jobs.
schedule.clear("hourly_job")
schedule.every().hour.at(":10").do(job).tag("hourly_job")
stop_run = False


def run_scheduler():
    """Poll the scheduler every 10 seconds until ``stop_run`` is set to True."""
    while not stop_run:
        try:
            schedule.run_pending()
        except Exception as exc:
            # One failed run must not end the loop; later runs still happen.
            print(f"scheduled job failed: {exc!r}")
        time.sleep(10)  # poll every 10 seconds to prevent high CPU usage


# Run the scheduler in a non-blocking way
scheduler_thread = threading.Thread(target=run_scheduler)
scheduler_thread.start()


def stop_scheduler():
    """Ask the scheduler loop to exit and wait for its thread to finish."""
    global stop_run
    stop_run = True
    scheduler_thread.join()


# Keep collecting for 72 hours, then stop. The finally clause also stops the
# background thread when the cell is interrupted early.
try:
    time.sleep(3600*72)
finally:
    stop_scheduler()
       

start data collection at240703 10
