# Requesting and Collecting Data from the DB API

In [1]:
# Optional setup: the install commands below are wrapped in a string literal, so they do not run.
# To install the required packages, remove the surrounding triple quotes and execute this cell.
"""
#!source .jupyter_venv/.venv/bin/activate #if you have an venv for your jupyter kernal, then activate it
import sys
!{sys.executable} -m pip install pyhafas
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install pyarrow #needed in order to read and write feather files
"""

'\n#!source .jupyter_venv/.venv/bin/activate #if you have an venv for your jupyter kernal, then activate it\nimport sys\n!{sys.executable} -m pip install pyhafas\n!{sys.executable} -m pip install pandas\n!{sys.executable} -m pip install pyarrow #needed in order to read and write feather files\n'

# simplified program
This is a simplified version of the program used to request the data.
The program that was actually used has additional features and can be found in the project folder.

The API does not provide delay information for departures that lie too far in the past. The data therefore has to be requested while it is still current, and the previously collected data must be updated several times throughout the day.

In [2]:
from pyhafas import HafasClient
from pyhafas.profile import DBProfile
import pandas as pd 
from datetime import datetime
from time import sleep

In [3]:
def request_data(start_time, products, station='8098105'):
    """Request one day of departures and return them as a pandas DataFrame.

    The request is sent through the module-level ``client`` object, which
    must exist before this function is called.

    Parameters
    ----------
    start_time : datetime
        Start of the requested time window (passed to the client as ``date``).
    products : dict
        Mapping of product name to bool, passed to the client unchanged.
    station : str, optional
        Id of the station whose departure board is requested. Defaults to
        the station id this notebook was written for.

    Returns
    -------
    pandas.DataFrame
        One row per returned departure, built from the attributes of each
        departure object and indexed by ``id``. If the client returns no
        departures, an empty frame with the same index name is returned.
    """
    request_specifications = {
        'station': station,
        'date': start_time,
        'duration': 1440,      # 1440 minutes = 1 day
        'max_trips': 100_000,  # an int: the float literal 1e5 does not match the documented parameter type
        'products': products,
    }
    string_columns = {
        'id': 'string',
        'name': 'string',
        'station': 'string',
        'direction': 'string',
        'platform': 'string',
    }

    records = [
        stationBoardLeg.__dict__
        for stationBoardLeg in client.departures(**request_specifications)
    ]
    if records:
        departures = pd.DataFrame(records)
    else:
        # Without any rows the frame would have no columns and astype() would
        # raise a KeyError, so create the expected columns explicitly.
        departures = pd.DataFrame(columns=list(string_columns))

    # 'id' has to be the index: DataFrame.update() aligns on it to decide
    # which row of the old data frame is overwritten by which new row.
    return departures.astype(string_columns).set_index('id')


In [4]:
# Which product classes are requested: True = included, False = excluded.
products = {
    # rail services
    'long_distance_express': True,  # ICE/ECE
    'long_distance': True,          # IC/EC
    'regional_express': True,       # RE/IRE
    'regional': True,               # RB
    'suburban': True,               # S
    # everything else
    'bus': False,
    'ferry': False,
    'subway': False,                # U
    'tram': False,                  # STR/T
    'taxi': False,
}

# How many update requests the collection loop sends.
number_of_requests = 20

# Pause between two consecutive requests, in seconds (10 minutes).
wait_time_in_s = 10 * 60

# Start of the one-day window that is requested first.
start_time = datetime.fromisoformat('2024-10-23 00:00')

In [5]:
client = HafasClient(DBProfile())

# Baseline: one full day starting at start_time. After each later request,
# updated_data is refreshed with the newly received values.
updated_data = request_data(start_time, products)

try:
    for request_no in range(number_of_requests):
        # request the next 24 hours, starting at the current time
        request_time = datetime.now()
        new_data = request_data(request_time, products)

        # save the raw request as a backup
        new_data.to_feather(f"request_{request_no}.feather", compression_level=6)

        # DataFrame.update() works in place and aligns on the index ('id'):
        # only rows already present in updated_data are overwritten, and only
        # with non-NA values; ids that are new in new_data are not added.
        updated_data.update(new_data)

        # wait before the next request; no need to wait after the last one
        if request_no < number_of_requests - 1:
            sleep(wait_time_in_s)
finally:
    # Save the final results. Doing this in `finally` keeps everything that
    # was collected so far even if a request fails or the run is interrupted.
    updated_data.to_feather("result.feather", compression_level=6)
    updated_data.to_csv("result.csv")


Every request result is saved in Feather format:

- The read and write speeds of Feather files are faster than those of CSV files.
- Feather files take up less disk space and can be additionally compressed during creation.
- Feather files are binary files and can be easily read and written with pandas (but are not human-readable).

The outcome of each request is saved (backup)

The final result is a dataframe containing the most recent information for each train/trip, saved in both Feather and CSV format.


## additional features of the current (extended) program

The features of the current program (code can be found in the project folder) include:
- It creates a new folder for each date the program runs. This folder contains both a Feather and CSV file with the day's results (once the day is completed), as well as another folder for backups (supplemented after each request).
- The time taken for each data request is measured and subtracted from the waiting interval.
- The request backups (saved in each loop) include all products. For the resulting updated file, these are filtered for trains (criterion: has a numeric platform number).