In [1]:
# Get station data
import pandas as pd

# Load the semicolon-separated station directory and, in the same expression,
# discard index labels 1038..1057 (per the original note these rows are NaNs).
# NOTE(review): the label range is hard-coded for this particular export —
# confirm it still matches if the CSV is ever refreshed.
stationen = pd.read_csv(
    "./Data/Verzeichnis der Verkehrsstationen.csv",
    sep=";",
).drop(range(1038, 1058))
# Substitutions that turn names from the station directory into the spelling
# used when building the request URL. Replacements run in insertion order, so
# the more specific patterns (e.g. " a.d.L.") must stay ahead of the generic
# ones (" a.d.", " ", "-").
replace_map = {
    # abbreviations
    " a.d.L.": "/Leitha",
    " a.d.": "/",
    " a d": "/",
    "N.Ö.": "NÖ",
    #"St.": "St. ",
    "Westbf": "Westbahnhof",
    "Ostbf": "Ostbahnhof",
    # individual stations that are named differently
    "Obertrattnach-M Hofk": "Obertrattnach-Markt Hofkirchen",
    "Wien Hauptbahnhof, Wien Hauptbahnhof Südtiroler Platz": "Wien Hbf",
    "Wien Mitte, Wien Mitte - CAT": "Wien Mitte",
    "Wörthersee": "Wörther_See",
    # separators — applied last so the multi-word patterns above still match
    " ": "_",
    "-": "_"
}


def replace_text(text):
    """Return ``text`` after applying every ``replace_map`` substitution.

    The substitutions are applied one after another in the dictionary's
    insertion order, each one operating on the result of the previous one.
    """
    result = text
    for pattern in replace_map:
        result = result.replace(pattern, replace_map[pattern])
    return result
    
# Normalise every station name in place, then keep the names as a plain,
# alphabetically sorted Python list.
stationen["Verkehrsstation"] = stationen["Verkehrsstation"].map(replace_text)
stations = stationen["Verkehrsstation"].tolist()
stations.sort()

In [3]:
# Progress File

# If the progress file was not saved correctly and the scraper has to be restarted, uncomment the lines below
# and enter the last_station and last_date taken from your 2years_train_schedule file.
# This usually happens when the browser runs out of memory.

import json

#progress = {
#    "last_station": None,
#    "last_date": None
#}

#with open("./progress_checkpoint.json", "w") as f:
#    json.dump(progress, f)

In [4]:
# Get time range
import datetime as dt

# Scrape window; both end points are included.
start_date = dt.datetime(2023, 1, 1)
end_date = dt.datetime(2025, 1, 1)

# One "YYYYMMDD" string per calendar day from start_date through end_date.
n_days = (end_date - start_date).days + 1
date_range = [
    (start_date + dt.timedelta(days=offset)).strftime("%Y%m%d")
    for offset in range(n_days)
]

In [5]:
# Where the Magic begins
import requests, bs4, csv, time, json, logging

# Configure logging to capture errors
import os

# Send ERROR-level (and above) log records to a file so failed requests can be
# reviewed after a long run.
logging.basicConfig(filename = "errors_p5.log", level = logging.ERROR)

# Browser-like request headers (needed to also capture historic data).
# NOTE(review): "br" and "zstd" are advertised below; that only works if the
# matching decoders are installed next to requests/urllib3 — confirm, otherwise
# a compressed page may come back undecoded and look like "no table found".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,de-DE;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    # An actual cookie with access to the Pro version is required. It is read
    # from the ZUGFINDER_COOKIE environment variable so the secret never has to
    # be pasted into the notebook; when the variable is unset or empty the
    # value is None, exactly as in the published version of this notebook.
    "Cookie": os.environ.get("ZUGFINDER_COOKIE") or None
}

# Open the final csv document for appending 
# --- Scraper settings --------------------------------------------------------
PROGRESS_PATH = "./progress_checkpoint.json"
OUTPUT_PATH = "./Data/Train/Part_5_2years_train_schedule.csv"
MAX_RETRIES = 3         # attempts per station/day before that day is skipped
INITIAL_BACKOFF = 0.5   # seconds slept before the first attempt; doubled after every failure
REQUEST_TIMEOUT = 30    # seconds; without a timeout a stalled connection would block the run forever


def _load_checkpoint():
    """Return ``(last_station, last_date)`` as stored in the progress file.

    Raises:
        FileNotFoundError: if the progress file does not exist yet.
        KeyError: if one of the two keys is missing from the file.
    """
    with open(PROGRESS_PATH, "r") as f:
        progress = json.load(f)
    return progress["last_station"], progress["last_date"]


def _save_checkpoint(station, date_index):
    """Store ``station`` and the date that follows ``date_range[date_index]``.

    The stored date is the next one to process. After the final date the index
    wraps around to the first entry of ``date_range``.
    NOTE(review): because of that wrap-around, a restart right after a station
    has been completed re-scrapes that station from its first date.
    """
    next_date_index = (date_index + 1) % len(date_range)
    progress = {"last_station": station, "last_date": date_range[next_date_index]}
    with open(PROGRESS_PATH, "w") as f:
        json.dump(progress, f)


def _scrape_station_day(writer, file, station, date_index, current_date):
    """Fetch the arrival board of one station for one day and append it to the CSV.

    Args:
        writer: ``csv.writer`` bound to ``file``.
        file: the open output file; flushed before the checkpoint is advanced.
        station: station name as used in the URL.
        date_index: position of ``current_date`` inside ``date_range``.
        current_date: day to fetch, formatted ``YYYYMMDD``.

    Any exception raised while fetching, parsing or writing is logged and the
    attempt is repeated (up to ``MAX_RETRIES`` times) with a doubled delay.
    A 4xx response is logged and given up on immediately, without touching the
    checkpoint. When every attempt fails the day is skipped and the checkpoint
    is advanced past it.
    """
    url = f"https://www.zugfinder.net/de/bahnhofstafel-{station}-{current_date}-arr"
    backoff = INITIAL_BACKOFF

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            time.sleep(backoff)
            response = requests.get(url, headers = headers, timeout = REQUEST_TIMEOUT)

            # Client errors will not go away by retrying.
            if 400 <= response.status_code < 500:
                print(f"Client error ({response.status_code}): {url}")
                logging.error(f"Client error ({response.status_code}): {url}")
                return

            # Turn the remaining error statuses (5xx) into an exception so they
            # are retried instead of being mistaken for a day without a table.
            response.raise_for_status()

            # Find data in html
            soup = bs4.BeautifulSoup(response.content, "html.parser")
            table = soup.find("table", {"id": "zugdaten"})

            # A missing table means there is nothing to store for this day.
            if table is not None:
                # Build every row first so a parsing error cannot leave a
                # half-written day in the CSV. The first <tr> is skipped
                # (it is not a data row).
                data_rows = [
                    [station, current_date] + [cell.text.strip() for cell in row.find_all("td")]
                    for row in table.find_all("tr")[1:]
                ]
                writer.writerows(data_rows)
                # Make sure the rows are handed to the OS before the checkpoint
                # claims this day is done.
                file.flush()

            _save_checkpoint(station, date_index)
            return

        except Exception as e:
            backoff *= 2
            print(f"Error processing URL: {url} (Attempt {attempt}/{MAX_RETRIES})\n{e}")
            logging.error(f"Error processing URL: {url} (Attempt {attempt}/{MAX_RETRIES})\n{e}")

    # All attempts failed: report it and move the checkpoint past this day.
    print(f"Failed to process city: {station}, date: {current_date} after {MAX_RETRIES} attempts. Skipping...")
    _save_checkpoint(station, date_index)


# Read the resume point once; everything written afterwards comes from this run.
resume_station, resume_date = _load_checkpoint()

# Open the final csv document for appending
with open(OUTPUT_PATH, mode = "a", newline = "", encoding = "utf-8") as file:
    writer = csv.writer(file)

    # Write header if it doesn't already exist
    if file.tell() == 0:
        writer.writerow(["Station", "Date", "Arrival", "Train Nr.", "Departure from", "Arrival train station"])

    # Iterate through all the stations.
    # NOTE(review): stations_with_an_der is not defined in the cells visible
    # here — confirm it is created before this cell runs.
    for station in stations_with_an_der: # reversed(stations) => if traversal from the bottom is wanted

        # Stations ordered before the checkpointed one are treated as done
        # (flip the comparison to ">" when traversing in reverse).
        if resume_station and station < resume_station:
            continue

        # Iterate through date range for each station
        for date_index, current_date in enumerate(date_range):

            # Only the checkpointed station resumes mid-range; every later
            # station is processed for the full date range.
            if resume_station and station == resume_station and resume_date and current_date < resume_date:
                continue

            _scrape_station_day(writer, file, station, date_index, current_date)

# Data collection complete
print("Data collection complete!")

Data collection complete!
