In [None]:
# Installiere falls notwendig:
# !pip install selenium beautifulsoup4 pandas

import re
import time
import json
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

# Path to the ChromeDriver executable; adjust it to your local setup.
CHROMEDRIVER_PATH = "../drivers/chromedriver.exe"  # example: a "drivers" folder in the project directory


In [None]:
def get_all_matches_for_day(season_id, spieltag):
    """
    Load one matchday page with headless Chrome and collect the rows of finished matches.

    Args:
        season_id: Value sent as the ``saison_id`` query parameter.
        spieltag: Matchday number sent as the ``spieltag`` query parameter.

    Returns:
        A list of dicts with the keys ``season_id``, ``spieltag`` and ``cells``.
        ``cells`` holds the stripped text of every ``<td>`` of a table row that
        contains a ``<span class="matchresult finished">``. The list is empty
        when the page has no such rows.

    Raises:
        Whatever Selenium raises when the driver cannot be started or the page
        cannot be loaded; the browser is shut down before the error propagates.
    """
    base_url = "https://www.transfermarkt.ch/super-league/spieltag/wettbewerb/C1/plus/"
    url = f"{base_url}?saison_id={season_id}&spieltag={spieltag}"

    # Configure the ChromeDriver (headless, fixed window size, desktop user agent).
    service = Service(CHROMEDRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(5)  # fixed wait to give the page time to finish rendering
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading fails; otherwise
        # every failed matchday would leave a headless Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    matches = []
    # Only <div class="box"> elements that contain a finished result are relevant.
    for box in soup.find_all("div", class_="box"):
        if not box.find("span", class_="matchresult finished"):
            continue
        table = box.find("table")  # only the first table inside the box is inspected
        if not table:
            continue
        for row in table.find_all("tr"):
            if not row.find("span", class_="matchresult finished"):
                continue
            matches.append({
                "season_id": season_id,
                "spieltag": spieltag,
                "cells": [cell.get_text(strip=True) for cell in row.find_all("td")],
            })
    return matches


In [None]:
# Matches a parenthesised number such as "(3)"; group 1 is the number itself.
_RANK_PATTERN = re.compile(r'\((\d+)\)')


def _cell_text(cells, index):
    """Return ``cells[index]``, or an empty string when the row is too short."""
    return cells[index] if len(cells) > index else ""


def _split_rank_and_name(text):
    """Split a text like "(3)Some Team" into ``(3, "Some Team")``; rank is None if absent."""
    found = _RANK_PATTERN.search(text)
    rank = int(found.group(1)) if found else None
    # Every "(n)" occurrence is removed from the name, not just the first one.
    return rank, _RANK_PATTERN.sub("", text).strip()


def _parse_score(text):
    """Parse "H:A" into two ints; return ``(None, None)`` for anything else."""
    try:
        home_goals, away_goals = map(int, text.split(":"))
    except ValueError:
        # Raised for non-numeric parts and for a wrong number of ":"-separated parts.
        return None, None
    return home_goals, away_goals


def parse_match_cells(cells):
    """
    Turn the raw cell texts of one match row into structured fields.

    The cells are read by fixed position: 0 and 1 for the home team (long and
    short name), 4 for the result, 7 and 8 for the away team (long and short
    name). Missing positions are treated as empty strings.

    Args:
        cells: List of cell text strings of one table row.

    Returns:
        A dict with the keys ``home_rank``, ``home_team_long``,
        ``home_team_short``, ``home_goals``, ``away_goals``,
        ``away_team_long``, ``away_team_short`` and ``away_rank``.
        Ranks are ints taken from a parenthesised number in the long-name cell
        (None if there is none). Goals are ints, or None when the result cell
        is not of the form "H:A". Names have every "(n)" removed and are stripped.
    """
    home_rank, home_team_long = _split_rank_and_name(_cell_text(cells, 0))
    _, home_team_short = _split_rank_and_name(_cell_text(cells, 1))

    home_goals, away_goals = _parse_score(_cell_text(cells, 4))

    away_rank, away_team_long = _split_rank_and_name(_cell_text(cells, 7))
    _, away_team_short = _split_rank_and_name(_cell_text(cells, 8))

    return {
        "home_rank": home_rank,
        "home_team_long": home_team_long,
        "home_team_short": home_team_short,
        "home_goals": home_goals,
        "away_goals": away_goals,
        "away_team_long": away_team_long,
        "away_team_short": away_team_short,
        "away_rank": away_rank
    }


In [None]:
def get_all_matchdays(season_id, start_day, end_day):
    """
    Collect the parsed matches of every matchday from start_day to end_day (inclusive).

    Each matchday is fetched with get_all_matches_for_day and every returned row
    is run through parse_match_cells; the season id and matchday number are then
    added to the parsed dict. A row that fails to parse is reported via print
    and skipped.

    Returns:
        A list of dicts, one per successfully parsed match.
    """
    collected = []
    matchday_numbers = range(start_day, end_day+1)
    for spieltag in matchday_numbers:
        print(f"Verarbeite Spieltag {spieltag} für Saison {season_id}...")
        raw_rows = get_all_matches_for_day(season_id, spieltag)
        for raw_row in raw_rows:
            try:
                record = parse_match_cells(raw_row["cells"])
                # Tag the record with its origin so rows stay identifiable in the table.
                record.update({"season_id": season_id, "spieltag": spieltag})
                collected.append(record)
            except Exception as e:
                print(f"Fehler beim Parsen von Spieltag {spieltag}: {e}")
    return collected

# Example: all matchdays of the 24/25 season (season ID 2024), matchday 1 through 38.
# Note: every matchday starts its own headless Chrome session and waits 5 seconds,
# so this call needs at least a few minutes.
all_data = get_all_matchdays(2024, 1, 38)

# Build a pandas DataFrame with one row per parsed match
df = pd.DataFrame(all_data)
df.head()


In [None]:
# Show the first rows of the DataFrame as plain text
print(df.head())

# Export as CSV (optional; uncomment if wanted)
#df.to_csv("season_2425_data.csv", index=False)
