# Scraping Bundesliga Team Formations from FBref

In [1]:
import os
import random
import time
from io import StringIO

import pandas as pd
import requests

# FBref team IDs for Bundesliga 2024-25
# Maps each club's display name to the ID segment used in its FBref squad URL.
# The display name is also hyphenated into the URL slug by
# try_scrape_formation and ends up in the "Team" column of the results table.
TEAM_IDS = {
    "Bayern Munich": "054efa67",
    "Bayer Leverkusen": "c7a9f859",
    "Eintracht Frankfurt": "f0ac8ee6",
    "Freiburg": "a486e511",
    "Borussia Dortmund": "add600ae",
    "Mainz 05": "a224b06a",
    "RB Leipzig": "acbb6a5b",
    "Werder Bremen": "62add3bf",
    "Stuttgart": "598bc722",
    "Monchengladbach": "32f3ee20",
    "Augsburg": "0cdc4311",
    "Wolfsburg": "4eaa11d7",
    "Union Berlin": "7a41008f",
    "St Pauli": "54864664",
    "Hoffenheim": "033ea6b8",
    "Heidenheim": "18d9d2a7",
    "Holstein Kiel": "2ac661d9",
    "Bochum": "b42c6323"
}

# URL template for a team's FBref squad page: {team_id} is a value from
# TEAM_IDS and {slug} is the hyphenated team name built in try_scrape_formation.
BASE_URL = "https://fbref.com/en/squads/{team_id}/{slug}-Stats"
# Headers passed to requests.get; the User-Agent is a desktop Chrome string
# (presumably so the site does not reject the default client UA; unverified).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

# Fetch one team's FBref squad page and pick its most-used formation
def try_scrape_formation(team_name, team_id, retries=3, delay=3, timeout=30):
    """Return the formation a team lined up in most often according to FBref.

    Each attempt downloads the squad page exactly once with ``HEADERS`` and
    parses the table whose HTML id is ``matchlogs_for`` from that downloaded
    body, so the page is not requested a second time without the headers.

    Args:
        team_name: Display name; spaces become hyphens (and "." / "&" are
            normalised) to form the URL slug.
        team_id: FBref squad ID inserted into ``BASE_URL``.
        retries: Maximum number of attempts.
        delay: Base back-off in seconds after a failed attempt; it is
            multiplied by the attempt number and skipped after the last try.
        timeout: Seconds to wait on the server before the request counts as
            a failed attempt.

    Returns:
        The most frequent value in the ``Formation`` column, the string
        ``"Formation column missing"`` when the table has no such column, or
        ``"Error"`` when every attempt failed.
    """
    slug = team_name.replace(" ", "-").replace(".", "").replace("&", "and")
    url = BASE_URL.format(team_id=team_id, slug=slug)

    for attempt in range(retries):
        try:
            print(f"Scraping formation data for {team_name} (Attempt {attempt+1}) → {url}")
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            # Turn HTTP error statuses (e.g. 403/429) into a failed attempt
            # instead of trying to parse an error page.
            response.raise_for_status()
            # Parse the body already downloaded; handing pandas the URL would
            # trigger a second request that does not carry HEADERS.
            matchlog_df = pd.read_html(StringIO(response.text), attrs={"id": "matchlogs_for"})[0]
            if "Formation" in matchlog_df.columns:
                return matchlog_df["Formation"].value_counts().idxmax()
            return "Formation column missing"
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1} failed for {team_name}: {e}")
            if attempt < retries - 1:
                # Linear back-off before the next attempt.
                time.sleep(delay * (attempt + 1))
        finally:
            # Random pause after every attempt (successful or not) so
            # consecutive requests are spaced out.
            time.sleep(random.uniform(4, 10))
    return "Error"

# Build one record per club (in TEAM_IDS order), then tabulate the results
formation_data = [
    {
        "Team": team_name,
        "Most Common Formation": try_scrape_formation(team_name, team_id),
    }
    for team_name, team_id in TEAM_IDS.items()
]
formation_df = pd.DataFrame(formation_data)

formation_df.head()

Scraping formation data for Bayern Munich (Attempt 1) → https://fbref.com/en/squads/054efa67/Bayern-Munich-Stats
Scraping formation data for Bayer Leverkusen (Attempt 1) → https://fbref.com/en/squads/c7a9f859/Bayer-Leverkusen-Stats
Scraping formation data for Eintracht Frankfurt (Attempt 1) → https://fbref.com/en/squads/f0ac8ee6/Eintracht-Frankfurt-Stats
Scraping formation data for Freiburg (Attempt 1) → https://fbref.com/en/squads/a486e511/Freiburg-Stats
Scraping formation data for Borussia Dortmund (Attempt 1) → https://fbref.com/en/squads/add600ae/Borussia-Dortmund-Stats
Scraping formation data for Mainz 05 (Attempt 1) → https://fbref.com/en/squads/a224b06a/Mainz-05-Stats
Scraping formation data for RB Leipzig (Attempt 1) → https://fbref.com/en/squads/acbb6a5b/RB-Leipzig-Stats
Scraping formation data for Werder Bremen (Attempt 1) → https://fbref.com/en/squads/62add3bf/Werder-Bremen-Stats
Scraping formation data for Stuttgart (Attempt 1) → https://fbref.com/en/squads/598bc722/Stuttga

Unnamed: 0,Team,Most Common Formation
0,Bayern Munich,4-2-3-1
1,Bayer Leverkusen,3-4-3
2,Eintracht Frankfurt,4-4-2
3,Freiburg,4-2-3-1
4,Borussia Dortmund,4-2-3-1


In [2]:
# Write the results table to CSV. The path is defined once so the file that is
# written and the path that is reported cannot drift apart.
OUTPUT_CSV = "../../data/teams/raw/formations/bundesliga_team_formations.csv"
# to_csv does not create missing parent folders, so make sure they exist first.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
formation_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved team formations to {OUTPUT_CSV}")

✅ Saved team formations to ../../data/teams/raw/formations/bundesliga_team_formations.csv
