# Scraping La Liga Team Formations from FBref

In [1]:
import pandas as pd
import requests
import random
import time

# Mapping of La Liga 2024-25 team name -> FBref team ID
TEAM_IDS = {
    "Barcelona": "206d90db",
    "Real Madrid": "53a2f082",
    "Atletico Madrid": "db3b9613",
    "Athletic Bilbao": "2b390eca",
    "Villarreal": "2a8183b3",
    "Real Betis": "fc536746",
    "Celta Vigo": "f25da7fb",
    "Rayo Vallecano": "98e8af82",
    "Mallorca": "2aa12281",
    "Osasuna": "03c57e2b",
    "Valencia": "dcc91a7b",
    "Real Sociedad": "e31d1cd9",
    "Girona": "9024a00a",
    "Sevilla": "ad2be733",
    "Getafe": "7848bd64",
    "Espanyol": "a8661628",
    "Alaves": "8d6fd021",
    "Leganes": "7c6f2c78",
    "Las Palmas": "0049d422",
    "Valladolid": "17859612",
}

# Squad-page URL template; filled in with a team ID and a hyphenated name slug
BASE_URL = "https://fbref.com/en/squads/{team_id}/{slug}-Stats"

# Browser-like User-Agent sent with each request (literal split only for line length)
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

# Scrape a single team's most common formation, retrying on failure
def try_scrape_formation(team_name, team_id, retries=3, delay=3):
    """Return the formation a team used most often, scraped from its FBref squad page.

    The page is downloaded once with ``requests`` (so ``HEADERS`` are actually
    sent) and the table with id ``matchlogs_for`` is parsed from that response
    body.

    Args:
        team_name: Display name of the team; also used to build the URL slug.
        team_id: FBref squad identifier substituted into ``BASE_URL``.
        retries: Maximum number of download/parse attempts.
        delay: Seconds to wait after a failed attempt before the next one.

    Returns:
        The most frequent value in the table's ``Formation`` column, the string
        ``"Formation column missing"`` if the table has no such column, or
        ``"Error"`` if every attempt raised an exception.
    """
    # Local import: only needed to hand the downloaded HTML to pandas as a
    # file-like object (literal HTML strings are deprecated in pandas.read_html).
    from io import StringIO

    slug = team_name.replace(" ", "-").replace(".", "").replace("&", "and")
    url = BASE_URL.format(team_id=team_id, slug=slug)

    for attempt in range(retries):
        try:
            print(f"Scraping formation data for {team_name} (Attempt {attempt+1}) → {url}")
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, headers=HEADERS, timeout=30)
            time.sleep(random.uniform(4, 10))  # Random delay to avoid being blocked
            # Turn HTTP errors (403/429/5xx) into an exception so they are
            # reported and retried instead of being parsed as if they were data.
            response.raise_for_status()
            # Parse the body already downloaded; passing the URL would make
            # pandas fetch the page a second time, without HEADERS.
            matchlog_df = pd.read_html(StringIO(response.text), attrs={"id": "matchlogs_for"})[0]
            if "Formation" in matchlog_df.columns:
                return matchlog_df["Formation"].value_counts().idxmax()
            return "Formation column missing"
        except Exception as e:
            # Deliberately broad: this is a best-effort scrape, so any failure
            # is logged and retried rather than aborting the whole run.
            print(f"⚠️ Attempt {attempt+1} failed for {team_name}: {e}")
            # Back off before retrying; skipped after the final attempt.
            if attempt < retries - 1:
                time.sleep(delay)
    return "Error"

# Scrape every team in TEAM_IDS, one record per team, then tabulate the results
formation_data = [
    {"Team": club, "Most Common Formation": try_scrape_formation(club, squad_id)}
    for club, squad_id in TEAM_IDS.items()
]

formation_df = pd.DataFrame(formation_data)

# Preview the first rows
formation_df.head()

Scraping formation data for Barcelona (Attempt 1) → https://fbref.com/en/squads/206d90db/Barcelona-Stats
Scraping formation data for Real Madrid (Attempt 1) → https://fbref.com/en/squads/53a2f082/Real-Madrid-Stats
Scraping formation data for Atletico Madrid (Attempt 1) → https://fbref.com/en/squads/db3b9613/Atletico-Madrid-Stats
Scraping formation data for Athletic Bilbao (Attempt 1) → https://fbref.com/en/squads/2b390eca/Athletic-Bilbao-Stats
Scraping formation data for Villarreal (Attempt 1) → https://fbref.com/en/squads/2a8183b3/Villarreal-Stats
Scraping formation data for Real Betis (Attempt 1) → https://fbref.com/en/squads/fc536746/Real-Betis-Stats
Scraping formation data for Celta Vigo (Attempt 1) → https://fbref.com/en/squads/f25da7fb/Celta-Vigo-Stats
Scraping formation data for Rayo Vallecano (Attempt 1) → https://fbref.com/en/squads/98e8af82/Rayo-Vallecano-Stats
Scraping formation data for Mallorca (Attempt 1) → https://fbref.com/en/squads/2aa12281/Mallorca-Stats
Scraping form

Unnamed: 0,Team,Most Common Formation
0,Barcelona,4-2-3-1
1,Real Madrid,4-2-3-1
2,Atletico Madrid,4-4-2
3,Athletic Bilbao,4-2-3-1
4,Villarreal,4-4-2


In [2]:
# Write the formations table to CSV (without the index) and report the destination
formations_csv_path = "../../data/teams/raw/formations/la_liga_team_formations.csv"
formation_df.to_csv(formations_csv_path, index=False)
print(f"✅ Saved team formations to {formations_csv_path}")

✅ Saved team formations to ../../data/teams/raw/formations/la_liga_team_formations.csv
