# Scraping Serie A Team Formations from FBref

In [1]:
import io
import random
import time
from pathlib import Path

import pandas as pd
import requests

# Club name -> FBref squad identifier for the 2024-25 Serie A season.
# Built from ordered pairs so iteration order follows the listing below.
TEAM_IDS = dict([
    ("Napoli", "d48ad4ff"),
    ("Inter", "d609edc0"),
    ("Atalanta", "922493f3"),
    ("Juventus", "e0652b02"),
    ("Lazio", "7213da33"),
    ("Roma", "cf74a709"),
    ("Bologna", "1d8099f8"),
    ("AC Milan", "dc56fe14"),
    ("Fiorentina", "421387cf"),
    ("Como", "28c9c3cd"),
    ("Torino", "105360fe"),
    ("Udinese", "04eea015"),
    ("Genoa", "658bf2de"),
    ("Cagliari", "c4260e09"),
    ("Hellas Verona", "0e72edf2"),
    ("Parma", "eab4234c"),
    ("Venezia", "af5d5982"),
    ("Lecce", "ffcbe334"),
    ("Empoli", "a3d88bd8"),
    ("Monza", "21680aa4"),
])

# Squad page URL template; `team_id` and `slug` are filled in per club.
BASE_URL = (
    "https://fbref.com/en/squads/"
    "{team_id}/{slug}-Stats"
)

# Request headers carrying a desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

def try_scrape_formation(team_name, team_id, retries=3, delay=3, timeout=30):
    """Return the formation a team used most often, read from its FBref squad page.

    The page is downloaded once per attempt with ``requests`` (sending
    ``HEADERS``), and the table whose HTML id is ``matchlogs_for`` is parsed
    from that same response body.

    Args:
        team_name: Display name of the club; also used to build the URL slug.
        team_id: FBref squad identifier substituted into ``BASE_URL``.
        retries: Maximum number of download/parse attempts.
        delay: Seconds to wait after a failed attempt before the next one.
        timeout: Seconds to wait for the HTTP response on each attempt.

    Returns:
        The most frequent value in the ``Formation`` column; the string
        ``"Formation column missing"`` when the table has no such column; or
        ``"Error"`` when every attempt failed or the column holds no values.
    """
    slug = team_name.replace(" ", "-").replace(".", "").replace("&", "and")
    url = BASE_URL.format(team_id=team_id, slug=slug)

    for attempt in range(retries):
        try:
            print(f"Scraping formation data for {team_name} (Attempt {attempt+1}) → {url}")
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            time.sleep(random.uniform(4, 10))  # Random delay to avoid being blocked
            # Turn 4xx/5xx answers into exceptions so they go through the retry path
            # instead of being parsed as if they were the squad page.
            response.raise_for_status()
            # Parse the body we already have. Handing the URL to read_html would
            # download the page a second time, and without HEADERS.
            matchlog_df = pd.read_html(
                io.StringIO(response.text), attrs={"id": "matchlogs_for"}
            )[0]
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1} failed for {team_name}: {e}")
            if attempt < retries - 1:
                time.sleep(delay)  # back off before the next attempt
            continue

        if "Formation" not in matchlog_df.columns:
            return "Formation column missing"

        formation_counts = matchlog_df["Formation"].value_counts()
        if formation_counts.empty:
            # Nothing to rank; downloading the same page again would not help.
            print(f"⚠️ No formation values found for {team_name}")
            return "Error"
        return formation_counts.idxmax()

    return "Error"

# Scrape every club in TEAM_IDS and record one row per team.
formation_data = [
    {
        "Team": club,
        "Most Common Formation": try_scrape_formation(club, squad_id),
    }
    for club, squad_id in TEAM_IDS.items()
]

formation_df = pd.DataFrame(formation_data)

# Preview the first rows as the cell's output.
formation_df.head()

Scraping formation data for Napoli (Attempt 1) → https://fbref.com/en/squads/d48ad4ff/Napoli-Stats
Scraping formation data for Inter (Attempt 1) → https://fbref.com/en/squads/d609edc0/Inter-Stats
Scraping formation data for Atalanta (Attempt 1) → https://fbref.com/en/squads/922493f3/Atalanta-Stats
Scraping formation data for Juventus (Attempt 1) → https://fbref.com/en/squads/e0652b02/Juventus-Stats
Scraping formation data for Lazio (Attempt 1) → https://fbref.com/en/squads/7213da33/Lazio-Stats
Scraping formation data for Roma (Attempt 1) → https://fbref.com/en/squads/cf74a709/Roma-Stats
Scraping formation data for Bologna (Attempt 1) → https://fbref.com/en/squads/1d8099f8/Bologna-Stats
Scraping formation data for AC Milan (Attempt 1) → https://fbref.com/en/squads/dc56fe14/AC-Milan-Stats
Scraping formation data for Fiorentina (Attempt 1) → https://fbref.com/en/squads/421387cf/Fiorentina-Stats
Scraping formation data for Como (Attempt 1) → https://fbref.com/en/squads/28c9c3cd/Como-Stats


Unnamed: 0,Team,Most Common Formation
0,Napoli,4-3-3
1,Inter,3-5-2
2,Atalanta,3-4-3
3,Juventus,4-2-3-1
4,Lazio,4-2-3-1


In [None]:
# Keep the destination in one place so the file written and the message printed
# cannot drift apart.
FORMATIONS_CSV_PATH = "../../../data/teams/raw/formations/serie_a_team_formations.csv"

# to_csv does not create missing folders, so make sure the target directory exists.
Path(FORMATIONS_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)

formation_df.to_csv(FORMATIONS_CSV_PATH, index=False)
print(f"✅ Saved team formations to {FORMATIONS_CSV_PATH}")

✅ Saved team formations to ../../data/teams/raw/formations/serie_a_team_formations.csv
