# Scraping Premier League Team Formations from FBref

In [1]:
import random
import time
from io import StringIO

import pandas as pd
import requests

# FBref team IDs for Premier League 2024-25
# Maps a team's display name to the ID substituted into BASE_URL's {team_id}
# placeholder. try_scrape_formation also derives the URL slug from the name.
TEAM_IDS = {
    "Arsenal": "18bb7c10",
    "Aston Villa": "8602292d",
    "Bournemouth": "4ba7cbea",
    "Brentford": "cd051869",
    "Brighton": "d07537b9",
    "Chelsea": "cff3d9bb",
    "Crystal Palace": "47c64c55",
    "Everton": "d3fd31cc",
    "Fulham": "fd962109",
    "Ipswich Town": "b74092de",
    "Leicester City": "a2d435b3",
    "Liverpool": "822bd0ba",
    "Manchester City": "b8fd03ef",
    "Manchester United": "19538871",
    "Newcastle Utd": "b2b47a98",
    "Nottingham Forest": "e4a775cb",
    "Southampton": "33c895d4",
    "Tottenham": "361ca564",
    "West Ham": "7c21e445",
    "Wolves": "8cec06e1"
}

# Squad-page URL template; try_scrape_formation fills in {team_id} and {slug}.
BASE_URL = "https://fbref.com/en/squads/{team_id}/{slug}-Stats"
# Headers passed to requests.get for every page fetch (a desktop-Chrome User-Agent).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

# Scrape one team's most common formation from its FBref match log
def try_scrape_formation(team_name, team_id, retries=3, delay=3):
    """Return the formation a team used most often, per its FBref match log.

    The squad page is fetched once with ``HEADERS`` and the table whose HTML
    id is ``matchlogs_for`` is parsed from that response body.

    Args:
        team_name: Display name of the team; also used to build the URL slug.
        team_id: FBref team ID substituted into ``BASE_URL``.
        retries: Maximum number of fetch/parse attempts.
        delay: Seconds to wait after a failed attempt before trying again.

    Returns:
        The most frequent value of the table's ``Formation`` column,
        ``"Formation column missing"`` if the table has no such column, or
        ``"Error"`` if every attempt raised an exception.
    """
    slug = team_name.replace(" ", "-").replace(".", "").replace("&", "and")
    url = BASE_URL.format(team_id=team_id, slug=slug)

    for attempt in range(retries):
        try:
            print(f"Scraping formation data for {team_name} (Attempt {attempt+1}) → {url}")
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=HEADERS, timeout=30)
            time.sleep(random.uniform(4, 10))  # Random delay to avoid being blocked
            # Surface 4xx/5xx responses as errors instead of parsing an error page.
            response.raise_for_status()
            # Parse the body we already downloaded. Passing the URL to read_html
            # would fetch the page a second time without HEADERS.
            matchlog_df = pd.read_html(
                StringIO(response.text), attrs={"id": "matchlogs_for"}
            )[0]
            if "Formation" in matchlog_df.columns:
                return matchlog_df["Formation"].value_counts().idxmax()
            return "Formation column missing"
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1} failed for {team_name}: {e}")
            # Back off before the next attempt; nothing to wait for after the last one.
            if attempt < retries - 1:
                time.sleep(delay)
    return "Error"

# Collect most common formations
# One record per team, scraped sequentially in TEAM_IDS order.
formation_data = [
    {"Team": club, "Most Common Formation": try_scrape_formation(club, club_id)}
    for club, club_id in TEAM_IDS.items()
]

formation_df = pd.DataFrame(formation_data)

formation_df.head()

Scraping formation data for Arsenal (Attempt 1) → https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
Scraping formation data for Aston Villa (Attempt 1) → https://fbref.com/en/squads/8602292d/Aston-Villa-Stats
Scraping formation data for Bournemouth (Attempt 1) → https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats
Scraping formation data for Brentford (Attempt 1) → https://fbref.com/en/squads/cd051869/Brentford-Stats
Scraping formation data for Brighton (Attempt 1) → https://fbref.com/en/squads/d07537b9/Brighton-Stats
Scraping formation data for Chelsea (Attempt 1) → https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats
Scraping formation data for Crystal Palace (Attempt 1) → https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats
Scraping formation data for Everton (Attempt 1) → https://fbref.com/en/squads/d3fd31cc/Everton-Stats
Scraping formation data for Fulham (Attempt 1) → https://fbref.com/en/squads/fd962109/Fulham-Stats
Scraping formation data for Ipswich Town (Attempt 1) → ht

Unnamed: 0,Team,Most Common Formation
0,Arsenal,4-3-3
1,Aston Villa,4-2-3-1
2,Bournemouth,4-2-3-1
3,Brentford,4-2-3-1
4,Brighton,4-2-3-1


In [2]:
# Write the scraped formations to the raw-data folder (path is relative to this notebook).
formations_csv_path = "../../data/teams/raw/formations/pl_team_formations.csv"
formation_df.to_csv(formations_csv_path, index=False)
print(f"✅ Saved team formations to {formations_csv_path}")

✅ Saved team formations to ../../data/teams/raw/formations/pl_team_formations.csv
