# Scraping Ligue 1 Team Formations from FBref

In [1]:
import io
import random
import time

import pandas as pd
import requests

# Ligue 1 2024-25 clubs mapped to their FBref squad identifiers
TEAM_IDS = {
    "Paris Saint-Germain": "e2d8892c",
    "Marseille": "5725cc7b",
    "Monaco": "fd6114db",
    "Nice": "132ebc33",
    "Lille": "cb188c0c",
    "Strasbourg": "c0d3eab4",
    "Lyon": "d53c0b06",
    "Brest": "fb08dbb3",
    "Lens": "fd4e0f7d",
    "Auxerre": "5ae09109",
    "Rennes": "b3072e00",
    "Toulouse": "3f8c4b5f",
    "Angers": "69236f98",
    "Reims": "7fdd64e0",
    "Nantes": "d7a486cd",
    "Le Havre": "5c2737db",
    "Saint-Étienne": "d298ef2c",
    "Montpellier": "281b0e73",
}

# Squad page URL template; filled in with a squad id and a name slug
BASE_URL = "https://fbref.com/en/squads/{team_id}/{slug}-Stats"

# Request headers carrying a desktop-browser User-Agent string
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

# Scrape the most common formation for a single team
def try_scrape_formation(team_name, team_id, retries=3, delay=3):
    """Return the formation a team used most often, scraped from its FBref squad page.

    Args:
        team_name: Display name of the team; also used to build the URL slug.
        team_id: FBref squad identifier substituted into ``BASE_URL``.
        retries: Maximum number of download attempts.
        delay: Seconds to wait after a failed attempt before the next one.

    Returns:
        The most frequent value in the ``Formation`` column of the page's
        ``matchlogs_for`` table, ``"Formation column missing"`` when that
        table has no such column, or ``"Error"`` when every attempt failed.
    """
    slug = team_name.replace(" ", "-").replace(".", "").replace("&", "and")
    url = BASE_URL.format(team_id=team_id, slug=slug)

    for attempt in range(retries):
        try:
            print(f"Scraping formation data for {team_name} (Attempt {attempt+1}) → {url}")
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=HEADERS, timeout=30)
            time.sleep(random.uniform(4, 10))  # Random delay to avoid being blocked
            # Turn HTTP error statuses into exceptions so they are reported
            # and retried instead of being parsed as if they were the page.
            response.raise_for_status()
            # Parse the body that was already downloaded with HEADERS; passing
            # the URL to read_html would issue a second request without them.
            matchlog_df = pd.read_html(
                io.StringIO(response.text), attrs={"id": "matchlogs_for"}
            )[0]
            if "Formation" in matchlog_df.columns:
                return matchlog_df["Formation"].value_counts().idxmax()
            return "Formation column missing"
        except Exception as e:
            # Best-effort scraper: report the failure and move on to a retry.
            print(f"⚠️ Attempt {attempt+1} failed for {team_name}: {e}")
            if attempt < retries - 1:
                time.sleep(delay)  # Back off before the next attempt
    return "Error"

# Scrape every club in turn and gather one record per team
formation_data = [
    {"Team": club, "Most Common Formation": try_scrape_formation(club, club_id)}
    for club, club_id in TEAM_IDS.items()
]

# Assemble the records into a table and preview the first rows
formation_df = pd.DataFrame(formation_data)

formation_df.head()

Scraping formation data for Paris Saint-Germain (Attempt 1) → https://fbref.com/en/squads/e2d8892c/Paris-Saint-Germain-Stats
Scraping formation data for Marseille (Attempt 1) → https://fbref.com/en/squads/5725cc7b/Marseille-Stats
Scraping formation data for Monaco (Attempt 1) → https://fbref.com/en/squads/fd6114db/Monaco-Stats
Scraping formation data for Nice (Attempt 1) → https://fbref.com/en/squads/132ebc33/Nice-Stats
Scraping formation data for Lille (Attempt 1) → https://fbref.com/en/squads/cb188c0c/Lille-Stats
Scraping formation data for Strasbourg (Attempt 1) → https://fbref.com/en/squads/c0d3eab4/Strasbourg-Stats
Scraping formation data for Lyon (Attempt 1) → https://fbref.com/en/squads/d53c0b06/Lyon-Stats
Scraping formation data for Brest (Attempt 1) → https://fbref.com/en/squads/fb08dbb3/Brest-Stats
Scraping formation data for Lens (Attempt 1) → https://fbref.com/en/squads/fd4e0f7d/Lens-Stats
Scraping formation data for Auxerre (Attempt 1) → https://fbref.com/en/squads/5ae0910

Unnamed: 0,Team,Most Common Formation
0,Paris Saint-Germain,4-3-3
1,Marseille,3-4-3
2,Monaco,4-2-3-1
3,Nice,3-4-3
4,Lille,4-2-3-1


In [None]:
# Write the table without its index column, then confirm where it went
output_path = "../../../data/teams/raw/formations/ligue_1_team_formations.csv"
formation_df.to_csv(output_path, index=False)
print(f"✅ Saved team formations to {output_path}")

✅ Saved team formations to ../../data/teams/raw/formations/ligue_1_team_formations.csv
