# Serie A Top Goalscorer and Assister

In [1]:
import pandas as pd
import time
import random

# Lookup table: club display name -> URL of that club's FBref squad page.
# Built from (name, url) pairs; insertion order is preserved, so iterating the
# mapping visits the clubs in the order listed here.
team_url_map = dict(
    (
        ("Atalanta", "https://fbref.com/en/squads/922493f3/Atalanta-Stats"),
        ("Bologna", "https://fbref.com/en/squads/1d8099f8/Bologna-Stats"),
        ("Cagliari", "https://fbref.com/en/squads/c4260e09/Cagliari-Stats"),
        ("Como", "https://fbref.com/en/squads/28c9c3cd/Como-Stats"),
        ("Empoli", "https://fbref.com/en/squads/a3d88bd8/Empoli-Stats"),
        ("Fiorentina", "https://fbref.com/en/squads/421387cf/Fiorentina-Stats"),
        ("Genoa", "https://fbref.com/en/squads/658bf2de/Genoa-Stats"),
        ("Hellas Verona", "https://fbref.com/en/squads/0e72edf2/Hellas-Verona-Stats"),
        ("Inter Milan", "https://fbref.com/en/squads/d609edc0/Internazionale-Stats"),
        ("Juventus", "https://fbref.com/en/squads/e0652b02/Juventus-Stats"),
        ("Lazio", "https://fbref.com/en/squads/7213da33/Lazio-Stats"),
        ("Lecce", "https://fbref.com/en/squads/ffcbe334/Lecce-Stats"),
        ("AC Milan", "https://fbref.com/en/squads/dc56fe14/AC-Milan-Stats"),
        ("Monza", "https://fbref.com/en/squads/21680aa4/Monza-Stats"),
        ("Napoli", "https://fbref.com/en/squads/d48ad4ff/Napoli-Stats"),
        ("Parma", "https://fbref.com/en/squads/eab4234c/Parma-Stats"),
        ("Roma", "https://fbref.com/en/squads/cf74a709/Roma-Stats"),
        ("Torino", "https://fbref.com/en/squads/105360fe/Torino-Stats"),
        ("Udinese", "https://fbref.com/en/squads/04eea015/Udinese-Stats"),
        ("Venezia", "https://fbref.com/en/squads/af5d5982/Venezia-Stats"),
    )
)

In [2]:
# Scrape settings, gathered here so the loop below contains no magic values.
RESULT_COLUMNS = ["Team", "Top Goalscorer", "Goals", "Top Assister", "Assists"]
# HTML id of the table read from every squad page.
# NOTE(review): the "_11" suffix looks like FBref's competition id for Serie A —
# confirm before reusing this notebook for another league.
STATS_TABLE_ID = "stats_standard_11"
MAX_ATTEMPTS = 3  # tries per team before the team is reported as failed


def top_scorer_and_assister(stats):
    """Pick the leading goalscorer and assister out of one squad stats table.

    Parameters
    ----------
    stats : pd.DataFrame
        Table as returned by ``pd.read_html``. It must expose "Player", "Gls"
        and "Ast" columns, either directly or as the last level of a
        multi-level header. The frame passed in is not modified.

    Returns
    -------
    dict
        Keys "Top Goalscorer", "Goals", "Top Assister" and "Assists"; the two
        totals are returned as floats. Ties go to the player listed first.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If no row has a numeric goal total, or none has a numeric assist total.
    """
    table = stats.copy()

    # Flatten a multi-level header down to its innermost labels.
    if isinstance(table.columns, pd.MultiIndex):
        table.columns = table.columns.get_level_values(-1)

    # Flattening can yield repeated labels; keep only the first occurrence of
    # each so that "Gls"/"Ast" each resolve to a single column.
    table = table.loc[:, ~table.columns.duplicated()]

    # Remove rows like 'Squad Total', 'Opponent Total'.
    table = table[~table["Player"].str.contains("Total", na=False)]

    # Coerce both stat columns (non-numeric cells become NaN) and keep only
    # rows with a real goal total. `assign` builds a new frame, which avoids
    # writing into a filtered slice.
    players = (
        table.assign(
            Gls=pd.to_numeric(table["Gls"], errors="coerce"),
            Ast=pd.to_numeric(table["Ast"], errors="coerce"),
        )
        .dropna(subset=["Gls"])
        .reset_index(drop=True)
    )

    if players.empty:
        raise ValueError("no player rows with a numeric 'Gls' value")
    if players["Ast"].isna().all():
        raise ValueError("no player rows with a numeric 'Ast' value")

    # idxmax ignores NaN and returns the first row holding the maximum, so the
    # choice is deterministic and needs no full sort.
    scorer = players.loc[players["Gls"].idxmax()]
    assister = players.loc[players["Ast"].idxmax()]

    return {
        "Top Goalscorer": scorer["Player"],
        "Goals": float(scorer["Gls"]),
        "Top Assister": assister["Player"],
        "Assists": float(assister["Ast"]),
    }


# Loop over each team and scrape data. Rows are collected in a list and the
# result frame is built once at the end, so a team that fails simply has no row
# (and leaves no gap in the index).
records = []
teams = list(team_url_map.items())

for position, (team, url) in enumerate(teams):
    summary = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            stats = pd.read_html(url, attrs={"id": STATS_TABLE_ID})[0]
            summary = top_scorer_and_assister(stats)
        except Exception as e:
            # Best-effort scrape: report the failure and retry rather than
            # aborting the whole run.
            print(f"⚠ Attempt {attempt} failed for {team}: {e}")
            # Back off only when another attempt will actually follow.
            if attempt < MAX_ATTEMPTS:
                time.sleep(random.uniform(2, 5))
        else:
            break

    if summary is None:
        print(f"❌ Failed to process {team} after {MAX_ATTEMPTS} attempts.")
    else:
        records.append({"Team": team, **summary})
        print(f"✔ Processed {team}")

    # Randomised pause between teams; pointless after the last one.
    if position < len(teams) - 1:
        time.sleep(random.uniform(4, 10))

# `columns=` fixes the column order and keeps the frame well-formed even when
# every team failed and `records` is empty.
df = pd.DataFrame(records, columns=RESULT_COLUMNS)

# Show results
print("\n✅ Final Result:")
print(df.head())

✔ Processed Atalanta
✔ Processed Bologna
✔ Processed Cagliari
✔ Processed Como
✔ Processed Empoli
✔ Processed Fiorentina
✔ Processed Genoa
✔ Processed Hellas Verona
✔ Processed Inter Milan
✔ Processed Juventus
✔ Processed Lazio
✔ Processed Lecce
✔ Processed AC Milan
✔ Processed Monza
✔ Processed Napoli
✔ Processed Parma
✔ Processed Roma
✔ Processed Torino
✔ Processed Udinese
✔ Processed Venezia

✅ Final Result:
       Team       Top Goalscorer Goals     Top Assister Assists
0  Atalanta        Mateo Retegui  25.0  Raoul Bellanova     8.0
1   Bologna    Riccardo Orsolini  14.0     Juan Miranda     6.0
2  Cagliari      Roberto Piccoli  10.0  Tommaso Augello     7.0
3      Como          Assane Diao   8.0      Nicolás Paz     8.0
4    Empoli  Sebastiano Esposito   8.0  Liberato Cacace     4.0


In [3]:
# Make sure rows carry a gap-free 0..n-1 index before display and export.
# Reassigning is used instead of `inplace=True`, which pandas discourages.
df = df.reset_index(drop=True)
df

Unnamed: 0,Team,Top Goalscorer,Goals,Top Assister,Assists
0,Atalanta,Mateo Retegui,25.0,Raoul Bellanova,8.0
1,Bologna,Riccardo Orsolini,14.0,Juan Miranda,6.0
2,Cagliari,Roberto Piccoli,10.0,Tommaso Augello,7.0
3,Como,Assane Diao,8.0,Nicolás Paz,8.0
4,Empoli,Sebastiano Esposito,8.0,Liberato Cacace,4.0
5,Fiorentina,Moise Kean,18.0,Yacine Adli,6.0
6,Genoa,Andrea Pinamonti,10.0,Aarón Martín,7.0
7,Hellas Verona,Casper Tengstedt,6.0,Darko Lazović,3.0
8,Inter Milan,Marcus Thuram,14.0,Federico Dimarco,7.0
9,Juventus,Dušan Vlahović,10.0,Khéphren Thuram,5.0


In [4]:
# Write the per-team summary to the raw-data folder. The path is relative to
# the notebook's working directory; the row index is left out of the file.
output_csv = "../../data/teams/raw/goals_assists/serie_a_top_scorers_assisters.csv"
df.to_csv(output_csv, index=False)