# Combining Top Scorers and Top Assisters into a Single Dataset

In [1]:
import pandas as pd
import difflib

# === Load the main dataset ===
# Read the cleaned team table and, in the same expression, discard any
# scorer/assister columns it already carries. errors='ignore' turns the drop
# into a no-op for labels that are not present in the file.
all_league_df = pd.read_csv("../../data/teams/cleaned/all_league_teams.csv").drop(
    columns=["Top Goalscorer", "Goals", "Top Assister", "Assists"],
    errors='ignore',
)

# === Load league top scorer data ===
# One raw CSV per league, listed in the order the frames are stacked below.
league_csv_paths = [
    "../../data/teams/raw/goals_assists/pl_top_scorers_assisters.csv",
    "../../data/teams/raw/goals_assists/serie_a_top_scorers_assisters.csv",
    "../../data/teams/raw/goals_assists/la_liga_top_scorers_assisters.csv",
    "../../data/teams/raw/goals_assists/bundesliga_top_scorers_assisters.csv",
    "../../data/teams/raw/goals_assists/ligue_1_top_scorers_assisters.csv",
]

# Files are read in list order; each frame keeps its own per-league name.
pl, serie_a, la_liga, bundesliga, ligue_1 = (
    pd.read_csv(csv_path) for csv_path in league_csv_paths
)

# === Combine all top scorer/assister data ===
# ignore_index=True renumbers the stacked rows 0..n-1 instead of keeping
# each file's own row labels.
scorer_df = pd.concat(
    [pl, serie_a, la_liga, bundesliga, ligue_1],
    ignore_index=True,
)

# === Fuzzy match team names using difflib ===
# Candidate names from the scorer data. Missing values are removed first:
# difflib compares sequences and raises TypeError when handed a float NaN.
scorer_team_names = scorer_df["Team"].dropna().unique()

# Map each base-table team name to its closest scorer-table name.
# n=1 keeps only the best candidate; cutoff=0.7 rejects candidates whose
# similarity ratio is lower, in which case the team is left out of the mapping.
matched_teams = {}
for name in all_league_df["Team Name"].dropna().unique():
    match = difflib.get_close_matches(name, scorer_team_names, n=1, cutoff=0.7)
    if match:
        matched_teams[name] = match[0]

# Add matched team column to base dataset. Names absent from the mapping
# (no close match, or a missing team name) become NaN.
all_league_df["Matched Team"] = all_league_df["Team Name"].map(matched_teams)

# Scorer/assister columns that must not be present on the base table when it
# is merged with the scorer data. The ".1" labels are presumably duplicate-
# header variants created on CSV load -- TODO confirm.
cols_to_remove = [
    "Top Goalscorer", "Top Goalscorer.1",
    "Goals", "Goals.1",
    "Top Assister", "Top Assister.1",
    "Assists", "Assists.1",
]

# errors="ignore" skips any label that is not actually a column, so only the
# columns that exist are removed (in place, on the same frame object).
all_league_df.drop(columns=cols_to_remove, errors="ignore", inplace=True)

# === Merge on matched team names ===
# pandas matches null join keys against each other, so a scorer row with a
# missing "Team" would be attached to every base row whose "Matched Team" is
# NaN (i.e. every team the fuzzy match missed). Exclude such rows up front.
scorer_lookup = scorer_df.dropna(subset=["Team"])
merged_df = pd.merge(
    all_league_df,
    scorer_lookup,
    how="left",
    left_on="Matched Team",
    right_on="Team",
)
# NOTE(review): duplicate "Team" values in the scorer data would still
# duplicate base rows here; consider validate="many_to_one" once the raw
# files are confirmed to hold one row per team.

# === Clean up ===
# Remove the two join-key helper columns, plus any "_x"-suffixed stat columns
# that appear when the base frame still carries its own copies of them.
# errors='ignore' skips labels that are not present.
merged_df = merged_df.drop(
    columns=[
        "Matched Team", "Team",
        "Top Goalscorer_x", "Goals_x", "Top Assister_x", "Assists_x",
    ],
    errors='ignore',
)

# Restore the plain stat column names if the merge suffixed them with "_y";
# rename leaves the frame untouched for labels that do not exist.
merged_df = merged_df.rename(columns={
    "Top Goalscorer_y": "Top Goalscorer",
    "Goals_y": "Goals",
    "Top Assister_y": "Top Assister",
    "Assists_y": "Assists",
})

# Hand-entered scorer/assister values, keyed by the exact "Team Name" used in
# the base table. Each inner dict maps a stat column to the value written
# into that column for the team's row(s).
manual_patch = {
    "Bologna FC 1909": {
        "Top Goalscorer": "Riccardo Orsolini", "Goals": 11,
        "Top Assister": "Lewis Ferguson", "Assists": 6,
    },
    "Como 1907": {
        "Top Goalscorer": "Simone Verdi", "Goals": 10,
        "Top Assister": "Patrick Cutrone", "Assists": 5,
    },
    "Udinese Calcio": {
        "Top Goalscorer": "Lorenzo Lucca", "Goals": 9,
        "Top Assister": "Roberto Pereyra", "Assists": 5,
    },
    "Parma Calcio 1913": {
        "Top Goalscorer": "Dennis Man", "Goals": 11,
        "Top Assister": "Adrian Benedyczak", "Assists": 7,
    },
    "Cagliari Calcio": {
        "Top Goalscorer": "Gianluca Lapadula", "Goals": 8,
        "Top Assister": "Nahitan Nández", "Assists": 4,
    },
    "Real Betis Balompié": {
        "Top Goalscorer": "Willian José", "Goals": 9,
        "Top Assister": "Isco", "Assists": 7,
    },
    "RCD Espanyol Barcelona": {
        "Top Goalscorer": "Martin Braithwaite", "Goals": 17,
        "Top Assister": "Javi Puado", "Assists": 5,
    },
    "Deportivo Alavés": {
        "Top Goalscorer": "Samu Omorodion", "Goals": 7,
        "Top Assister": "Luis Rioja", "Assists": 4,
    },
}

# Overwrite the four stat columns for every row whose "Team Name" equals the
# patch key; a key that matches no row changes nothing.
for patched_team, overrides in manual_patch.items():
    team_rows = merged_df["Team Name"].eq(patched_team)
    # Inner dicts list the columns in the order: Top Goalscorer, Goals,
    # Top Assister, Assists -- so the writes happen in that order.
    for stat_column, stat_value in overrides.items():
        merged_df.loc[team_rows, stat_column] = stat_value

merged_df

Unnamed: 0,League,Team Name,MP,GF,GA,GD,xG,xGA,xGD,Shots/90,...,Tackles,Tackles Won,Challenges Attempted,Aerial Duel Win %,Most Common Formation,Logo URL,Top Goalscorer,Goals,Top Assister,Assists
0,Premier League,Manchester City,36,67,43,24,63.6,45.3,18.3,15.89,...,475,287,430,48.4,4-2-3-1,https://cdn.ssref.net/nocdn/tlogo/fb/b8fd03ef.png,Erling Haaland,21.0,Sávio,8.0
1,Premier League,Arsenal FC,36,66,33,33,57.0,32.3,24.7,14.14,...,563,335,483,50.5,4-3-3,https://cdn.ssref.net/nocdn/tlogo/fb/18bb7c10.png,Kai Havertz,9.0,Bukayo Saka,10.0
2,Premier League,Liverpool FC,36,83,37,46,77.8,34.6,43.2,16.86,...,614,380,626,52.8,4-2-3-1,https://cdn.ssref.net/nocdn/tlogo/fb/822bd0ba.png,Mohamed Salah,28.0,Mohamed Salah,18.0
3,Premier League,Chelsea FC,36,62,43,19,65.9,45.8,20.1,15.92,...,555,346,516,51.1,4-2-3-1,https://cdn.ssref.net/nocdn/tlogo/fb/cff3d9bb.png,Cole Palmer,15.0,Cole Palmer,8.0
4,Premier League,Tottenham Hotspur,36,63,59,4,56.2,59.7,-3.5,13.53,...,639,403,569,49.8,4-3-3,https://cdn.ssref.net/nocdn/tlogo/fb/361ca564.png,Brennan Johnson,11.0,Son Heung-min,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,Ligue 1,AJ Auxerre,33,47,48,-1,40.4,55.5,-15.1,11.15,...,642,394,583,49.3,5-4-1,https://cdn.ssref.net/nocdn/tlogo/fb/5ae09109.png,Gaëtan Perrin,10.0,Gaëtan Perrin,11.0
92,Ligue 1,Le Havre AC,33,37,69,-32,38.9,61.3,-22.4,10.03,...,610,379,546,50.6,4-2-3-1,https://cdn.ssref.net/nocdn/tlogo/fb/5c2737db.png,Abdoulaye Touré,10.0,Josué Casimir,4.0
93,Ligue 1,AS Saint-Étienne,32,35,74,-39,35.0,71.1,-36.1,10.21,...,562,356,571,46.7,4-3-3,https://cdn.ssref.net/nocdn/tlogo/fb/d298ef2c.png,Lucas Stassin,12.0,Zuriko Davitashvili,8.0
94,Ligue 1,Montpellier HSC,32,23,74,-51,32.7,60.8,-28.1,10.58,...,615,369,594,46.4,4-2-3-1,https://cdn.ssref.net/nocdn/tlogo/fb/281b0e73.png,Arnaud Nordin,4.0,Téji Savanier,5.0


In [2]:
# === Save final file ===
# Destination is the same cleaned CSV that the first cell reads, so this
# write replaces that file's contents with the merged table.
merged_output_path = "../../data/teams/cleaned/all_league_teams.csv"

# index=False keeps the DataFrame's row index out of the written file.
merged_df.to_csv(merged_output_path, index=False)

print("✅ Merged file saved as '../../data/teams/cleaned/all_league_teams.csv'")

✅ Merged file saved as '../../data/teams/cleaned/all_league_teams.csv'
