# Combining All Team Stats into a Single File

## Importing Libraries

In [1]:
import pandas as pd
from difflib import get_close_matches

## File Paths

In [2]:
# The master team list is both the input and the destination of this notebook:
# the merged table is written back over the file it was read from.
# NOTE(review): because of this, a second run starts from the already-merged
# file rather than the original team list — confirm this is intended.
output_path = "../../data/teams/cleaned/all_league_teams.csv"
team_list_path = output_path

## League Stats Files

In [3]:
# Maps each league name to its raw team-stats CSV. Every file name is the
# league name lowercased with spaces turned into underscores, followed by
# "_team_stats.csv" (e.g. "Ligue 1" -> "ligue_1_team_stats.csv").
league_files = {
    league_name: "../../data/teams/raw/team_stats/"
    + league_name.lower().replace(" ", "_")
    + "_team_stats.csv"
    for league_name in (
        "Premier League",
        "Serie A",
        "La Liga",
        "Bundesliga",
        "Ligue 1",
    )
}

## Merging Files

In [4]:
# Hand-written aliases used before any fuzzy matching is attempted.
# Key:   a "Team Name" value from the master team list.
# Value: the exact "Team" value to look up in the league stats file.
# Each key must appear only once: a repeated key in a dict literal silently
# keeps the last value and discards the earlier one. "Real Betis Balompié" was
# previously listed twice ("Real Betis", then "Betis"); only the value that
# actually took effect ("Betis") is kept here.
manual_matches = {
    "Brighton & Hove Albion": "Brighton",
    "West Ham United": "West Ham",
    "Nottingham Forest": "Nott'ham Forest",
    "Wolverhampton Wanderers": "Wolves",
    "AFC Bournemouth": "Bournemouth",
    "Brentford FC": "Brentford",
    "Everton FC": "Everton",
    "FC Empoli": "Empoli",
    "Parma Calcio 1913": "Parma",
    "Real Betis Balompié": "Betis",
    "UD Las Palmas": "Las Palmas",
    "CA Osasuna": "Osasuna",
    "RCD Espanyol Barcelona": "Espanyol",
    "RCD Mallorca": "Mallorca",
    "Getafe CF": "Getafe",
    "Deportivo Alavés": "Alavés",
    "Borussia Mönchengladbach": "Gladbach",
    "SV Werder Bremen": "Werder Bremen",
    "FC Augsburg": "Augsburg",
    "1.FC Union Berlin": "Union Berlin",
    "1.FC Heidenheim 1846": "Heidenheim",
    "FC St. Pauli": "St. Pauli",
    "VfL Bochum": "Bochum",
    "Holstein Kiel": "Holstein Kiel",
    "Paris Saint-Germain": "Paris S-G",
    "AS Monaco": "Monaco",
    "LOSC Lille": "Lille",
    "Olympique Marseille": "Marseille",
    "Olympique Lyon": "Lyon",
    "RC Strasbourg Alsace": "Strasbourg",
    "OGC Nice": "Nice",
    "Stade Rennais FC": "Rennes",
    "RC Lens": "Lens",
    "Stade Brestois 29": "Brest",
}

# Normalization function
def normalize(name):
    """Return a simplified, lowercase form of a team name for comparison.

    The name is lowercased and then the substitutions below are applied in
    order, each across the whole string. Surrounding whitespace is stripped
    last.

    Args:
        name: Team name as a string.

    Returns:
        The normalized name.
    """
    # Order matters: " fc" is removed before "." is, so "1.FC" keeps its "fc".
    substitutions = (
        (" fc", ""),
        ("club", ""),
        ("&", "and"),
        ("-", " "),
        (".", ""),
        ("  ", " "),  # single pass: only pairs of spaces are collapsed
    )
    cleaned = name.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()

# Load master team list
all_teams_df = pd.read_csv(team_list_path)
all_teams_df["Normalized Name"] = all_teams_df["Team Name"].apply(normalize)

# Master-list columns that are always kept, even if a stats file carries a
# column with the same name.
master_only_columns = {"Team Name", "League", "Normalized Name"}

# Columns that exist only to drive the matching; removed from the final table.
# "Normalized Name" (unsuffixed) appears when a league had no matches at all
# and therefore never went through the merge.
helper_columns = [
    "Normalized Name",
    "Normalized Name_x",
    "Normalized Name_y",
    "Team",
    "Matched Team Name",
]

# One merged frame per league, plus a record of teams left without stats.
merged_dfs = []
unmatched_teams = []

# NOTE: only teams whose "League" is a key of league_files reach final_df.
for league, file_path in league_files.items():
    league_stats = pd.read_csv(file_path)
    league_stats["Normalized Name"] = league_stats["Team"].apply(normalize)

    league_teams = all_teams_df[all_teams_df["League"] == league].copy()

    # team_list_path and output_path point at the same file in this notebook,
    # so on a re-run the master list already contains stat columns from the
    # previous run. Drop those stale copies so the merge below replaces them
    # instead of producing "_x"/"_y" duplicates.
    stale_columns = [
        column
        for column in league_teams.columns
        if column in league_stats.columns and column not in master_only_columns
    ]
    league_teams = league_teams.drop(columns=stale_columns)

    # Fuzzy-match candidates are the same for every team in the league.
    candidate_names = league_stats["Normalized Name"].tolist()

    matched_rows = []

    for team_name, norm_name in zip(league_teams["Team Name"], league_teams["Normalized Name"]):
        # First try the manual alias; it is compared against the raw "Team" value.
        manual_match = manual_matches.get(team_name)

        if manual_match:
            stat_row = league_stats[league_stats["Team"] == manual_match]
        else:
            # Otherwise fall back to the closest normalized name, if any is
            # similar enough (difflib ratio >= 0.6).
            match = get_close_matches(norm_name, candidate_names, n=1, cutoff=0.6)
            stat_row = league_stats[league_stats["Normalized Name"] == match[0]] if match else pd.DataFrame()

        if stat_row.empty:
            # Remember the miss so it can be reported instead of passing silently.
            unmatched_teams.append((league, team_name))
            continue

        stat_row = stat_row.copy()
        stat_row["Matched Team Name"] = team_name
        matched_rows.append(stat_row)

    # Merge matched rows with league team list; unmatched teams keep NaN stats.
    if matched_rows:
        league_matched_df = pd.concat(matched_rows, ignore_index=True)
        merged = pd.merge(
            league_teams,
            league_matched_df,
            left_on="Team Name",
            right_on="Matched Team Name",
            how="left"
        )
    else:
        # Nothing matched for this league: keep its teams anyway, otherwise
        # they would vanish from the master list when the result is saved.
        merged = league_teams
    merged_dfs.append(merged)

# Final merged DataFrame, without the matching helpers
final_df = pd.concat(merged_dfs, ignore_index=True).drop(columns=helper_columns, errors="ignore")

if unmatched_teams:
    print(f"Warning: {len(unmatched_teams)} team(s) have no matching stats row:")
    for unmatched_league, unmatched_name in unmatched_teams:
        print(f"  - {unmatched_league}: {unmatched_name}")

final_df.head()

Unnamed: 0,League,Team Name,MP,GF,GA,GD,xG,xGA,xGD,Shots/90,...,Key Passes/90,Passes into Final Third/90,Passes into Penalty Area/90,Crosses into Penalty Area/90,Progressive Passes/90,Possession %,Tackles,Tackles Won,Challenges Attempted,Aerial Duel Win %
0,Premier League,Manchester City,36,67,43,24,63.6,45.3,18.3,15.89,...,5.0,17.89,4.66,0.68,20.52,61.7,475,287,430,48.4
1,Premier League,Arsenal FC,36,66,33,33,57.0,32.3,24.7,14.14,...,4.4,14.52,4.6,0.76,19.31,56.9,563,335,483,50.5
2,Premier League,Liverpool FC,36,83,37,46,77.8,34.6,43.2,16.86,...,5.36,16.54,4.57,0.88,20.03,57.6,614,380,626,52.8
3,Premier League,Chelsea FC,36,62,43,19,65.9,45.8,20.1,15.92,...,4.96,15.99,3.78,0.81,17.08,57.6,555,346,516,51.1
4,Premier League,Tottenham Hotspur,36,63,59,4,56.2,59.7,-3.5,13.53,...,4.11,13.71,4.06,0.92,17.39,55.8,639,403,569,49.8


## Saving File

In [5]:
# Write the merged table to disk without the positional index column,
# then confirm where it was written.
final_df.to_csv(output_path, index=False)
saved_message = "✅ Merged file saved to: {}".format(output_path)
print(saved_message)

✅ Merged file saved to: ../../data/teams/cleaned/all_league_teams.csv
