# Combining All Leagues Into a Single Dataset

## Importing Libraries

In [1]:
import pandas as pd
import os

## Defining Directory and File Names

In [2]:
# Relative path of the folder that holds the per-league CSV exports.
data_dir = "../data/raw"

# Source file name -> league label stored in the "League" column.
league_labels = dict([
    ("premier_league_players.csv", "Premier League"),
    ("serie_a_players.csv", "Serie A"),
    ("la_liga_players.csv", "La Liga"),
    ("bundesliga_players.csv", "Bundesliga"),
    ("ligue_1_players.csv", "Ligue 1"),
])

# Files to read, in the insertion order of the mapping above, so each file
# name is written down only once.
files = list(league_labels)

# Raw nationality spelling -> spelling used in the combined dataset.
nationality_replacements = dict([
    ("Korea, South", "South Korea"),
    ("Congo, DR", "DR Congo"),
    ("Congo, Republic", "Republic of Congo"),
    ("United States", "USA"),
    ("England", "England"),  # maps to itself; listed so the spelling is explicit
    ("Ivory Coast", "Côte d'Ivoire"),
    ("Bosnia-Herzegovina", "Bosnia and Herzegovina"),
])

# Collects one cleaned DataFrame per league file that is found.
combined = list()

## Reading and Processing Each File

In [3]:
# Start from an empty list so that re-running this cell does not append the
# same leagues a second time.
combined = []

# Columns every raw file must provide ("League" is added in the loop below).
required_columns = ["Name", "Age", "Position", "Nationality", "Club Name", "Market Value"]

for file in files:
    path = os.path.join(data_dir, file)
    if not os.path.exists(path):
        # Best-effort: a missing league is still skipped, but it is reported
        # instead of silently producing an incomplete dataset.
        print(f"⚠️ Skipping missing file: {path}")
        continue

    df = pd.read_csv(path)

    # Fail with a message that names the offending file when the schema is
    # not what the cleaning steps below expect.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file} is missing expected columns: {missing_columns}")

    df["League"] = league_labels[file]
    # Keep only the first nationality of " / "-separated values; nulls stay null.
    df["Nationality"] = df["Nationality"].apply(
        lambda x: str(x).split(" / ")[0] if pd.notnull(x) else x
    )
    # Apply known replacements
    df["Nationality"] = df["Nationality"].replace(nationality_replacements)
    # Select the output columns and rename by name rather than by position.
    df = df[
        ["Name", "Age", "Position", "Nationality", "League", "Club Name", "Market Value"]
    ].rename(columns={"Club Name": "Club"})
    combined.append(df)

## Combining All Files into a Single DataFrame

In [4]:
# Destination of the combined dataset; also used in the success message so
# the reported location always matches where the file was written.
output_path = "../data/cleaned/all_leagues_combined.csv"

if combined:
    # Stack the per-league frames and renumber rows from 0.
    full_df = pd.concat(combined, ignore_index=True)
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists first.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    full_df.to_csv(output_path, index=False)
    print(f"✅ Combined and cleaned CSV saved as '{output_path}'")
else:
    print("⚠️ No files found. Please check paths.")

✅ Combined and cleaned CSV saved as 'data/all_leagues_combined.csv'
