In [5]:
import pandas as pd
from pathlib import Path

# Define paths
data_dir = (Path.cwd() / ".." / ".." / "data" / "Cleaned Data").resolve()
prem_path = data_dir / "Full_Prem_2020-2026_cleaned.csv"
xg_path = data_dir / "cleaned_game_stats_2020onwards.csv"

print("Loading datasets...")
prem = pd.read_csv(prem_path, parse_dates=["Date"])
xg = pd.read_csv(xg_path, parse_dates=["date"])

print(f"\nPrem rows: {len(prem)}, xG rows: {len(xg)}")

# Convert Prem to long format
home_result_map = {"H": "W", "D": "D", "A": "L"}
away_result_map = {"H": "L", "D": "D", "A": "W"}

home_prem = pd.DataFrame({
    "date": prem["Date"],
    "team": prem["HomeTeam"],
    "opponent": prem["AwayTeam"],
    "home_away": "h",
    "goals_for": prem["FTHG"],
    "goals_against": prem["FTAG"],
    "yellow": prem["HY"],
    "red": prem["HR"],
    "result": prem["FTR"].map(home_result_map)
})

away_prem = pd.DataFrame({
    "date": prem["Date"],
    "team": prem["AwayTeam"],
    "opponent": prem["HomeTeam"],
    "home_away": "a",
    "goals_for": prem["FTAG"],
    "goals_against": prem["FTHG"],
    "yellow": prem["AY"],
    "red": prem["AR"],
    "result": prem["FTR"].map(away_result_map)
})

prem_long = pd.concat([home_prem, away_prem], ignore_index=True)

# Normalize dates and teams
prem_long["date"] = pd.to_datetime(prem_long["date"]).dt.date
prem_long["team"] = prem_long["team"].astype(str).str.strip().str.lower()
xg["date"] = pd.to_datetime(xg["date"]).dt.date
xg["team"] = xg["club_name"].astype(str).str.strip().str.lower()

# DEBUG: Check team name mismatches
print("\n===== TEAM NAME COMPARISON =====")
prem_teams = set(prem_long["team"].unique())
xg_teams = set(xg["team"].unique())
print(f"\nPrem teams ({len(prem_teams)}): {sorted(prem_teams)}")
print(f"\nxG teams ({len(xg_teams)}): {sorted(xg_teams)}")
print(f"\nIn Prem but NOT in xG: {sorted(prem_teams - xg_teams)}")
print(f"\nIn xG but NOT in Prem: {sorted(xg_teams - prem_teams)}")

# Create team name mapping to fix mismatches
team_name_map = {
    "brighton and hove albion": "brighton",
    "manchester united": "manchester utd",
    "newcastle united": "newcastle",
    "nottingham forest": "nott'ham forest",
    "tottenham hotspur": "tottenham",
    "west ham united": "west ham",
    "wolverhampton wanderers": "wolves",
    # Add more mappings as needed based on debug output
}

# Apply mapping to prem_long
prem_long["team"] = prem_long["team"].replace(team_name_map)

# Try merge again
combined = prem_long.merge(
    xg[["date", "team", "home_away", "xG", "xGA", "npxG", "npxGA", "ppda", "ppda_allowed", "deep", "deep_allowed", "xpts"]],
    on=["date", "team", "home_away"],
    how="left"
)

print(f"\n===== MERGE RESULTS =====")
print(f"Combined rows: {len(combined)}")
print(f"Rows with xG data: {combined['xG'].notna().sum()} ({100*combined['xG'].notna().sum()/len(combined):.1f}%)")

# Show sample matches and non-matches
print("\n===== SAMPLE WITH xG DATA =====")
display(combined[combined['xG'].notna()].head(10))

print("\n===== SAMPLE WITHOUT xG DATA =====")
display(combined[combined['xG'].isna()].head(10))

# Export
output_path = data_dir / "combined_prem_xg_2020_2026.csv"
combined.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Loading datasets...

Prem rows: 2010, xG rows: 3158

===== TEAM NAME COMPARISON =====

Prem teams (28): ['arsenal', 'aston villa', 'bournemouth', 'brentford', 'brighton and hove albion', 'burnley', 'chelsea', 'crystal palace', 'everton', 'fulham', 'ipswich town', 'leeds united', 'leicester city', 'liverpool', 'luton town', 'manchester city', 'manchester united', 'newcastle united', 'norwich city', 'nottingham forest', 'sheffield united', 'southampton', 'sunderland', 'tottenham hotspur', 'watford', 'west brom', 'west ham united', 'wolverhampton wanderers']

xG teams (27): ['arsenal', 'aston villa', 'bournemouth', 'brentford', 'brighton and hove albion', 'burnley', 'chelsea', 'crystal palace', 'everton', 'fulham', 'ipswich town', 'leeds united', 'leicester city', 'liverpool', 'luton town', 'manchester city', 'manchester united', 'newcastle united', 'norwich city', 'nottingham forest', 'sheffield united', 'southampton', 'tottenham hotspur', 'watford', 'west bromwich albion', 'west ham uni

Unnamed: 0,date,team,opponent,home_away,goals_for,goals_against,yellow,red,result,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,deep_allowed,xpts
0,2020-09-12,fulham,Arsenal,h,0,3,2,0,L,0.126327,2.16287,0.126327,2.16287,32.692308,21.928571,0.0,11.0,0.0587
1,2020-09-12,crystal palace,Southampton,h,1,0,2,0,W,1.39569,1.26267,1.39569,1.26267,17.333333,3.608696,1.0,14.0,1.477
2,2020-09-12,liverpool,Leeds United,h,4,3,1,0,W,3.15412,0.269813,1.63179,0.269813,9.935484,9.333333,19.0,2.0,2.927
7,2020-09-14,sheffield united,Wolverhampton Wanderers,h,0,2,2,0,L,0.949316,1.61307,0.949316,1.61307,8.233333,31.333333,10.0,3.0,0.7955
8,2020-09-19,everton,West Brom,h,5,2,1,0,W,4.16254,0.315347,4.16254,0.315347,9.095238,41.5,6.0,6.0,2.9941
9,2020-09-19,leeds united,Fulham,h,4,3,1,0,W,1.45254,1.55503,0.691368,0.793862,7.617647,9.423077,5.0,4.0,1.2608
11,2020-09-19,arsenal,West Ham United,h,2,1,0,0,W,1.32902,2.06377,1.32902,2.06377,12.764706,18.26087,16.0,4.0,0.8155
12,2020-09-20,southampton,Tottenham Hotspur,h,2,5,4,0,L,2.27708,2.28004,1.51591,2.28004,8.666667,10.782609,4.0,3.0,1.3364
14,2020-09-20,chelsea,Liverpool,h,0,2,0,1,L,0.904183,2.25211,0.143014,2.25211,37.636364,11.4375,1.0,13.0,0.3666
15,2020-09-20,leicester city,Burnley,h,4,2,1,0,W,0.987365,1.52404,0.987365,1.52404,7.470588,16.095238,14.0,6.0,0.9512



===== SAMPLE WITHOUT xG DATA =====


Unnamed: 0,date,team,opponent,home_away,goals_for,goals_against,yellow,red,result,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,deep_allowed,xpts
3,2020-09-12,west ham,Newcastle United,h,0,2,2,0,L,,,,,,,,,
4,2020-09-13,west brom,Leicester City,h,0,3,1,0,L,,,,,,,,,
5,2020-09-13,tottenham,Everton,h,0,1,1,0,L,,,,,,,,,
6,2020-09-14,brighton,Chelsea,h,1,3,1,0,L,,,,,,,,,
10,2020-09-19,manchester utd,Crystal Palace,h,1,3,2,0,L,,,,,,,,,
13,2020-09-20,newcastle,Brighton and Hove Albion,h,0,3,3,0,L,,,,,,,,,
17,2020-09-21,wolves,Manchester City,h,1,3,0,0,L,,,,,,,,,
18,2020-09-26,brighton,Manchester United,h,2,3,4,0,L,,,,,,,,,
20,2020-09-26,west brom,Chelsea,h,3,3,1,0,D,,,,,,,,,
23,2020-09-27,tottenham,Newcastle United,h,1,1,1,0,D,,,,,,,,,



Saved to: /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Datascience and Advanced Programming/fantasy-football-stock-market-simulator/data/Cleaned Data/combined_prem_xg_2020_2026.csv
