In [1]:
import pandas as pd
from pathlib import Path
import os

print("Process cwd:", Path.cwd())
print("Working directory (os):", os.getcwd())

# Load Premier League results
results_path = (Path.cwd() / ".." / "data" / "Cleaned Data" / "Full_Prem_2020-2026_cleaned.csv").resolve()
print("Resolved results_path:", results_path)
print("Exists:", results_path.exists())

if not results_path.exists():
    print("Contents of parent dir:", list(results_path.parent.glob("*")))
else:
    results = pd.read_csv(results_path, parse_dates=["Date"])
    print("Loaded rows:", len(results))
    print("Columns:", results.columns.tolist())
    
    # Map full-time result to team-centric result
    home_result_map = {"H": "W", "D": "D", "A": "L"}
    away_result_map = {"H": "L", "D": "D", "A": "W"}

    # Home team perspective
    home_df = pd.DataFrame({
        "date": results["Date"],
        "team": results["HomeTeam"],
        "opponent": results["AwayTeam"],
        "goals_for": results["FTHG"],
        "goals_against": results["FTAG"],
        "yellow": results["HY"],
        "red": results["HR"],
        "result": results["FTR"].map(home_result_map)
    })

    # Away team perspective
    away_df = pd.DataFrame({
        "date": results["Date"],
        "team": results["AwayTeam"],
        "opponent": results["HomeTeam"],
        "goals_for": results["FTAG"],
        "goals_against": results["FTHG"],
        "yellow": results["AY"],
        "red": results["AR"],
        "result": results["FTR"].map(away_result_map)
    })

    # Combine into a long per-team-per-match table
    pricing_df = pd.concat([home_df, away_df], ignore_index=True)
    pricing_df = pricing_df.sort_values(["team", "date"]).reset_index(drop=True)

    # Load xG data
    data_dir = (Path.cwd() / ".." / "data" / "Cleaned Data").resolve()
    print("Searching xG files in:", data_dir)
    
    xg_long = pd.DataFrame()
    
    # Try to find and load xG files
    for xg_file in data_dir.glob("*understat*.csv"):
        print("Trying xG file:", xg_file)
        try:
            xg = pd.read_csv(xg_file, low_memory=False)
            print("xG columns:", xg.columns.tolist())
            
            # Detect understat-style per-team layout
            if {"club_name", "xG", "xGA", "date"}.issubset(xg.columns):
                xg["date"] = pd.to_datetime(xg["date"])
                xg["team"] = xg["club_name"].astype(str).str.strip()
                xg_long = xg[["date", "team", "xG", "xGA"]].rename(columns={"xG": "xG_for", "xGA": "xG_against"})
                print("Detected understat-style xG file. Built xg_long rows:", len(xg_long))
                break
        except Exception as e:
            print("Failed to read", xg_file, ":", e)
            continue
    
    if xg_long.empty:
        print("No xG data found. Proceeding without xG columns.")

    # Normalize pricing_df keys for merge
    pricing_df["date"] = pd.to_datetime(pricing_df["date"])
    pricing_df["team"] = pricing_df["team"].astype(str).str.strip()
    pricing_df["opponent"] = pricing_df["opponent"].astype(str).str.strip()

    # Merge xG data if available
    if not xg_long.empty:
        xg_long["date"] = pd.to_datetime(xg_long["date"])
        xg_long["team"] = xg_long["team"].astype(str).str.strip()
        pricing_df = pricing_df.merge(xg_long, on=["date", "team"], how="left")
        matched = pricing_df["xG_for"].notna().sum()
        print(f"Merged pricing_df. rows with xG_for: {matched} / {len(pricing_df)}")
    else:
        # If no xG data, fill with NaN
        pricing_df["xG_for"] = None
        pricing_df["xG_against"] = None

    # ===== PRICING FORMULA =====
    
    # Define pricing formula coefficients (you can tune these)
    alpha = 0.5      # WDL impact
    beta = 0.1       # xG difference impact
    gamma = 0.05     # CleanSheet impact
    delta = 0.02     # CardPoints impact
    epsilon = 0.03   # OppStrength impact
    zeta = 0.01      # Random shock / intercept

    # Initialize starting price
    P0 = 100

    # Compute derived metrics for each match
    pricing_df["pts"] = pricing_df["result"].map({"W": 3, "D": 1, "L": 0})
    pricing_df["xG_diff"] = pricing_df["xG_for"] - pricing_df["xG_against"]
    pricing_df["clean_sheet"] = (pricing_df["goals_against"] == 0).astype(int)
    pricing_df["card_points"] = pricing_df["yellow"] + 2 * pricing_df["red"]

    # Compute OppStrength (opponent's average goals_for as proxy)
    opp_strength = pricing_df.groupby("opponent")["goals_for"].transform("mean")
    pricing_df["opp_strength"] = opp_strength.fillna(0)

    # Compute match-level price change
    pricing_df["delta_P"] = (
        alpha * pricing_df["pts"] 
        + beta * pricing_df["xG_diff"].fillna(0)
        + gamma * pricing_df["clean_sheet"]
        - delta * pricing_df["card_points"]
        + epsilon * pricing_df["opp_strength"]
        + zeta
    )

    # Compute cumulative price by team
    pricing_df = pricing_df.sort_values(["team", "date"]).reset_index(drop=True)
    pricing_df["price"] = pricing_df.groupby("team")["delta_P"].cumsum() + P0

    print("Pricing DataFrame with prices:")
    display(pricing_df[["date", "team", "opponent", "pts", "xG_diff", "clean_sheet", "card_points", "delta_P", "price"]].head(20))

    # Export full pricing data to CSV
    pricing_df.to_csv("pricing_output.csv", index=False)
    print("Saved full pricing data to pricing_output.csv")

    # ===== GET FINAL PRICES FOR TARGET TEAMS =====
    
    # Get final prices for all teams
    final_prices = pricing_df.groupby("team").agg({
        "price": "last",
        "pts": "sum",
        "xG_diff": "mean",
        "clean_sheet": "sum",
        "card_points": "sum",
        "delta_P": "mean"
    }).reset_index()

    final_prices.columns = ["team", "final_price", "total_pts", "avg_xG_diff", "total_clean_sheets", "total_card_points", "avg_delta_P"]
    final_prices = final_prices.sort_values("final_price", ascending=False)

    print("\n===== FINAL TEAM STOCK PRICES (ALL TEAMS) =====\n")
    display(final_prices)

    # Filter for the specific teams you mentioned
    target_teams = [
        "Arsenal", "Aston Villa", "Bournemouth", "Brentford", "Brighton and Hove Albion", 
        "Burnley", "Chelsea", "Crystal Palace", "Everton", "Fulham", 
        "Leeds United", "Liverpool", "Manchester City", "Manchester United", 
        "Newcastle United", "Nottingham Forest", "Sunderland", "Tottenham Hotspur", 
        "West Ham United", "Wolverhampton Wanderers"
    ]

    # Normalize team names for matching
    def normalize_team_name(name):
        return name.lower().strip()

    final_prices["team_normalized"] = final_prices["team"].apply(normalize_team_name)
    target_teams_normalized = [normalize_team_name(t) for t in target_teams]

    # Filter for target teams
    target_final_prices = final_prices[final_prices["team_normalized"].isin(target_teams_normalized)].copy()
    target_final_prices = target_final_prices.sort_values("final_price", ascending=False)

    print("\n===== TARGET TEAMS STOCK PRICES =====\n")
    display(target_final_prices[["team", "final_price", "total_pts", "avg_xG_diff", "total_clean_sheets", "total_card_points"]])

    # Export target teams to CSV
    target_final_prices.to_csv("target_teams_pricing.csv", index=False)
    print("\nSaved target teams to target_teams_pricing.csv")

    # Summary statistics
    print("\n===== SUMMARY =====")
    print(f"Total teams in dataset: {len(final_prices)}")
    print(f"Target teams found: {len(target_final_prices)}")
    print(f"\nPrice range (all): {final_prices['final_price'].min():.2f} - {final_prices['final_price'].max():.2f}")
    print(f"Average price (all): {final_prices['final_price'].mean():.2f}")
    print(f"\nPrice range (target): {target_final_prices['final_price'].min():.2f} - {target_final_prices['final_price'].max():.2f}")
    print(f"Average price (target): {target_final_prices['final_price'].mean():.2f}")

Process cwd: /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Datascience and Advanced Programming/fantasy-football-stock-market-simulator/src
Working directory (os): /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Datascience and Advanced Programming/fantasy-football-stock-market-simulator/src
Resolved results_path: /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Datascience and Advanced Programming/fantasy-football-stock-market-simulator/data/Cleaned Data/Full_Prem_2020-2026_cleaned.csv
Exists: True
Loaded rows: 2010
Columns: ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
Searching xG files in: /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Datascience and Advanced Programming/fantasy-football-stock-market-simulator/data/Cleaned Data
Trying xG file: /Users/saarj/Documents/The Drive/UNIL Masters/Year 1/Autumn Semester/Da

Unnamed: 0,date,team,opponent,pts,xG_diff,clean_sheet,card_points,delta_P,price
0,2020-09-12,Arsenal,Fulham,3,,1,2,1.56362,101.56362
1,2020-09-19,Arsenal,West Ham United,3,,0,0,1.556567,103.120187
2,2020-09-28,Arsenal,Liverpool,0,,0,2,0.00194,103.122127
3,2020-10-04,Arsenal,Sheffield United,3,,0,0,1.575921,104.698048
4,2020-10-17,Arsenal,Manchester City,0,,0,1,0.016418,104.714466
5,2020-10-25,Arsenal,Leicester City,0,,0,3,0.000724,104.71519
6,2020-11-01,Arsenal,Manchester United,3,,1,3,1.540896,106.256085
7,2020-11-08,Arsenal,Aston Villa,0,,0,0,0.05,106.306085
8,2020-11-22,Arsenal,Leeds United,1,,1,2,0.57544,106.881525
9,2020-11-29,Arsenal,Wolverhampton Wanderers,0,,0,3,-0.003433,106.878092


Saved full pricing data to pricing_output.csv

===== FINAL TEAM STOCK PRICES (ALL TEAMS) =====



Unnamed: 0,team,final_price,total_pts,avg_xG_diff,total_clean_sheets,total_card_points,avg_delta_P
15,Manchester City,335.689275,452,,84,273,1.172583
13,Liverpool,314.196141,412,,74,319,1.065652
0,Arsenal,309.684295,403,,77,329,1.043205
6,Chelsea,273.94344,337,,68,441,0.86539
16,Manchester United,268.967241,327,,58,410,0.840633
23,Tottenham Hotspur,263.034538,315,,55,401,0.811117
1,Aston Villa,261.625541,313,,59,431,0.804107
17,Newcastle United,257.784565,303,,57,369,0.784998
4,Brighton and Hove Albion,245.013313,279,,51,392,0.721459
26,West Ham United,239.07742,266,,41,336,0.691927



===== TARGET TEAMS STOCK PRICES =====



Unnamed: 0,team,final_price,total_pts,avg_xG_diff,total_clean_sheets,total_card_points
15,Manchester City,335.689275,452,,84,273
13,Liverpool,314.196141,412,,74,319
0,Arsenal,309.684295,403,,77,329
6,Chelsea,273.94344,337,,68,441
16,Manchester United,268.967241,327,,58,410
23,Tottenham Hotspur,263.034538,315,,55,401
1,Aston Villa,261.625541,313,,59,431
17,Newcastle United,257.784565,303,,57,369
4,Brighton and Hove Albion,245.013313,279,,51,392
26,West Ham United,239.07742,266,,41,336



Saved target teams to target_teams_pricing.csv

===== SUMMARY =====
Total teams in dataset: 28
Target teams found: 20

Price range (all): 109.85 - 335.69
Average price (all): 203.54

Price range (target): 109.85 - 335.69
Average price (target): 232.96


In [None]:
# ===== NEW CELL: DEBUG MISSING TEAMS =====

# Check all unique team names in the dataset
print("\n===== ALL TEAMS IN DATASET =====\n")
all_teams = final_prices["team"].unique()
print(sorted(all_teams))

# Find Tottenham and Brighton variants
print("\n===== SEARCHING FOR MISSING TEAMS =====\n")
for team in all_teams:
    if "tottenham" in team.lower() or "brighton" in team.lower():
        print(f"Found: {team}")

# Update target_teams with correct names (adjust based on debug output above)
target_teams = [
    "Arsenal", "Aston Villa", "Bournemouth", "Brentford", "Brighton",  # update if needed
    "Burnley", "Chelsea", "Crystal Palace", "Everton", "Fulham", 
    "Leeds United", "Liverpool", "Manchester City", "Manchester United", 
    "Newcastle United", "Nottingham Forest", "Sunderland", "Tottenham",  # update if needed
    "West Ham United", "Wolverhampton Wanderers"
]

# Normalize team names for matching
def normalize_team_name(name):
    return name.lower().strip()

final_prices["team_normalized"] = final_prices["team"].apply(normalize_team_name)
target_teams_normalized = [normalize_team_name(t) for t in target_teams]

# Filter for target teams
target_final_prices = final_prices[final_prices["team_normalized"].isin(target_teams_normalized)].copy()
target_final_prices = target_final_prices.sort_values("final_price", ascending=False)

print("\n===== TARGET TEAMS STOCK PRICES =====\n")
display(target_final_prices[["team", "final_price", "total_pts", "avg_xG_diff", "total_clean_sheets", "total_card_points"]])

# Show which target teams were NOT found
found_teams_set = set(t.lower().strip() for t in target_final_prices["team"].tolist())
missing = [t for t in target_teams if normalize_team_name(t) not in found_teams_set]
print(f"\n⚠️ Missing teams: {missing}")

# Export target teams to CSV
target_final_prices.to_csv("target_teams_pricing.csv", index=False)
print("\nSaved target teams to target_teams_pricing.csv")