In [None]:
"""
=============================================================================
Sprint 2 - Data Cleaning Pipeline
PGA Outrights Betting Project (2019-2024)
=============================================================================

PURPOSE:
    Clean raw historical outrights data fetched from DataGolf API and prepare
    it for exploratory data analysis (EDA) and modeling.

INPUTS:
    - Raw CSV from fetch_datagolf.py: data/interim/hist_outrights.csv
    - Contains odds, outcomes, and metadata for PGA Tour winner markets

OUTPUTS:
    - Cleaned parquet file: data/processed/model_data_clean.parquet
    - Analysis-ready dataset with de-vigged probabilities and binary target

KEY STEPS:
    1. Filter to win market only (exclude Top-10, etc.)
    2. Parse outcomes to create binary winner flag (Y)
    3. Construct event identifiers from timestamps
    4. Validate data quality (winner counts, missing values)
    5. Convert American odds to decimal and implied probabilities
    6. Remove bookmaker vig using proportional de-vig method
    7. Engineer features (field size, price rank, etc.)
    8. Remove duplicates and validate final dataset
    
AUTHOR: Matt Raivel
DATE: October 2024
=============================================================================
"""

import pandas as pd
import numpy as np
from pathlib import Path
import sys

# ============================================================================
# PATH CONFIGURATION
# ============================================================================
# Add the directory containing devig_utils.py to Python's search path
# Structure: notebooks/ → ../src/Positive_EV_Repo/data/devig_utils.py
data_path = Path.cwd().parent / 'src' / 'Positive_EV_Repo' / 'data'

if str(data_path) not in sys.path:
    sys.path.append(str(data_path))
    
# ============================================================================
# IMPORT CUSTOM UTILITIES
# ============================================================================
# These functions handle odds conversion and vig removal
# See devig_utils.py for implementation details
from devig_utils import (
    american_to_decimal,      # Convert +150 → 2.50 decimal odds
    implied_from_american,    # Convert +150 → 0.40 implied probability
    proportional_devig        # Remove bookmaker margin to get fair probabilities
)

# ============================================================================
# DISPLAY SETTINGS
# ============================================================================
# Configure pandas to show all columns and format floats consistently
pd.set_option('display.max_columns', None)           # Show all columns (no truncation)
pd.set_option('display.max_rows', 100)               # Show up to 100 rows
pd.set_option('display.float_format', '{:.6f}'.format)  # 6 decimal places for floats

print("✓ Imports successful")
print(f"✓ Loaded devig_utils from: {data_path / 'devig_utils.py'}")

✓ Imports successful
✓ Loaded devig_utils from: c:\Users\mattr\OneDrive\Desktop\Positive_EV_Project\src\Positive_EV_Repo\data\devig_utils.py


In [None]:
# ============================================================================
# CELL 2: Load Raw Historical Outrights Data
# ============================================================================
"""
GOAL: Locate and load the raw CSV file containing historical betting odds
      and tournament outcomes fetched from DataGolf API.

DATA SOURCE: fetch_datagolf.py saves to data/interim/hist_outrights.csv
             (relative to where the script was executed)

EXPECTED COLUMNS:
    - bet_outcome_numeric: Binary indicator if bet paid (1) or lost (0)
    - close_odds: American odds at market close (e.g., "+1000")
    - close_time: Timestamp when odds were finalized
    - dg_id: DataGolf's unique player identifier (join key)
    - player_name: Golfer's name ("Last, First" format)
    - outcome: Finish position ("1", "T14", "CUT", etc.)
    - year: Tournament year
    - book: Sportsbook name (e.g., "draftkings")
    - market: Bet type ("win" or "top_10")
"""

print("="*70)
print("LOADING RAW DATA")
print("="*70)

# ============================================================================
# FILE LOCATION LOGIC
# ============================================================================
# The fetch script may run from different locations, so we check multiple paths
project_root = Path.cwd().parent  # Go up one level from notebooks/ to project root

# Define potential locations (in order of likelihood)
possible_paths = [
    project_root / 'data' / 'interim' / 'hist_outrights.csv',              # Standard location
    project_root / 'src' / 'Positive_EV_Repo' / 'data' / 'interim' / 'hist_outrights.csv',  # Alt location
]

# Try each path until we find the file
raw_path = None
for path in possible_paths:
    if path.exists():
        raw_path = path
        break

# If not found in expected locations, search entire project
if raw_path is None:
    print("⚠️  File not found in expected locations. Searching project...")
    found = list(project_root.rglob('hist_outrights.csv'))
    
    if found:
        raw_path = found[0]
        print(f"⚠️  Found file at unexpected location: {raw_path}")
    else:
        # File doesn't exist - provide helpful error message
        raise FileNotFoundError(
            "hist_outrights.csv not found. Please run fetch_datagolf.py first:\n"
            "  cd src/Positive_EV_Repo/data\n"
            "  python fetch_datagolf.py"
        )

# ============================================================================
# LOAD DATA
# ============================================================================
df_raw = pd.read_csv(raw_path)

# ============================================================================
# INITIAL DATA INSPECTION
# ============================================================================
print(f"\n✓ Loaded from: {raw_path}")
print(f"Raw data shape: {df_raw.shape[0]:,} rows × {df_raw.shape[1]} columns")

print(f"\nColumns ({len(df_raw.columns)}):")
print(df_raw.columns.tolist())

print(f"\nData types:")
print(df_raw.dtypes)

print(f"\nFirst 3 rows (preview):")
display(df_raw.head(3))  # Use display() for better notebook formatting

LOADING RAW DATA

✓ Loaded from: c:\Users\mattr\OneDrive\Desktop\Positive_EV_Project\data\interim\hist_outrights.csv
Raw data shape: 624 rows × 13 columns

Columns (13):
['bet_outcome_numeric', 'bet_outcome_text', 'close_odds', 'close_time', 'dg_id', 'open_odds', 'open_time', 'outcome', 'player_name', 'tour', 'year', 'book', 'market']

Data types:
bet_outcome_numeric    float64
bet_outcome_text        object
close_odds               int64
close_time              object
dg_id                    int64
open_odds                int64
open_time               object
outcome                 object
player_name             object
tour                    object
year                     int64
book                    object
market                  object
dtype: object

First 3 rows (preview):


Unnamed: 0,bet_outcome_numeric,bet_outcome_text,close_odds,close_time,dg_id,open_odds,open_time,outcome,player_name,tour,year,book,market
0,1.0,paid in full,1000,2025-10-08 15:37,19895,1000,2025-10-06 14:00,1,"Schauffele, Xander",pga,2023,draftkings,win
1,0.0,loss,1600,2025-10-08 15:37,22085,1600,2025-10-06 14:00,T14,"Morikawa, Collin",pga,2023,draftkings,win
2,0.0,loss,1800,2025-10-08 15:37,13562,1800,2025-10-06 14:00,T20,"Matsuyama, Hideki",pga,2023,draftkings,win


In [None]:
# ============================================================================
# CELL 3: Initial Data Quality Assessment
# ============================================================================
"""
GOAL: Understand the raw data before cleaning - identify issues that need
      to be addressed (missing values, duplicate markets, year coverage, etc.)

CHECKS:
    1. What markets are present? (win, top_10, etc.)
    2. Which sportsbooks provided data?
    3. What years are covered?
    4. Are there missing values? Where?
    5. Do we have event identifiers for grouping?
"""

print("="*70)
print("INITIAL DATA QUALITY ASSESSMENT")
print("="*70)

# ============================================================================
# 1. MARKET TYPES
# ============================================================================
# DataGolf API returns multiple bet types - we only want "win" (outright winner)
print("\n1. Market Types:")
print(df_raw['market'].value_counts())
print(f"   → We will filter to 'win' market only (outrights)")

# ============================================================================
# 2. SPORTSBOOK COVERAGE
# ============================================================================
# Different books may have different odds/margins - document what we have
print("\n2. Sportsbook Coverage:")
print(df_raw['book'].value_counts())
print(f"   → Total books: {df_raw['book'].nunique()}")

# ============================================================================
# 3. TEMPORAL COVERAGE
# ============================================================================
# Check what years we successfully fetched
print("\n3. Year Distribution:")
year_counts = df_raw['year'].value_counts().sort_index()
print(year_counts)
print(f"   → Coverage: {year_counts.index.min()} to {year_counts.index.max()}")

# ============================================================================
# 4. MISSING DATA ANALYSIS
# ============================================================================
# Identify columns with missing values and quantify missingness
print("\n4. Missing Data:")
missing = df_raw.isnull().sum()  # Count nulls per column
missing_pct = 100 * missing / len(df_raw)  # Convert to percentage

# Create summary table sorted by most missing
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
}).sort_values('Percentage', ascending=False)

# Only show columns that have missing data
missing_subset = missing_df[missing_df['Percentage'] > 0]

if len(missing_subset) > 0:
    print(missing_subset)
    print(f"\n   → {len(missing_subset)} columns have missing values")
else:
    print("  ✓ No missing values detected!")

# ============================================================================
# 5. EVENT IDENTIFICATION
# ============================================================================
# Check if we have a clean event_id column or need to construct one
print("\n5. Event Identification:")
if 'event_id' in df_raw.columns:
    print(f"  ✓ event_id column present")
    print(f"  Unique events: {df_raw['event_id'].nunique():,}")
    
    # Check if event_id has nulls
    if df_raw['event_id'].isna().any():
        print(f"  ⚠️  {df_raw['event_id'].isna().sum()} rows have missing event_id")
else:
    print("  ⚠️  No event_id column found")
    print("     → Will construct from date/timestamp in next step")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*70)
print(f"✓ Initial assessment complete")
print(f"  Next: Filter to win market and parse outcomes")
print("="*70)

INITIAL DATA QUALITY ASSESSMENT

1. Market Types:
market
win       312
top_10    312
Name: count, dtype: int64
   → We will filter to 'win' market only (outrights)

2. Sportsbook Coverage:
book
draftkings    312
fanduel       312
Name: count, dtype: int64
   → Total books: 2

3. Year Distribution:
year
2023    312
2024    312
Name: count, dtype: int64
   → Coverage: 2023 to 2024

4. Missing Data:
  ✓ No missing values detected!

5. Event Identification:
  ⚠️  No event_id column found
     → Will construct from date/timestamp in next step

✓ Initial assessment complete
  Next: Filter to win market and parse outcomes


In [28]:
# ============================================================================
# CELL 4: Filter to Win Market Only
# ============================================================================
"""
GOAL: Isolate outright winner bets and remove other market types

RATIONALE:
    - Project scope is predicting WINNERS (1st place finish)
    - Top-10, Top-20, etc. are different prediction problems
    - Different markets have different vig structures
    - Keeping multiple markets would require separate models

INPUT: df_raw with multiple markets
OUTPUT: df_win containing only 'win' market rows
"""

print("="*70)
print("STEP 1: FILTER TO WIN MARKET")
print("="*70)

# ============================================================================
# COUNT BEFORE FILTERING
# ============================================================================
initial_count = len(df_raw)
print(f"\nStarting dataset:")
print(f"  Total rows: {initial_count:,}")
print(f"  Market breakdown:")
print(df_raw['market'].value_counts())

# ============================================================================
# APPLY FILTER
# ============================================================================
# Keep only rows where market == 'win'
# Use .copy() to avoid SettingWithCopyWarning in later operations
df_win = df_raw[df_raw['market'] == 'win'].copy()

# ============================================================================
# COUNT AFTER FILTERING
# ============================================================================
dropped_count = initial_count - len(df_win)

print(f"\nAfter filtering to 'win' market:")
print(f"  Remaining rows: {len(df_win):,}")
print(f"  Dropped rows (other markets): {dropped_count:,}")
print(f"  Percentage retained: {100 * len(df_win) / initial_count:.1f}%")

# ============================================================================
# SANITY CHECK
# ============================================================================
# Verify that we only have 'win' market now
assert df_win['market'].nunique() == 1, "ERROR: Multiple markets still present!"
assert df_win['market'].iloc[0] == 'win', "ERROR: Wrong market retained!"

print(f"\n✓ Filter successful - dataset contains only outright winner bets")

# Show sample of filtered data
print(f"\nSample of win market data (first 5 rows):")
display(df_win[['player_name', 'close_odds', 'outcome', 'year', 'book']].head())

STEP 1: FILTER TO WIN MARKET

Starting dataset:
  Total rows: 624
  Market breakdown:
market
win       312
top_10    312
Name: count, dtype: int64

After filtering to 'win' market:
  Remaining rows: 312
  Dropped rows (other markets): 312
  Percentage retained: 50.0%

✓ Filter successful - dataset contains only outright winner bets

Sample of win market data (first 5 rows):


Unnamed: 0,player_name,close_odds,outcome,year,book
0,"Schauffele, Xander",1000,1,2023,draftkings
1,"Morikawa, Collin",1600,T14,2023,draftkings
2,"Matsuyama, Hideki",1800,T20,2023,draftkings
3,"Gotterup, Chris",2000,T40,2023,draftkings
4,"Noren, Alex",2200,T27,2023,draftkings


In [29]:
# ============================================================================
# CELL 5: Parse Outcomes to Create Target (Y)
# ============================================================================

print("="*70)
print("STEP 2: PARSE OUTCOMES TO CREATE TARGET (Y)")
print("="*70)

def parse_outcome_to_winner(outcome_str):
    """
    Convert DataGolf outcome string to binary winner flag.

    Examples"
    "1" -> 1 (sole winner)
    "T1" -> 2 (co-winner in playoff, very rare)
    "2" -> 0 (2nd place)
    "T14" -> 0 (tied 14th)
    "Cut" -> 0 (missed cut)
    "WD" -> 0 (Withdrew)

    TARGET VARIABLE (Y):
    1 = Player won the tournament
    2 = Player did not win

    EDGE CASES:
    - Playoffs: Multiple players tied for 1st go to sudden death,
    DataGolf marks winner as "1", playoff losers as "2" or "T2".
    - Co-winners: Very Rare (called due to darkness or weather) both players
    marked as "T1" -> both get Y=1

    """

#===================================================================================================================
# DEFINE PARSING FUNCTION
#====================================================================================================================

def parse_outcome_to_winner(outcome_str):
    """
    Convert Datagolf outcome string to binary winner flag.

    Logic:
    - Winner is indicated by exactly "1" (no tie) or T1 (co-winner)
    - Everything else (2nd place, missed cut, etc) is 0
    - Missing outcomes return NaN for later investigation

    Parameters
    ----------
    outcome_str : str or NaN
        Finish position string from DataGolf

    Examples
    --------
    >>> parse_outcome_to_winner("1")
    1
    >>> parse_outcome_to_winner("T1")
    1
    >>> parse_outcome_to_winner("2")
    0
    >>> parse_outcome_to_winner("T14")
    0
    >>> parse_outcome_to_winner("CUT")
    0
    """

    if pd.isna(outcome_str):
        return np.nan

    #Standardize: convert string, strip white space, uppercase
    outcome_str = str(outcome_str).strip().upper()

    #Winner check: exactly "1" or "T1"
    if outcome_str in ["1", "T1"]:
        return 1
    else:
        return 0
    


# ============================================================================
# APPLY PARSING TO ENTIRE DATASET
# ============================================================================

print("Parsing Outcomes...")
df_win['Y'] = df_win['outcome'].apply(parse_outcome_to_winner)


# ============================================================================
# TARGET VARIABLE SUMMARY
# ============================================================================

print(f"\n✓ Target variable (Y) created:")
print(f"\nValue counts:")
print(df_win['Y'].value_counts())

#Calculate wini rate ( should be 0.8% for typical field size of 120-150)
win_rate = df_win["Y"].mean()
print(f'\n Overall Win Rate: {win_rate:.6f}({100*win_rate:.4f}%)')


# ============================================================================
# MISSING OUTCOME CHECK
# ============================================================================

#if some outcome sare missing we need to see why

missing_outcomes = df_win["Y"].isna().sum()

if missing_outcomes > 0:
    print(f"\n  WARNING: {missing_outcomes} rows have missing outcomes")
    print("   This could indicate:")
    print("   - Tournament was canceled/postponed")
    print("   - Data collection error")
    print("   - Event is ongoing (live odds)")
    
    print("\nSample rows with missing outcome:")
    missing_sample = df_win[df_win['Y'].isna()][['player_name', 'outcome', 'year', 'event_id']].head()
    display(missing_sample)
    
    # Decision point: Drop these rows or investigate further?
    print("\n   → These rows will be flagged for potential removal")
else:
    print("\n✓ No missing outcomes - all events have results")

# ============================================================================
# DISTRIBUTION BY YEAR
# ============================================================================
# Check that we have winners in each year (sanity check for data completeness)
print("\nWinners per year:")
winners_by_year = df_win.groupby('year')['Y'].sum()
print(winners_by_year)

# Expected: ~40-50 PGA Tour events per year
print(f"\n   → Typical PGA Tour season has ~45 events")
print(f"   → Our data has {winners_by_year.mean():.1f} winners per year (avg)")

# ============================================================================
# SAMPLE OUTCOMES
# ============================================================================
# Show examples of parsed outcomes for verification
print("\nSample outcome parsing (first 10 rows):")
sample_outcomes = df_win[['player_name', 'outcome', 'Y']].head(10)
display(sample_outcomes)






    


STEP 2: PARSE OUTCOMES TO CREATE TARGET (Y)
Parsing Outcomes...

✓ Target variable (Y) created:

Value counts:
Y
0    308
1      4
Name: count, dtype: int64

 Overall Win Rate: 0.012821(1.2821%)

✓ No missing outcomes - all events have results

Winners per year:
year
2023    2
2024    2
Name: Y, dtype: int64

   → Typical PGA Tour season has ~45 events
   → Our data has 2.0 winners per year (avg)

Sample outcome parsing (first 10 rows):


Unnamed: 0,player_name,outcome,Y
0,"Schauffele, Xander",1,1
1,"Morikawa, Collin",T14,0
2,"Matsuyama, Hideki",T20,0
3,"Gotterup, Chris",T40,0
4,"Noren, Alex",T27,0
5,"Kitayama, Kurt",T48,0
6,"Kim, Si Woo",T20,0
7,"Hojgaard, Rasmus",T14,0
8,"Yu, Kevin",T20,0
9,"Thorbjornsen, Michael",3,0


In [30]:
# ============================================================================
# CELL 6: Create Event Identifier for Grouping
# ============================================================================
"""
Goal: Create a unique identifier for each tournament event

Why?
- We need to group players by event for de-vigging (vig removal)
- Calculate per event statistics(field size, overround)
- Validate that each event has exactly one winner
- Time series train/validation/test splits

Strategy:
if event_id column exists:
    -> use it directly

If not:
    -> Construct from year + close_time date
    -> Assumption: only one PGA tour event per day
    -> Format: "2023_2023-10-08 (year_date)


TIMESTAMP FIELDS:
- open_time : when odds first posted (Monday before tournament)
- close_time: when odds finalize (typically Wednesday evening)
- We use close time as canonical decision timestamp



"""


# ============================================================================
# PARSE TIMESTAMPS
# ============================================================================
# Convert string timestamps to pandas datetime objects for manipulation
print("\nParsing timestamp columns...")

df_win['close_time_dt'] = pd.to_datetime(df_win['close_time'], errors = 'coerce')
df_win['open_time_dt'] = pd.to_datetime(df_win['open_time'], errors = 'coerce')

#Check for any unparseable dates
bad_close = df_win['close_time_dt'].isna().sum()
bad_open = df_win['open_time_dt'].isna().sum()

if bad_close > 0:
    print(f"{bad_close} rows have invalid close_time")

if bad_open < 0:
    print(f'{bad_open} rows have invald open_time')

if bad_close == 0 and bad_open == 0:
    print("ALL timestamps parsed successfully! ")


#============================================================================
# EXTRACT DATE (without time component)
# ============================================================================
# Create date-only column for grouping (ignore hours/minutes)
# Assumption: All odds for same event have same close_time date
df_win['event_date'] = df_win['close_time_dt'].dt.date


# ============================================================================
# CREATE OR VERIFY EVENT_ID
# ============================================================================
if 'event_id' not in df_win.columns:
    # No event_id in data - construct one
    print("\nNo event_id column found. Constructing from year + close_time...")
    
    # Format: "YEAR_YYYY-MM-DD"
    # Example: "2023_2023-10-08" for ZOZO Championship
    df_win['event_id'] = (
        df_win['year'].astype(str) + '_' + 
        df_win['event_date'].astype(str)
    )
    
    print(f"✓ Created event_id from year + close_time")
else:
    # event_id already exists - just verify it's usable
    print(f"✓ event_id column already exists in data")


# ============================================================================
# EVENT SUMMARY STATISTICS
# ============================================================================
print(f"\nEvent summary:")
print(f"  Unique events: {df_win['event_id'].nunique():,}")
print(f"  Date range: {df_win['event_date'].min()} to {df_win['event_date'].max()}")

# ============================================================================
# FIELD SIZE DISTRIBUTION
# ============================================================================
# How many players per event? Important for understanding market structure
event_sizes = df_win.groupby('event_id').size()

print(f"\nField sizes (players per event):")
print(event_sizes.describe())

# Flag unusually small or large events
print(f"\nSmallest events (top 5):")
print(event_sizes.nsmallest(5))

print(f"\nLargest events (top 5):")
print(event_sizes.nlargest(5))

# PGA Tour typical range: 120-156 players
small_events = (event_sizes < 100).sum()
large_events = (event_sizes > 160).sum()

if small_events > 0:
    print(f"\n  ⚠️  {small_events} events have < 100 players (invitational/limited field?)")
if large_events > 0:
    print(f"\n  ⚠️  {large_events} events have > 160 players (unusual)")

# ============================================================================
# SHOW SAMPLE EVENT
# ============================================================================
# Display one complete event to verify structure
sample_event = event_sizes.index[0]
print(f"\nSample event: {sample_event}")
print(f"Field size: {event_sizes[sample_event]} players")

sample_event_data = df_win[df_win['event_id'] == sample_event][
    ['player_name', 'close_odds', 'outcome', 'Y']
].head(10)

print(f"\nTop 10 players (by odds) in this event:")
display(sample_event_data)




Parsing timestamp columns...
ALL timestamps parsed successfully! 

No event_id column found. Constructing from year + close_time...
✓ Created event_id from year + close_time

Event summary:
  Unique events: 2
  Date range: 2025-10-08 to 2025-10-08

Field sizes (players per event):
count     2.000000
mean    156.000000
std       0.000000
min     156.000000
25%     156.000000
50%     156.000000
75%     156.000000
max     156.000000
dtype: float64

Smallest events (top 5):
event_id
2023_2025-10-08    156
2024_2025-10-08    156
dtype: int64

Largest events (top 5):
event_id
2023_2025-10-08    156
2024_2025-10-08    156
dtype: int64

Sample event: 2023_2025-10-08
Field size: 156 players

Top 10 players (by odds) in this event:


Unnamed: 0,player_name,close_odds,outcome,Y
0,"Schauffele, Xander",1000,1,1
1,"Morikawa, Collin",1600,T14,0
2,"Matsuyama, Hideki",1800,T20,0
3,"Gotterup, Chris",2000,T40,0
4,"Noren, Alex",2200,T27,0
5,"Kitayama, Kurt",2200,T48,0
6,"Kim, Si Woo",2200,T20,0
7,"Hojgaard, Rasmus",2200,T14,0
8,"Yu, Kevin",2800,T20,0
9,"Thorbjornsen, Michael",3000,3,0


In [31]:
# RECOVERY CELL: Recreate event_id if missing
print("="*70)
print("RECOVERY: Checking and recreating event_id if needed")
print("="*70)

if 'event_id' not in df_win.columns:
    print("\n⚠️  event_id missing - recreating from timestamps...")
    
    # Parse timestamps if needed
    if 'close_time_dt' not in df_win.columns:
        df_win['close_time_dt'] = pd.to_datetime(df_win['close_time'], errors='coerce')
    
    # Create event_date
    df_win['event_date'] = df_win['close_time_dt'].dt.date
    
    # Recreate event_id
    df_win['event_id'] = (
        df_win['year'].astype(str) + '_' + 
        df_win['event_date'].astype(str)
    )
    
    print(f"✓ Recreated event_id: {df_win['event_id'].nunique()} unique events")
else:
    print(f"✓ event_id exists: {df_win['event_id'].nunique()} unique events")

# Verify we have what we need
print(f"\nDataset check:")
print(f"  Rows: {len(df_win):,}")
print(f"  Events: {df_win['event_id'].nunique():,}")
print(f"  Players: {df_win['dg_id'].nunique():,}")
print(f"\nRequired columns present:")
for col in ['event_id', 'dg_id', 'implied_raw', 'dec_odds', 'Y']:
    print(f"  {col}: {col in df_win.columns}")

RECOVERY: Checking and recreating event_id if needed
✓ event_id exists: 2 unique events

Dataset check:
  Rows: 312
  Events: 2
  Players: 78

Required columns present:
  event_id: True
  dg_id: True
  implied_raw: False
  dec_odds: False
  Y: True


In [32]:
# ============================================================================
# CELL 7: Validate Winner Counts per Event
# ============================================================================

"""

GOAL: Ensure data quality by checking that each event has exactly 1 winner

EXPECTED: Every PGA tour event has exactly one winner
- Even Playoffs result in a single winner.
- Co-winners (T1) are extremely rare but would have Y=1

DATA QUALITY ISSUES TO CATCH:
1. Events with 0 winners
    -> Missing outcome data
    -> event was cancelled
    -> Incomplete data pull

  
    2. Events with 2+ winners
       → Duplicate rows (same player multiple times)
       → Co-winner situation (rare but valid)
       → Multiple books/timestamps not deduplicated yet

ACTION: Flag problematic events for manual review or exclusion
""" 

# ============================================================================
# COUNT WINNERS PER EVENT
# ============================================================================
# Sum Y column (binary) per event → should equal 1

winner_counts = df_win.groupby('event_id')["Y"].sum()


# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

total_events = len(winner_counts)
good_events = (winner_counts == 1).sum()
zero_winner_events = (winner_counts == 0).sum()
multi_winner_events = (winner_counts >1).sum()

print(f"\nWinner count validation:")
print(f"  Total events: {total_events:,}")
print(f"  Events with exactly 1 winner: {good_events:,} ({100*good_events/total_events:.1f}%)")
print(f"  Events with 0 winners: {zero_winner_events:,}")
print(f"  Events with 2+ winners: {multi_winner_events:,}")


# ============================================================================
# INVESTIGATE ZERO-WINNER EVENTS
# ============================================================================
if zero_winner_events > 0:
    print(f"\n{'='*70}")
    print(f"⚠️  ISSUE: {zero_winner_events} events have NO winner")
    print(f"{'='*70}")
    
    no_winner = winner_counts[winner_counts == 0]
    
    # Show list of problematic events
    print(f"\nEvents with 0 winners:")
    print(no_winner.head(10))
    
    # Show sample data from one problematic event
    sample_event = no_winner.index[0]
    print(f"\nSample data from event: {sample_event}")
    sample_data = df_win[df_win['event_id'] == sample_event][
        ['event_id', 'player_name', 'outcome', 'Y', 'year']
    ].head()
    display(sample_data)
    
    print("\nPossible causes:")
    print("  - Tournament canceled/postponed")
    print("  - Incomplete data (still in progress)")
    print("  - Outcome parsing failed")
    print("  → Recommend: Exclude these events from modeling")

# ============================================================================
# INVESTIGATE MULTI-WINNER EVENTS
# ============================================================================
if multi_winner_events > 0:
    print(f"\n{'='*70}")
    print(f"⚠️  ISSUE: {multi_winner_events} events have MULTIPLE winners")
    print(f"{'='*70}")
    
    multi_winner = winner_counts[winner_counts > 1]
    
    # Show list of problematic events
    print(f"\nEvents with 2+ winners:")
    print(multi_winner.head(10))
    
    # Show sample data from one problematic event
    sample_event = multi_winner.index[0]
    print(f"\nSample data from event: {sample_event}")
    sample_data = df_win[df_win['event_id'] == sample_event][
        ['event_id', 'player_name', 'outcome', 'Y', 'book', 'close_time']
    ]
    
    # Show only winners from this event
    winners_only = sample_data[sample_data['Y'] == 1]
    display(winners_only)
    
    print("\nPossible causes:")
    print("  - True co-winners (tied 1st, no playoff) - RARE but valid")
    print("  - Duplicate rows not yet removed (same player, multiple books/times)")
    print("  → Will be resolved in deduplication step (Cell 12)")

# ============================================================================
# FLAG PROBLEMATIC EVENTS
# ============================================================================
# Add column to mark rows in events with bad winner counts
# We'll decide later whether to exclude or fix these
problematic_event_ids = winner_counts[winner_counts != 1].index

df_win['flag_bad_winner_count'] = df_win['event_id'].isin(problematic_event_ids)

flagged_rows = df_win['flag_bad_winner_count'].sum()
flagged_events = (winner_counts != 1).sum()

print(f"\n{'='*70}")
print(f"✓ Flagging complete:")
print(f"  Flagged rows: {flagged_rows:,} (in {flagged_events} events)")
print(f"  Clean rows: {len(df_win) - flagged_rows:,} (in {total_events - flagged_events} events)")
print(f"{'='*70}")

if flagged_rows > 0:
    print(f"\n⚠️  Note: Flagged events retained for now, will address in:")
    print(f"     - Deduplication (Cell 12)")
    print(f"     - Final validation (Cell 13)")
else:
    print(f"\n✓ All events have exactly 1 winner - data quality excellent!")





Winner count validation:
  Total events: 2
  Events with exactly 1 winner: 0 (0.0%)
  Events with 0 winners: 0
  Events with 2+ winners: 2

⚠️  ISSUE: 2 events have MULTIPLE winners

Events with 2+ winners:
event_id
2023_2025-10-08    2
2024_2025-10-08    2
Name: Y, dtype: int64

Sample data from event: 2023_2025-10-08


Unnamed: 0,event_id,player_name,outcome,Y,book,close_time
0,2023_2025-10-08,"Schauffele, Xander",1,1,draftkings,2025-10-08 15:37
156,2023_2025-10-08,"Schauffele, Xander",1,1,fanduel,2025-10-08 15:37



Possible causes:
  - True co-winners (tied 1st, no playoff) - RARE but valid
  - Duplicate rows not yet removed (same player, multiple books/times)
  → Will be resolved in deduplication step (Cell 12)

✓ Flagging complete:
  Flagged rows: 312 (in 2 events)
  Clean rows: 0 (in 0 events)

⚠️  Note: Flagged events retained for now, will address in:
     - Deduplication (Cell 12)
     - Final validation (Cell 13)


In [33]:
# ============================================================================
# CELL 8: Clean Odds Data - Remove Missing & Invalid Values
# ============================================================================
"""
GOAL: Ensure all odds are valid American format and non-null

AMERICAN ODDS FORMAT:
    Positive: +150 means bet $100 to win $150 (underdog)
    Negative: -150 means bet $150 to win $100 (favorite)
    
VALID FORMATS:
    ✓ "+1000"
    ✓ "-110"
    ✓ "+250"
    ✗ "1000" (missing +/- sign)
    ✗ "N/A" (non-numeric)
    ✗ "" (empty string)

WHY CRITICAL:
    - Cannot convert to decimal odds without valid American odds
    - Cannot calculate implied probabilities
    - Cannot de-vig without probabilities
    → Invalid odds = row must be dropped

DECISION: Drop rows with missing/invalid odds (no imputation)
    Rationale: Fabricating odds would create fake market signal
"""

print("="*70)
print("STEP 5: CLEAN ODDS DATA")
print("="*70)

# ============================================================================
# CHECK FOR NULL VALUES
# ============================================================================

print("\nChecking for missing odds")
null_close = df_win['close_odds'].isna().sum()
null_open = df_win['open_odds'].isna().sum()


print(f"  close_odds null: {null_close:,}")
print(f"  open_odds null: {null_open:,}")

# ============================================================================
# VALIDATE AMERICAN ODDS FORMAT
# ============================================================================
print("\nValidating American odds format...")

def is_valid_american_odds(odds_str):
    """
    Check if string is valid American odds format.
    
    Valid: "+150", "-110", "+2500"
    Invalid: "150" (no sign), "N/A", None, ""
    
    Parameters
    ----------
    odds_str : str or NaN
        Odds string to validate
    
    Returns
    -------
    bool
        True if valid American odds format, False otherwise
    """
    # Null values are invalid
    if pd.isna(odds_str):
        return False
    
    # Convert to string and strip whitespace
    odds_str = str(odds_str).strip()
    
    # Must start with + or -
    if not (odds_str.startswith('+') or odds_str.startswith('-')):
        return False
    
    # Everything after +/- must be numeric
    try:
        # Remove +/- sign and try to convert to integer
        numeric_part = odds_str.replace('+', '').replace('-', '')
        int(numeric_part)
        return True
    except ValueError:
        # Conversion failed - not a valid number
        return False

# Apply validation to close_odds column
df_win['valid_odds'] = df_win['close_odds'].apply(is_valid_american_odds)

# Count invalid odds
invalid_count = (~df_win['valid_odds']).sum()

print(f"\n  Valid odds: {df_win['valid_odds'].sum():,}")
print(f"  Invalid odds: {invalid_count:,}")



# ============================================================================
# SHOW EXAMPLES OF INVALID ODDS
# ============================================================================
if invalid_count > 0:
    print(f"\n⚠️  WARNING: {invalid_count} rows have invalid odds format")
    print("\nSample invalid odds (first 10):")
    invalid_sample = df_win[~df_win['valid_odds']][
        ['player_name', 'close_odds', 'year', 'event_id']
    ].head(10)
    display(invalid_sample)

# ============================================================================
# DROP INVALID ODDS
# ============================================================================
# Cannot proceed without valid odds - must drop these rows
before_count = len(df_win)

# Keep only rows with valid odds
df_win = df_win[df_win['valid_odds']].copy()

# Remove temporary validation column
df_win.drop('valid_odds', axis=1, inplace=True)

after_count = len(df_win)
dropped = before_count - after_count

# ============================================================================
# SUMMARY
# ============================================================================
print(f"\n{'='*70}")
print(f"✓ Odds cleaning complete:")
print(f"  Before: {before_count:,} rows")
print(f"  After:  {after_count:,} rows")
print(f"  Dropped: {dropped:,} rows ({100*dropped/before_count:.2f}%)")
print(f"{'='*70}")

if dropped > 0:
    print(f"\nNote: Dropped rows had missing or malformed odds")
    print(f"      Cannot impute market prices - deletion is appropriate")
else:
    print(f"\n✓ No invalid odds found - all data retained!")

STEP 5: CLEAN ODDS DATA

Checking for missing odds
  close_odds null: 0
  open_odds null: 0

Validating American odds format...

  Valid odds: 0
  Invalid odds: 312


Sample invalid odds (first 10):


Unnamed: 0,player_name,close_odds,year,event_id
0,"Schauffele, Xander",1000,2023,2023_2025-10-08
1,"Morikawa, Collin",1600,2023,2023_2025-10-08
2,"Matsuyama, Hideki",1800,2023,2023_2025-10-08
3,"Gotterup, Chris",2000,2023,2023_2025-10-08
4,"Noren, Alex",2200,2023,2023_2025-10-08
5,"Kitayama, Kurt",2200,2023,2023_2025-10-08
6,"Kim, Si Woo",2200,2023,2023_2025-10-08
7,"Hojgaard, Rasmus",2200,2023,2023_2025-10-08
8,"Yu, Kevin",2800,2023,2023_2025-10-08
9,"Thorbjornsen, Michael",3000,2023,2023_2025-10-08



✓ Odds cleaning complete:
  Before: 312 rows
  After:  0 rows
  Dropped: 312 rows (100.00%)

Note: Dropped rows had missing or malformed odds
      Cannot impute market prices - deletion is appropriate


In [34]:
# ============================================================================
# CELL 9: Convert Odds to Decimal & Calculate Implied Probabilities
# ============================================================================
"""
GOAL: Transform American odds into formats needed for analysis

CONVERSIONS:
    American → Decimal → Implied Probability
    
EXAMPLE:
    American: +1000
    Decimal:  11.0  (total return per $1 wagered, including stake)
    Implied:  0.0909 (9.09% probability)

WHY MULTIPLE FORMATS:
    - Decimal odds: Easier to work with mathematically
    - Implied probability: Raw market belief before vig removal
    - Both needed: Decimal for EV calculations, implied for de-vigging

FORMULAS:
    If American > 0:  decimal = 1 + (American / 100)
    If American < 0:  decimal = 1 + (100 / |American|)
    
    Implied probability = 1 / decimal
"""

print("="*70)
print("STEP 6: CONVERT ODDS & CALCULATE IMPLIED PROBABILITIES")
print("="*70)

# ============================================================================
# CONVERT STRING TO NUMERIC (ROBUST VERSION)
# ============================================================================
print("\nConverting American odds to numeric...")

# Handle both string and numeric types
def convert_american_odds(value):
    """Convert American odds (string or numeric) to integer"""
    if isinstance(value, (int, float)):
        return int(value)
    # If string, remove + and spaces
    return int(str(value).replace('+', '').replace(' ', ''))

df_win['american_odds'] = df_win['close_odds'].apply(convert_american_odds)

print(f"  ✓ Converted {len(df_win):,} odds to numeric format")

# ============================================================================
# CONVERT TO DECIMAL ODDS
# ============================================================================
# Use utility function from devig_utils.py
print("\nConverting to decimal odds...")

df_win['dec_odds'] = american_to_decimal(df_win['american_odds'])

print(f"  ✓ Calculated decimal odds")

# ============================================================================
# CALCULATE IMPLIED PROBABILITIES
# ============================================================================
# This is the RAW implied probability BEFORE de-vigging
# Sum of all implied probs per event will be > 1.0 (bookmaker overround)
print("\nCalculating raw implied probabilities...")

df_win['implied_raw'] = implied_from_american(df_win['american_odds'])

print(f"  ✓ Calculated implied probabilities")

# ============================================================================
# SHOW CONVERSION EXAMPLES
# ============================================================================
print("\nSample conversions (first 10 rows):")
conversion_cols = ['player_name', 'close_odds', 'american_odds', 'dec_odds', 'implied_raw']
conversion_sample = df_win[conversion_cols].head(10)

# Format for readability
print(conversion_sample.to_string(index=False))

# ============================================================================
# VALIDATION CHECKS
# ============================================================================
print(f"\n{'='*70}")
print("Validation checks:")
print(f"{'='*70}")

# Check 1: All decimal odds should be >= 1.0
all_decimal_valid = (df_win['dec_odds'] >= 1.0).all()
print(f"  1. All decimal odds >= 1.0: {all_decimal_valid}")

if not all_decimal_valid:
    # Should never happen if conversion is correct
    bad_decimal = df_win[df_win['dec_odds'] < 1.0]
    print(f"\n     ⚠️  ERROR: {len(bad_decimal)} rows have decimal odds < 1.0")
    print(bad_decimal[['player_name', 'close_odds', 'american_odds', 'dec_odds']].head())

# Check 2: All implied probabilities should be between 0 and 1
all_implied_valid = ((df_win['implied_raw'] > 0) & (df_win['implied_raw'] < 1)).all()
print(f"  2. All implied probs in (0, 1): {all_implied_valid}")

if not all_implied_valid:
    # Should never happen if conversion is correct
    bad_implied = df_win[~((df_win['implied_raw'] > 0) & (df_win['implied_raw'] < 1))]
    print(f"\n     ⚠️  ERROR: {len(bad_implied)} rows have implied prob out of bounds")
    print(bad_implied[['player_name', 'close_odds', 'implied_raw']].head())

# ============================================================================
# DISTRIBUTION SUMMARY
# ============================================================================
print(f"\n✓ Odds distribution summary:")
print(df_win[['dec_odds', 'implied_raw']].describe())

# Interpretation notes
print(f"\nInterpretation:")
median_dec = df_win['dec_odds'].median()
median_implied = df_win['implied_raw'].median()
print(f"  Median decimal odds: {median_dec:.2f} ({median_implied:.4f} implied)")
print(f"  → Typical player has ~{100*median_implied:.2f}% win probability (before de-vig)")

print(f"\n  Min odds: {df_win['dec_odds'].min():.2f} (favorite)")
print(f"  Max odds: {df_win['dec_odds'].max():.2f} (extreme longshot)")

STEP 6: CONVERT ODDS & CALCULATE IMPLIED PROBABILITIES

Converting American odds to numeric...
  ✓ Converted 0 odds to numeric format

Converting to decimal odds...
  ✓ Calculated decimal odds

Calculating raw implied probabilities...
  ✓ Calculated implied probabilities

Sample conversions (first 10 rows):
Empty DataFrame
Columns: [player_name, close_odds, american_odds, dec_odds, implied_raw]
Index: []

Validation checks:
  1. All decimal odds >= 1.0: True
  2. All implied probs in (0, 1): True

✓ Odds distribution summary:
       dec_odds  implied_raw
count  0.000000     0.000000
mean        NaN          NaN
std         NaN          NaN
min         NaN          NaN
25%         NaN          NaN
50%         NaN          NaN
75%         NaN          NaN
max         NaN          NaN

Interpretation:
  Median decimal odds: nan (nan implied)
  → Typical player has ~nan% win probability (before de-vig)

  Min odds: nan (favorite)
  Max odds: nan (extreme longshot)


In [35]:
# Create event_id from tournament/date combination
df_win['event_id'] = df_win['tour'].astype(str) + '_' + df_win['event_date'].astype(str)

print(f"Created event_id column")
print(f"Unique events: {df_win['event_id'].nunique()}")

Created event_id column
Unique events: 0


In [36]:
# ============================================================================
# CELL 10: Remove Bookmaker Vig (Proportional De-vig Method)
# ============================================================================
"""
GOAL: Convert biased market probabilities to fair probabilities

THE PROBLEM: Bookmaker Overround (Vig)
    - Sum of raw implied probabilities per event > 1.0
    - Example: Two players at -110 each (52.38% implied)
      → Total: 104.76% (should be 100%)
    - Extra 4.76% is bookmaker's built-in profit margin

SOLUTION: Proportional De-vig
    - Normalize probabilities to sum to exactly 1.0
    - Formula: fair_prob[i] = implied_prob[i] / sum(implied_probs)
    
EXAMPLE:
    Raw implied probs: [0.55, 0.55] → Sum = 1.10 (10% vig)
    De-vigged probs:   [0.50, 0.50] → Sum = 1.00 (fair)

ASSUMPTIONS:
    - Bookmaker applies same percentage margin to all players
    - Simpler than power/additive methods (good for Sprint 2)
    - More sophisticated methods reserved for Sprint 3

OUTPUTS:
    - p_book: De-vigged fair probability per player
    - overround: Bookmaker margin per event (typically 1.15-1.20 for PGA)
"""

print("="*70)
print("STEP 7: DE-VIG ODDS (PROPORTIONAL METHOD)")
print("="*70)

# ============================================================================
# DEFINE DE-VIG FUNCTION - FIXED VERSION
# ============================================================================
def devig_event_group(group):
    """
    Apply proportional de-vig to one event's odds.
    
    This function is applied per event using groupby().
    
    Steps:
        1. Sum all raw implied probabilities
        2. Calculate overround (how much > 1.0)
        3. Normalize each probability by dividing by total
    
    Parameters
    ----------
    group : pd.DataFrame
        All rows (players) for a single event
    
    Returns
    -------
    pd.DataFrame
        Same group with two new columns:
        - overround: Sum of implied_raw (bookmaker margin)
        - p_book: De-vigged fair probability (sums to 1.0)
    """
    # Make a copy to avoid modifying original
    group = group.copy()
    
    # Calculate total (should be > 1.0 due to vig)
    total_implied = group['implied_raw'].sum()
    
    # Store overround for analysis (same value for all rows in group)
    group['overround'] = total_implied
    
    # Proportional normalization - convert Series to array for proportional_devig
    # then assign back to ensure proper indexing
    devigged_probs = proportional_devig(group['implied_raw'].values)
    group['p_book'] = devigged_probs
    
    return group

# ============================================================================
# APPLY DE-VIG PER EVENT
# ============================================================================
print("\nApplying proportional de-vig per event...")
print(f"  Processing {df_win['event_id'].nunique():,} unique events...")

# Group by event and apply de-vig function
# Use transform=False (default) to get full DataFrame back
df_win = (
    df_win
    .groupby('event_id', group_keys=False)  # Process each event separately
    .apply(devig_event_group)                # Apply de-vig function
)

# Reset index to clean up any multi-index issues
df_win = df_win.reset_index(drop=True)

print(f"  ✓ De-vig complete for all events")

# ============================================================================
# VALIDATE DE-VIG RESULTS
# ============================================================================
print(f"\n{'='*70}")
print("Validation: De-vigged probabilities")
print(f"{'='*70}")

# Critical check: p_book should sum to exactly 1.0 per event
event_prob_sums = df_win.groupby('event_id')['p_book'].sum()

print(f"\nDe-vigged probability sums per event:")
print(f"  Mean:   {event_prob_sums.mean():.15f}")
print(f"  Std:    {event_prob_sums.std():.15e}")
print(f"  Min:    {event_prob_sums.min():.15f}")
print(f"  Max:    {event_prob_sums.max():.15f}")

# Check within numerical tolerance (floating point precision)
all_sum_to_one = np.allclose(event_prob_sums, 1.0, atol=1e-10)
print(f"\n  All events sum to 1.0 (±1e-10): {all_sum_to_one}")

if not all_sum_to_one:
    # Should not happen with proportional method
    print("\n  ⚠️  WARNING: Some events don't sum to exactly 1.0")
    bad_events = event_prob_sums[~np.isclose(event_prob_sums, 1.0, atol=1e-10)]
    print(f"  Problematic events: {len(bad_events)}")
    print(bad_events.head())

# ============================================================================
# OVERROUND (VIG) ANALYSIS
# ============================================================================
print(f"\n{'='*70}")
print("Bookmaker Margin (Overround) Analysis")
print(f"{'='*70}")

# Get one overround value per event (all players in event have same value)
overround_stats = df_win.groupby('event_id')['overround'].first()

print(f"\nOverround statistics:")
print(overround_stats.describe())

# Convert to percentage margin for easier interpretation
print(f"\nInterpretation (as margin %):")
print(f"  Min margin:      {100*(overround_stats.min()-1):.2f}%")
print(f"  25th percentile: {100*(overround_stats.quantile(0.25)-1):.2f}%")
print(f"  Median margin:   {100*(overround_stats.median()-1):.2f}%")
print(f"  75th percentile: {100*(overround_stats.quantile(0.75)-1):.2f}%")
print(f"  Max margin:      {100*(overround_stats.max()-1):.2f}%")

print(f"\nTypical PGA outrights vig: 15-20%")
print(f"Our data median: {100*(overround_stats.median()-1):.1f}%")

# ============================================================================
# SHOW BEFORE/AFTER EXAMPLE
# ============================================================================
# Pick one event and show de-vig effect
sample_event = df_win['event_id'].iloc[0]

print(f"\n{'='*70}")
print(f"Example: Event {sample_event}")
print(f"{'='*70}")

sample_df = df_win[df_win['event_id'] == sample_event][
    ['player_name', 'dec_odds', 'implied_raw', 'p_book', 'Y']
].head(10)

print(sample_df.to_string(index=False))

# Calculate sums for this event
sample_implied_sum = df_win[df_win['event_id'] == sample_event]['implied_raw'].sum()
sample_pbook_sum = df_win[df_win['event_id'] == sample_event]['p_book'].sum()

print(f"\nBefore de-vig (raw implied):  Sum = {sample_implied_sum:.6f}")
print(f"After de-vig (p_book):        Sum = {sample_pbook_sum:.15f}")
print(f"Removed vig:                  {100*(sample_implied_sum-1):.2f}%")

print(f"\n✓ De-vig successful - probabilities now sum to 1.0 per event")


STEP 7: DE-VIG ODDS (PROPORTIONAL METHOD)

Applying proportional de-vig per event...
  Processing 0 unique events...
  ✓ De-vig complete for all events

Validation: De-vigged probabilities


  .apply(devig_event_group)                # Apply de-vig function


KeyError: 'Column not found: p_book'

In [None]:
# ============================================================================
# EMERGENCY FIX: Manually create de-vig columns
# ============================================================================
print("="*70)
print("EMERGENCY: Creating de-vig columns manually")
print("="*70)

# Check if we have data
print(f"\nCurrent df_win shape: {df_win.shape}")

if len(df_win) == 0:
    print("\n⚠️  CRITICAL: df_win is empty!")
    print("\nYou need to:")
    print("   1. Restart kernel")
    print("   2. Re-run Cell 1 (imports)")
    print("   3. Re-run Cell 2 (load data)")
    print("   4. Continue through cells 3-9")
    print("   5. Skip to this fix")
    raise ValueError("Cannot proceed with empty DataFrame")

# Verify we have required columns
required_cols = ['event_id', 'implied_raw']
missing = [col for col in required_cols if col not in df_win.columns]

if missing:
    print(f"\n⚠️  Missing columns: {missing}")
    raise ValueError(f"Cannot de-vig without: {missing}")

# Calculate overround per event
print("\nCalculating overround per event...")
event_overround = df_win.groupby('event_id')['implied_raw'].transform('sum')
df_win['overround'] = event_overround

# De-vig: divide each implied_raw by the event's overround
print("De-vigging probabilities...")
df_win['p_book'] = df_win['implied_raw'] / df_win['overround']

# Validate
print("\nValidation:")
event_sums = df_win.groupby('event_id')['p_book'].sum()
print(f"  Events: {len(event_sums)}")
print(f"  All sum to 1.0: {np.allclose(event_sums, 1.0, atol=1e-10)}")
print(f"  Mean overround: {df_win.groupby('event_id')['overround'].first().mean():.4f}")
print(f"  Median vig: {100*(df_win.groupby('event_id')['overround'].first().median()-1):.1f}%")

print("\n✓ De-vig columns created successfully")
print(f"✓ Final shape: {df_win.shape}")

EMERGENCY: Creating de-vig columns manually

Current df_win shape: (0, 22)

⚠️  CRITICAL: df_win is empty!

You need to:
   1. Restart kernel
   2. Re-run Cell 1 (imports)
   3. Re-run Cell 2 (load data)
   4. Continue through cells 3-9
   5. Skip to this fix


ValueError: Cannot proceed with empty DataFrame