# GolfGod Data Exploration
## Phase 0: Proof of Concept Analysis

This notebook explores our initial data collection and validates our betting edge hypothesis.

In [None]:
# Import required libraries
import sys
import os
# Extend sys.path so the project-local `src` package imported below can be found.
# os.path.abspath('.') is the kernel's current working directory, and the two
# dirname() calls climb two levels above it before appending.
# NOTE(review): this resembles the dirname(dirname(abspath(__file__))) idiom, but
# starting from a directory instead of a file it lands on the GRANDPARENT of the
# working directory. The relative 'data/...' paths used later in this notebook
# suggest the kernel may be started from the project root — confirm which
# directory actually contains `src/` for the intended launch location.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('.'))))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Import our modules
from src.collectors.pga_scraper import PGATourScraper
from src.collectors.weather_collector import WeatherCollector
from src.collectors.odds_importer import OddsImporter
from src.backtesting.strategy import WeatherEdgeStrategy, FormMomentumStrategy, CourseFitStrategy
from src.backtesting.roi_calculator import ROICalculator
from src.database.db_manager import GolfDatabase

# Display configuration: both pandas options are set to None
# (no column limit; width left for pandas to auto-detect), seaborn uses a white grid
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
sns.set_style('whitegrid')

print('Libraries loaded successfully!')

## 1. Load Tournament Data

In [None]:
# Pull the 2024 schedule from the scraper and parse its date column
scraper = PGATourScraper()
tournaments = scraper.get_tournaments_2024()
tournaments_df = pd.DataFrame(tournaments).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Headline numbers for the season
tournament_dates = tournaments_df['date']
print(f"Total tournaments: {len(tournaments_df)}")
print(f"Date range: {tournament_dates.min()} to {tournament_dates.max()}")
print(f"Total purse: ${tournaments_df['purse'].sum():,.0f}")

# Preview the first 10 rows (rendered as the cell's output)
tournaments_df.head(10)

## 2. Analyze Tournament Results

In [None]:
# Use the Masters as a worked example of a single tournament's results
masters_results = scraper.get_tournament_results('Masters Tournament')
leaderboard_df = pd.DataFrame(masters_results['leaderboard'])

# Print the first 10 leaderboard rows
leaderboard_columns = ['position', 'player', 'total', 'prize_money']
print("Masters Tournament Leaderboard:")
print(leaderboard_df[leaderboard_columns].head(10))

# Bar chart of prize money: one bar per leaderboard row, labelled by position
bar_slots = range(len(leaderboard_df))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_slots, leaderboard_df['prize_money'])
ax.set_xlabel('Finishing Position')
ax.set_ylabel('Prize Money ($)')
ax.set_title('Prize Money Distribution - Masters Tournament')
ax.set_xticks(bar_slots)
ax.set_xticklabels(leaderboard_df['position'])
plt.show()

## 3. Weather Impact Analysis

In [None]:
# Set up the weather collector
weather_collector = WeatherCollector()

# Fetch conditions for the Masters (Augusta, 2024-04-11)
masters_weather = weather_collector.get_tournament_weather(
    location='Augusta',
    tournament_date='2024-04-11'
)

# Summarise the conditions and list every metric the collector reports
weather_analysis = weather_collector.analyze_weather_conditions(masters_weather)

print("Masters Weather Analysis:")
for metric, reading in weather_analysis.items():
    print(f"  {metric}: {reading}")

# Draw the charts only when per-day weather data is available
if masters_weather and 'weather_days' in masters_weather:
    # Keep the first four entries: the tournament days
    weather_df = pd.DataFrame(masters_weather['weather_days'][:4])

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    wind_ax, temp_ax = axes

    # Left panel: wind speed per day with the 15 mph high-wind reference line
    wind_ax.bar(range(len(weather_df)), weather_df['wind_mph'])
    wind_ax.set(
        xlabel='Tournament Day',
        ylabel='Wind Speed (mph)',
        title='Wind Conditions by Day',
    )
    wind_ax.axhline(y=15, color='r', linestyle='--', label='High Wind Threshold')
    wind_ax.legend()

    # Right panel: daily high (red) and low (blue) temperatures
    temp_ax.plot(weather_df['temp_max_f'], 'r-', label='High')
    temp_ax.plot(weather_df['temp_min_f'], 'b-', label='Low')
    temp_ax.set(
        xlabel='Tournament Day',
        ylabel='Temperature (°F)',
        title='Temperature Range by Day',
    )
    temp_ax.legend()

    plt.tight_layout()
    plt.show()

## 4. Odds Analysis

In [None]:
# Initialize odds importer
odds_importer = OddsImporter()

# Load sample odds (you'll replace with real data)
sample_odds_file = 'data/raw/sample_odds.csv'
if os.path.exists(sample_odds_file):
    odds_df = odds_importer.import_csv(sample_odds_file)

    print(f"Loaded {len(odds_df)} odds records")
    print("\nTop 5 favorites:")
    # Favorites are the players the market rates MOST likely to win, i.e. the
    # largest implied probabilities. nsmallest() would list the five longshots.
    print(odds_df.nlargest(5, 'implied_prob')[['player', 'outright_odds', 'implied_prob']])

    # Calculate and display vig (the importer computes it for the named tournament)
    vig = odds_importer.calculate_vig(odds_df, 'Masters 2024')
    print(f"\nBookmaker vig: {vig}%")

    # Visualize implied probabilities, one horizontal bar per player
    plt.figure(figsize=(10, 6))
    plt.barh(odds_df['player'], odds_df['implied_prob'])
    plt.xlabel('Implied Probability')
    plt.title('Market Implied Win Probabilities')
    plt.tight_layout()
    plt.show()
else:
    # The sample file is optional; skip the analysis instead of failing the run
    print("No odds data found. Run odds_importer to create sample data.")

## 5. Strategy Backtesting

In [None]:
# One instance of each candidate strategy
weather_strategy = WeatherEdgeStrategy()
form_strategy = FormMomentumStrategy()
course_strategy = CourseFitStrategy()

# Hand-built sample tournament used to exercise the strategies
tournament_data = {
    'players': [
        {
            'name': 'Scottie Scheffler',
            'base_probability': 0.15,
            'wind_performance': 0.9,
            'recent_finishes': [1, 3, 2, 5, 7],
            'course_history': [2, 1, 4],
        },
        {
            'name': 'Jon Rahm',
            'base_probability': 0.12,
            'wind_performance': 0.7,
            'recent_finishes': [5, 2, 8, 3, 12],
            'course_history': [3, 5, 6],
        },
        {
            'name': 'Rory McIlroy',
            'base_probability': 0.10,
            'wind_performance': 0.6,
            'recent_finishes': [8, 10, 5, 15, 20],
            'course_history': [10, 8, 5],
        },
    ],
    'weather': {'avg_wind_mph': 18, 'temp_high': 75},
    'course_type': 'links'
}

# Each strategy produces its own per-player probabilities
weather_probs = weather_strategy.calculate_probabilities(tournament_data)
form_probs = form_strategy.calculate_probabilities(tournament_data)
course_probs = course_strategy.calculate_probabilities(tournament_data)

# One row per player: the base probability next to each strategy's estimate
# (a player missing from a strategy's result is shown as 0)
comparison_data = [
    {
        'Player': entrant['name'],
        'Market Prob': entrant['base_probability'],
        'Weather Strategy': weather_probs.get(entrant['name'], 0),
        'Form Strategy': form_probs.get(entrant['name'], 0),
        'Course Strategy': course_probs.get(entrant['name'], 0),
    }
    for entrant in tournament_data['players']
]
comparison_df = pd.DataFrame(comparison_data)

print("Strategy Probability Comparison:")
print(comparison_df.round(3))

# Edge of every strategy versus the market probability, reported per player
strategy_columns = {
    'Weather': (weather_strategy, 'Weather Strategy'),
    'Form': (form_strategy, 'Form Strategy'),
    'Course': (course_strategy, 'Course Strategy'),
}
for _, player_row in comparison_df.iterrows():
    # Compute all three edges first, then print, so output stays grouped per player
    player_edges = {
        label: strategy.calculate_edge(player_row[column], player_row['Market Prob'])
        for label, (strategy, column) in strategy_columns.items()
    }

    print(f"\n{player_row['Player']} Edges:")
    for label, edge in player_edges.items():
        print(f"  {label}: {edge*100:.1f}%")

## 6. ROI Simulation

In [None]:
# ROI calculator seeded with a 1000 starting bankroll
roi_calc = ROICalculator(initial_bankroll=1000)

# Hand-written sample bets standing in for real strategy output.
# In production, these would come from actual tournament results.
simulated_bets = [
    {'stake': 50, 'odds': 12.0, 'won': False},  # Scottie at +1100
    {'stake': 30, 'odds': 15.0, 'won': False},  # Jon at +1400
    {'stake': 40, 'odds': 18.0, 'won': True},   # Rory at +1700 - Winner!
    {'stake': 25, 'odds': 25.0, 'won': False},  # Longshot
    {'stake': 35, 'odds': 10.0, 'won': False},  # Favorite
    {'stake': 45, 'odds': 8.0, 'won': True},    # Another winner!
]

# Register every bet with the calculator, in order
for wager in simulated_bets:
    roi_calc.add_bet(wager['stake'], wager['odds'], wager['won'])

# Text report produced by the calculator
print(roi_calc.generate_report())

# Running profit after each recorded bet, starting from 0 before the first one
running_profit = 0
cumulative_profit = [running_profit]
for recorded_bet in roi_calc.bets:
    running_profit = running_profit + recorded_bet['profit']
    cumulative_profit.append(running_profit)

# Line chart of the running profit with a dashed break-even line at 0
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cumulative_profit, 'b-', linewidth=2)
ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
ax.set_xlabel('Bet Number')
ax.set_ylabel('Cumulative Profit ($)')
ax.set_title('Bankroll Progression')
ax.grid(True, alpha=0.3)
plt.show()

## 7. Next Steps

### Data Collection Priorities:
1. **Historical Odds** - Most critical! We need these from our co-founder
2. **More Tournament Results** - Expand beyond sample data
3. **Weather Correlation** - Match weather to actual results

### Analysis To-Do:
- [ ] Correlate wind conditions with winning scores
- [ ] Analyze form patterns over time
- [ ] Identify course clustering patterns
- [ ] Calculate actual vs expected performance

### Questions for Co-founder:
1. What bookmakers should we track?
2. Is a 5% minimum edge realistic?
3. Should we focus on outright winners or top-5/10?
4. What's our risk tolerance for variance?

In [None]:
# Save our analysis results
import json

# Snapshot of this run: timestamp, tournament count, the weather analysis,
# the sample ROI from the simulated bets, and the agreed follow-up actions
analysis_summary = {
    'date': datetime.now().isoformat(),
    'tournaments_analyzed': len(tournaments_df),
    'weather_impact': weather_analysis,
    'sample_roi': roi_calc.calculate_roi(),
    'next_steps': [
        'Collect real historical odds data',
        'Scrape actual tournament results',
        'Run full backtest on 50+ tournaments',
        'Validate with co-founder'
    ]
}

summary_path = 'data/analysis_summary.json'

# Create the output directory first: on a fresh checkout `data/` may not exist,
# and open(..., 'w') would otherwise raise FileNotFoundError
os.makedirs(os.path.dirname(summary_path), exist_ok=True)

with open(summary_path, 'w') as f:
    json.dump(analysis_summary, f, indent=2)

print(f"Analysis complete! Summary saved to {summary_path}")