In [138]:
# Cell 1: Install Dependencies
# Install necessary packages required for the notebook.

!pip install nba_api pandas numpy scikit-learn lightgbm xgboost seaborn matplotlib ipywidgets optuna joblib shap requests tensorflow fastapi uvicorn nest_asyncio pyngrok



In [172]:
# Cell 2: Import Libraries and Set Up Environment

import sys
import os
import logging
import pickle
import warnings
warnings.filterwarnings('ignore')  # Ignore warning messages

# Check if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

# Import data manipulation libraries
import pandas as pd
import numpy as np

# Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import datetime utilities
from datetime import datetime, timedelta

# Import machine learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import xgboost as xgb

# Import deep learning libraries
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Import Optuna for hyperparameter optimization
import optuna

# Import SHAP for model interpretation
import shap

# Import requests and retry adapters
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Import NBA API libraries
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams

# Import FastAPI and Uvicorn for API development
from fastapi import FastAPI
import uvicorn
import nest_asyncio
nest_asyncio.apply()

# Import ngrok for exposing local server (optional)
if IN_COLAB:
    from pyngrok import ngrok

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)
logging.info("Logging configured successfully")

# Mount Google Drive if running in Colab
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

# Set up base directories using pathlib
from pathlib import Path

base_dir = Path("/content/drive/MyDrive/nba_models") if IN_COLAB else Path("nba_models")
cache_dir = base_dir / "cache"
cache_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist

2024-11-13 19:13:53,188 - INFO - Logging configured successfully


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [165]:
# Cell 3: Configuration Class
# This class stores all the configuration parameters used throughout the notebook.

class NotebookConfig:
    # Model parameters for XGBoost and LightGBM
    XGB_PARAMS = {
        'objective': 'binary:logistic',
        'random_state': 42,
        'n_estimators': 100,
        'eval_metric': 'logloss',
        'max_depth': 6,
        'learning_rate': 0.1
    }

    LGB_PARAMS = {
        'objective': 'binary',
        'random_state': 42,
        'n_estimators': 100,
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.1
    }

    # Training parameters
    VALIDATION_WINDOW = 60  # Days
    RETRAIN_FREQUENCY = 7   # Days
    MIN_TRAINING_SAMPLES = 1000
    ROLLING_WINDOWS = [5, 10, 20]  # For rolling statistics
    HEAD_TO_HEAD_WINDOW = 10  # Games

    # Data parameters
    START_SEASON = '2015-16'
    RANDOM_SEED = 42

    # API Configuration for fetching odds data
    ODDS_API_KEY = '55bc8ab276ec29e4a474114a4eccb463'  # Replace with your actual API key
    ODDS_API_URL = 'https://api.the-odds-api.com/v4/sports/basketball_nba/odds'
    ODDS_API_HISTORICAL_URL = 'https://api.the-odds-api.com/v4/sports/basketball_nba/odds-history'
    REGIONS = 'us'
    MARKETS = 'h2h'
    ODDS_FORMAT = 'american'
    BOOKMAKERS = 'fanduel'
    BATCH_SIZE = 10

    # Paths for models and cache directories
    MODELS_DIR = base_dir
    CACHE_DIR = cache_dir

    # Betting parameters
    MIN_KELLY_FRACTION = 0.1
    MAX_KELLY_FRACTION = 0.5
    MIN_CONFIDENCE = 0.6

In [166]:
# Cell 4: Data Manager Class
# This class handles data saving, loading, and cache validation.

class DataManager:
    def __init__(self):
        self.config = NotebookConfig()

    def get_cache_path(self, filename):
        """Get the full path to the cache file."""
        return self.config.CACHE_DIR / filename

    def save_data(self, data, filename):
        """Save data to a cache file using pickle."""
        cache_path = self.get_cache_path(filename)
        with open(cache_path, 'wb') as f:
            pickle.dump(data, f)
        logging.info(f"Data saved to {cache_path}")

    def load_data(self, filename):
        """Load data from a cache file."""
        cache_path = self.get_cache_path(filename)
        if cache_path.exists():
            with open(cache_path, 'rb') as f:
                data = pickle.load(f)
            logging.info(f"Data loaded from {cache_path}")
            return data
        return None

    def is_cache_fresh(self, filename, max_age_hours=24):
        """Check if the cache file is fresh within the specified hours."""
        cache_path = self.get_cache_path(filename)
        try:
            if not cache_path.exists():
                return False
            file_time = datetime.fromtimestamp(cache_path.stat().st_mtime)
            return (datetime.now() - file_time).total_seconds() < max_age_hours * 3600
        except Exception as e:
            logging.error(f"Error checking cache freshness: {e}")
            return False


2024-11-13 19:05:46,565 - INFO - Data saved to /content/drive/MyDrive/nba_models/cache/test_cache.pkl
2024-11-13 19:05:46,575 - INFO - Data loaded from /content/drive/MyDrive/nba_models/cache/test_cache.pkl
2024-11-13 19:05:46,578 - INFO - DataManager test passed.


In [167]:
# Cell 5: Data Fetching Functions
# These functions handle data fetching from the NBA API with retry logic.

def fetch_nba_data_with_retry():
    """Fetch NBA game data with retry logic."""
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)

    try:
        logging.info("Fetching all NBA game data...")
        gamefinder = leaguegamefinder.LeagueGameFinder()
        all_games = gamefinder.get_data_frames()[0]
        logging.info(f"Fetched {len(all_games)} games.")
        return all_games
    except Exception as e:
        logging.error(f"Error fetching NBA data: {e}")
        return pd.DataFrame()

def fetch_or_load_data():
    """Fetch data from API or load from cache."""
    data_manager = DataManager()
    cache_filename = 'nba_game_data.pkl'

    # Check if cached data is fresh
    if data_manager.is_cache_fresh(cache_filename):
        cached_data = data_manager.load_data(cache_filename)
        if cached_data is not None:
            # Ensure GAME_DATE is in datetime format
            cached_data['GAME_DATE'] = pd.to_datetime(cached_data['GAME_DATE'])

            # Check if new data is available
            latest_game_date = cached_data['GAME_DATE'].max()

            # Calculate difference in days
            if (datetime.now() - latest_game_date).days < 1:
                logging.info("Cache is fresh, loading data from cache.")
                return cached_data
            else:
                logging.info("Cache is stale, fetching new data.")
        else:
            logging.info("No cached data found, fetching fresh data.")

    # Fetch new data if cache is unavailable or stale
    game_data = fetch_nba_data_with_retry()

    # Check if data was fetched successfully before saving
    if game_data is not None and not game_data.empty:
        data_manager.save_data(game_data, cache_filename)
        logging.info(f"New data fetched and saved to cache with {len(game_data)} games.")
    else:
        logging.error("Failed to fetch new game data; not saving to cache.")

    return game_data

In [164]:
# Cell 6: Data Processor Class
# This class handles data processing and feature engineering.

class DataProcessor:
    def __init__(self):
        self.config = NotebookConfig()
        self.date_odds_cache = {}  # Cache odds data by date
        self.batch_odds_data = {}  # Temporary store for batch processing

    def prepare_features(self, df):
        """Prepare features for model training/prediction."""
        try:
            # Create a copy to avoid modifying original
            features = df.copy()

            # Drop non-feature columns
            drop_columns = [
                'TEAM_NAME', 'MATCHUP', 'WL', 'TEAM_ABBREVIATION',
                'OPPONENT_ABBREV', 'HOME_TEAM', 'AWAY_TEAM'
            ]
            features = features.drop([col for col in drop_columns if col in features.columns], axis=1)

            # Convert categorical variables to numeric
            categorical_columns = ['TEAM_ID', 'OPPONENT_TEAM_ID', 'HOME_TEAM_ID', 'AWAY_TEAM_ID']
            for col in categorical_columns:
                if col in features.columns:
                    features[col] = features[col].astype('category').cat.codes

            # Create game importance features
            features['is_division_game'] = (features['TEAM_ID'] // 10 == features['OPPONENT_TEAM_ID'] // 10).astype(int)
            features['month_number'] = pd.to_datetime(features['GAME_DATE']).dt.month
            features['is_playoff_month'] = (features['month_number'] >= 4).astype(int)

            # Handle missing values
            numeric_columns = features.select_dtypes(include=['float64', 'int64']).columns
            for col in numeric_columns:
                features[col] = features[col].fillna(features[col].mean())

            # Create interaction features
            features['pts_per_min'] = features['PTS'] / features['MIN']
            features['ast_to_tov'] = features['AST'] / (features['TOV'] + 1)  # Add 1 to avoid division by zero
            features['fg_efficiency'] = features['FG_PCT'] * features['FGA']

            # Create relative performance metrics
            for stat in ['PTS', 'REB', 'AST']:
                if f'{stat}_avg_10' in features.columns:
                    features[f'{stat}_rel_performance'] = features[stat] / features[f'{stat}_avg_10']

            # Normalize certain features
            features['normalized_plus_minus'] = features['PLUS_MINUS'] / features['MIN']
            features['usage_rate'] = (features['FGA'] + 0.44 * features['FTA'] + features['TOV']) / features['MIN']

            # Sort features by date for time-series consistency
            if 'GAME_DATE' in features.columns:
                features = features.sort_values('GAME_DATE')

            # Remove any remaining non-numeric columns except GAME_DATE and WIN
            non_numeric_cols = features.select_dtypes(exclude=['float64', 'int64']).columns
            keep_cols = ['GAME_DATE', 'WIN']
            drop_cols = [col for col in non_numeric_cols if col not in keep_cols]
            features = features.drop(drop_cols, axis=1)

            return features

        except Exception as e:
            logging.error(f"Error in prepare_features: {str(e)}")
            raise



    def preprocess_game_data(self, df):
        """Preprocess the raw game data with improved NaN handling."""
        try:
            logging.info(f"Initial data shape: {df.shape}")
            logging.info(f"Initial columns: {df.columns}")

            df = df.copy()

            # Create WIN column from WL
            if 'WL' in df.columns:
                df['WIN'] = df['WL'].apply(lambda x: 1 if x == 'W' else 0)
                logging.info("Created WIN column from WL")
            else:
                logging.error("WL column not found in data")
                return pd.DataFrame()

            # Convert GAME_DATE to datetime
            df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])
            logging.info(f"After date conversion shape: {df.shape}")

            # Sort data
            df = df.sort_values(['TEAM_ID', 'GAME_DATE'])

            # Map team abbreviations
            from nba_api.stats.static import teams
            nba_teams = teams.get_teams()
            team_abbrev_to_id = {team['abbreviation']: team['id'] for team in nba_teams}
            team_id_to_abbrev = {team['id']: team['abbreviation'] for team in nba_teams}

            df['TEAM_ABBREVIATION'] = df['TEAM_ID'].map(team_id_to_abbrev)
            logging.info(f"After team abbreviation mapping shape: {df.shape}")

            # Handle home/away determination
            df['HOME_AWAY'] = df['MATCHUP'].apply(lambda x: 'HOME' if 'vs.' in x else 'AWAY')
            df['OPPONENT_ABBREV'] = df['MATCHUP'].apply(lambda x: x.split(' ')[-1])

            # Create HOME_TEAM and AWAY_TEAM columns
            def get_home_team(row):
                return row['TEAM_ABBREVIATION'] if row['HOME_AWAY'] == 'HOME' else row['OPPONENT_ABBREV']

            def get_away_team(row):
                return row['TEAM_ABBREVIATION'] if row['HOME_AWAY'] == 'AWAY' else row['OPPONENT_ABBREV']

            df['HOME_TEAM'] = df.apply(get_home_team, axis=1)
            df['AWAY_TEAM'] = df.apply(get_away_team, axis=1)

            # Map team IDs
            df['OPPONENT_TEAM_ID'] = df['OPPONENT_ABBREV'].map(team_abbrev_to_id)
            df['HOME_TEAM_ID'] = df['HOME_TEAM'].map(team_abbrev_to_id)
            df['AWAY_TEAM_ID'] = df['AWAY_TEAM'].map(team_abbrev_to_id)

            # Handle missing opponent team IDs
            missing_opponents = df[df['OPPONENT_TEAM_ID'].isna()]['OPPONENT_ABBREV'].unique()
            if len(missing_opponents) > 0:
                logging.warning(f"Missing opponent team IDs for: {missing_opponents}")

            # Drop rows with missing opponent team IDs
            df = df.dropna(subset=['OPPONENT_TEAM_ID'])
            logging.info(f"After dropping missing opponents shape: {df.shape}")

            # Fill missing values in numeric columns
            numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
            for col in numeric_cols:
                df[col] = df[col].fillna(df[col].mean())

            # Convert IDs to integers with proper NaN handling
            id_columns = ['TEAM_ID', 'OPPONENT_TEAM_ID', 'HOME_TEAM_ID', 'AWAY_TEAM_ID']
            for col in id_columns:
                df[col] = df[col].astype('float').fillna(-1).astype(int)

            # Add features
            logging.info("Adding fatigue features...")
            df = self.add_fatigue_features(df)

            logging.info("Adding advanced features...")
            df = self.add_advanced_features(df)

            logging.info("Adding rolling stats...")
            for window in self.config.ROLLING_WINDOWS:
                df = self.add_rolling_stats(df, window)

            logging.info("Adding head-to-head features...")
            df = self.add_head_to_head_features(df)

            logging.info("Adding streak features...")
            df = self.add_streak_features(df)

            logging.info("Adding time-based features...")
            df = self.add_time_based_features(df)

            # Final NaN check and handling
            df = df.fillna(method='ffill').fillna(method='bfill').fillna(0)
            logging.info(f"Final shape: {df.shape}")

            return df

        except Exception as e:
            logging.error(f"Error in preprocess_game_data: {str(e)}")
            import traceback
            logging.error(traceback.format_exc())
            return pd.DataFrame()

    def add_advanced_features(self, df):
        """Add advanced statistical features."""
        try:
            # Create a boolean for home games
            df['is_home_game'] = df['HOME_AWAY'] == 'HOME'

            # Create separate DataFrames for home and away games
            home_games = df[df['is_home_game']].copy()
            away_games = df[~df['is_home_game']].copy()

            # Initialize columns with zeros
            df['home_win_rate'] = 0.0
            df['away_win_rate'] = 0.0

            # Calculate win rates separately for home and away games
            for team_id in df['TEAM_ID'].unique():
                # Home win rate
                team_home = home_games[home_games['TEAM_ID'] == team_id]
                if not team_home.empty:
                    win_rate = team_home['WIN'].rolling(10, min_periods=1).mean()
                    df.loc[team_home.index, 'home_win_rate'] = win_rate

                # Away win rate
                team_away = away_games[away_games['TEAM_ID'] == team_id]
                if not team_away.empty:
                    win_rate = team_away['WIN'].rolling(10, min_periods=1).mean()
                    df.loc[team_away.index, 'away_win_rate'] = win_rate

            # Fill any remaining NaN values
            df['home_win_rate'] = df['home_win_rate'].fillna(0.5)
            df['away_win_rate'] = df['away_win_rate'].fillna(0.5)

            # Difference between home and away win rates
            df['home_road_diff'] = df['home_win_rate'] - df['away_win_rate']

            # Head-to-head performance with minimum periods=1
            df['h2h_wins'] = df.groupby(['TEAM_ID', 'OPPONENT_TEAM_ID'])['WIN'].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().fillna(0.5)
            )

            # Recent form and trends with minimum periods=1
            df['recent_form'] = df.groupby('TEAM_ID')['WIN'].transform(
                lambda x: x.rolling(window=10, min_periods=1).mean().fillna(0.5)
            )

            df['pts_diff_trend'] = df.groupby('TEAM_ID')['PLUS_MINUS'].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().fillna(0)
            )

            # Offensive and defensive ratings with minimum periods=1
            df['off_rating'] = df.groupby('TEAM_ID')['PTS'].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().fillna(x.mean())
            )

            df['def_rating'] = df.groupby('OPPONENT_TEAM_ID')['PTS'].transform(
                lambda x: x.rolling(window=5, min_periods=1).mean().fillna(x.mean())
            )

            # Scoring consistency with minimum periods=1
            df['scoring_consistency'] = df.groupby('TEAM_ID')['PTS'].transform(
                lambda x: x.rolling(window=5, min_periods=1).std().fillna(0)
            )

            return df
        except Exception as e:
            logging.error(f"Error in add_advanced_features: {str(e)}")
            return df


    def add_rolling_stats(self, df, window):
        """Add rolling statistics for a given window size."""
        try:
            stats_columns = ['PTS', 'REB', 'AST', 'PLUS_MINUS', 'FG_PCT', 'FT_PCT', 'FG3_PCT']

            for col in stats_columns:
                # Calculate rolling statistics with min_periods=1
                df[f'{col}_avg_{window}'] = df.groupby('TEAM_ID')[col].transform(
                    lambda x: x.rolling(window=window, min_periods=1).mean().fillna(x.mean())
                )

                df[f'{col}_std_{window}'] = df.groupby('TEAM_ID')[col].transform(
                    lambda x: x.rolling(window=window, min_periods=1).std().fillna(0)
                )

                df[f'{col}_max_{window}'] = df.groupby('TEAM_ID')[col].transform(
                    lambda x: x.rolling(window=window, min_periods=1).max().fillna(x.mean())
                )

            return df
        except Exception as e:
            logging.error(f"Error in add_rolling_stats: {str(e)}")
            return df


    def add_head_to_head_features(self, df):
        """Add head-to-head matchup features."""
        try:
            # Calculate h2h win rate with minimum periods=1 and fill NaN with 0.5
            df['h2h_win_rate'] = df.groupby(['TEAM_ID', 'OPPONENT_TEAM_ID'])['WIN'].transform(
                lambda x: x.rolling(window=self.config.HEAD_TO_HEAD_WINDOW, min_periods=1).mean().fillna(0.5)
            )

            # Calculate point differential with minimum periods=1 and fill NaN with 0
            df['h2h_point_diff'] = df.groupby(['TEAM_ID', 'OPPONENT_TEAM_ID'])['PLUS_MINUS'].transform(
                lambda x: x.rolling(window=self.config.HEAD_TO_HEAD_WINDOW, min_periods=1).mean().fillna(0)
            )

            return df
        except Exception as e:
            logging.error(f"Error in add_head_to_head_features: {str(e)}")
            return df

    def add_fatigue_features(self, df):
        """Add fatigue and travel-related features."""
        # Ensure dates are in datetime format (redundant safety check)
        if not pd.api.types.is_datetime64_any_dtype(df['GAME_DATE']):
            df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

        # Sort by team and date
        df = df.sort_values(['TEAM_ID', 'GAME_DATE']).copy()

        # Calculate days between games
        df['days_since_last'] = df.groupby('TEAM_ID')['GAME_DATE'].diff().dt.days.fillna(0)

        # Calculate games in last 7 days using vectorized operations
        def calculate_rolling_games(group):
            # Create a Series with the game dates
            dates = pd.Series(group['GAME_DATE'])
            counts = []

            for current_date in dates:
                # Count games in previous 7 days (excluding current game)
                count = ((dates < current_date) &
                        (dates >= current_date - pd.Timedelta(days=7))).sum()
                counts.append(count)

            return counts

        # Apply the calculation to each team
        df['games_last_7d'] = df.groupby('TEAM_ID').apply(
            calculate_rolling_games
        ).explode().values

        # Calculate back-to-back games
        df['is_back_to_back'] = (df['days_since_last'] == 1).astype(int)

        return df

    def add_streak_features(self, df):
        """Add streak-related features."""
        try:
            def get_streak(series):
                streak = 0
                streaks = []
                for val in series:
                    if val == 1:
                        streak = streak + 1 if streak >= 0 else 1
                    else:
                        streak = streak - 1 if streak <= 0 else -1
                    streaks.append(streak)
                return pd.Series(streaks, index=series.index)

            # Calculate streaks
            df['streak'] = df.groupby('TEAM_ID')['WIN'].transform(
                lambda x: get_streak(x)
            )

            # Calculate momentum with minimum periods=1
            weights = np.array([0.35, 0.25, 0.20, 0.15, 0.05])
            def weighted_momentum(series):
                return series.rolling(5, min_periods=1).apply(
                    lambda x: np.sum(weights[-len(x):] * x) / np.sum(weights[-len(x):])
                ).fillna(0.5)

            df['momentum'] = df.groupby('TEAM_ID')['WIN'].transform(weighted_momentum)

            return df
        except Exception as e:
            logging.error(f"Error in add_streak_features: {str(e)}")
            return df

    def add_time_based_features(self, df):
        """Add time-based features such as season phase and rest days."""
        try:
            logging.info("Starting add_time_based_features")
            logging.info(f"Initial shape: {df.shape}")

            # Season phase (regular season or playoffs)
            df['month'] = df['GAME_DATE'].dt.month
            df['season_phase'] = df['month'].apply(
                lambda x: 'playoffs' if x >= 4 else 'regular'
            )

            # Encode season phase
            le = LabelEncoder()
            df['season_phase'] = le.fit_transform(df['season_phase'])

            # Rest days between games
            df['rest_days'] = df.groupby('TEAM_ID')['GAME_DATE'].diff().dt.days.fillna(0)

            logging.info(f"Final shape after time-based features: {df.shape}")
            return df

        except Exception as e:
            logging.error(f"Error in add_time_based_features: {str(e)}")
            return df



Starting script execution...


In [185]:
class ModelTrainer:
    def __init__(self):
        self.config = NotebookConfig()
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')

    def evaluate_model(self, model, X_test, y_test, model_name):
        """Evaluate model with proper error handling."""
        try:
            if hasattr(model, 'predict_proba'):
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
            else:
                y_pred = model.predict(X_test)
                y_pred = (y_pred > 0.5).astype(int)
                y_pred_proba = y_pred

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred_proba)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)

            logging.info(f"{model_name} Evaluation:")
            logging.info(f"  Accuracy: {acc:.4f}")
            logging.info(f"  F1 Score: {f1:.4f}")
            logging.info(f"  ROC AUC: {roc_auc:.4f}")
            logging.info(f"  Precision: {precision:.4f}")
            logging.info(f"  Recall: {recall:.4f}")

            cm = confusion_matrix(y_test, y_pred)
            logging.info(f"  Confusion Matrix:")
            logging.info(f"    TN: {cm[0,0]}, FP: {cm[0,1]}")
            logging.info(f"    FN: {cm[1,0]}, TP: {cm[1,1]}")

        except Exception as e:
            logging.error(f"Error evaluating {model_name}: {str(e)}")

    def preprocess_features(self, X_train, X_val, X_test):
        """Preprocess features with improved scaling and imputation."""
        # Convert to DataFrame if not already
        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)
            X_val = pd.DataFrame(X_val)
            X_test = pd.DataFrame(X_test)

        feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
        X_train.columns = feature_names
        X_val.columns = feature_names
        X_test.columns = feature_names

        # First handle missing values
        X_train_imputed = self.imputer.fit_transform(X_train)
        X_val_imputed = self.imputer.transform(X_val)
        X_test_imputed = self.imputer.transform(X_test)

        # Then scale the data
        X_train_scaled = self.scaler.fit_transform(X_train_imputed)
        X_val_scaled = self.scaler.transform(X_val_imputed)
        X_test_scaled = self.scaler.transform(X_test_imputed)

        # Convert back to DataFrame
        X_train_df = pd.DataFrame(X_train_scaled, columns=feature_names)
        X_val_df = pd.DataFrame(X_val_scaled, columns=feature_names)
        X_test_df = pd.DataFrame(X_test_scaled, columns=feature_names)

        return X_train_df, X_val_df, X_test_df

    def train_baseline_model(self, X_train, y_train):
        """Train a more robust baseline model with balanced class weights."""
        model = LogisticRegression(
            class_weight='balanced',
            penalty='l1',
            solver='liblinear',
            random_state=self.config.RANDOM_SEED,
            max_iter=1000
        )
        model.fit(X_train, y_train)
        return model

    def optimize_xgboost_params(self, X_train, y_train, X_val, y_val):
        """Optimize XGBoost with improved parameters."""
        def objective(trial):
            params = {
                'max_depth': trial.suggest_int('max_depth', 2, 6),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
                'subsample': trial.suggest_float('subsample', 0.6, 0.9),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
                'gamma': trial.suggest_float('gamma', 0, 2),
                'scale_pos_weight': sum(y_train == 0) / sum(y_train == 1),
                'objective': 'binary:logistic',
                'tree_method': 'exact',
                'random_state': self.config.RANDOM_SEED,
                'missing': np.nan  # Explicitly handle missing values
            }

            model = xgb.XGBClassifier(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_val)[:, 1]
            return roc_auc_score(y_val, y_pred)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=20)

        return study.best_params

    def optimize_lightgbm_params(self, X_train, y_train, X_val, y_val):
        """Optimize LightGBM with improved parameters."""
        def objective(trial):
            params = {
                'num_leaves': trial.suggest_int('num_leaves', 8, 32),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
                'subsample': trial.suggest_float('subsample', 0.6, 0.9),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
                'objective': 'binary',
                'metric': 'auc',
                'deterministic': True,
                'force_row_wise': True,
                'min_data_in_leaf': 20,
                'scale_pos_weight': sum(y_train == 0) / sum(y_train == 1),
                'random_state': self.config.RANDOM_SEED
            }

            model = lgb.LGBMClassifier(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_val)[:, 1]
            return roc_auc_score(y_val, y_pred)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=20)

        return study.best_params

    def create_neural_network(self, input_dim):
        """Create a neural network with proper input dimension handling."""
        try:
            if input_dim <= 0:
                raise ValueError(f"Invalid input dimension: {input_dim}")

            model = Sequential([
                Dense(64, input_shape=(input_dim,), activation='relu',
                      kernel_regularizer=l2(0.01)),
                BatchNormalization(),
                Dropout(0.3),

                Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
                BatchNormalization(),
                Dropout(0.2),

                Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
                BatchNormalization(),
                Dropout(0.1),

                Dense(1, activation='sigmoid')
            ])

            model.compile(
                optimizer=Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )

            return model
        except Exception as e:
            logging.error(f"Error creating neural network: {str(e)}")
            raise

    def train_models(self, X_train, X_val, X_test, y_train, y_val, y_test):
        """Train all models with improved error handling and class weight calculation."""
        try:
            logging.info("Preprocessing features...")

            # Validate input data
            if X_train.shape[0] == 0 or X_train.shape[1] == 0:
                raise ValueError(f"Invalid training data shape: {X_train.shape}")

            # Preprocess the data
            X_train_processed, X_val_processed, X_test_processed = self.preprocess_features(
                X_train, X_val, X_test
            )

            # Convert targets to proper format and numpy arrays
            y_train = y_train.astype(int).values
            y_val = y_val.astype(int).values
            y_test = y_test.astype(int).values

            # Initialize models list
            trained_models = []

            # Train baseline model
            logging.info("Training baseline Logistic Regression model...")
            baseline_model = self.train_baseline_model(X_train_processed, y_train)
            trained_models.append(baseline_model)
            self.evaluate_model(baseline_model, X_test_processed, y_test, "Baseline Logistic Regression")

            # Train XGBoost
            logging.info("Training XGBoost model...")
            xgb_params = self.optimize_xgboost_params(X_train_processed, y_train, X_val_processed, y_val)
            xgb_model = xgb.XGBClassifier(**xgb_params)
            xgb_model.fit(X_train_processed, y_train)
            trained_models.append(xgb_model)
            self.evaluate_model(xgb_model, X_test_processed, y_test, "XGBoost")

            # Train LightGBM
            logging.info("Training LightGBM model...")
            lgb_params = self.optimize_lightgbm_params(X_train_processed, y_train, X_val_processed, y_val)
            lgb_model = lgb.LGBMClassifier(**lgb_params)
            lgb_model.fit(X_train_processed, y_train)
            trained_models.append(lgb_model)
            self.evaluate_model(lgb_model, X_test_processed, y_test, "LightGBM")

            # Train Neural Network
            logging.info("Training Neural Network model...")
            input_dim = X_train_processed.shape[1]
            if input_dim <= 0:
                raise ValueError(f"Invalid input dimension for neural network: {input_dim}")

            logging.info(f"Creating neural network with input dimension: {input_dim}")
            nn_model = self.create_neural_network(input_dim)

            # Calculate class weights properly using numpy
            n_negative = np.sum(y_train == 0)
            n_positive = np.sum(y_train == 1)
            class_weights = {
                0: 1.0,
                1: (n_negative / n_positive) if n_positive > 0 else 1.0
            }

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.0001)
            ]

            nn_model.fit(
                X_train_processed, y_train,
                validation_data=(X_val_processed, y_val),
                epochs=200,
                batch_size=32,
                callbacks=callbacks,
                class_weight=class_weights,
                verbose=1
            )
            trained_models.append(nn_model)
            self.evaluate_model(nn_model, X_test_processed, y_test, "Neural Network")

            return trained_models

        except Exception as e:
            logging.error(f"Error in train_models: {str(e)}")
            logging.error(f"Error details - X_train shape: {X_train.shape if hasattr(X_train, 'shape') else 'No shape'}")
            logging.error(f"Stack trace: {traceback.format_exc()}")
            raise

In [180]:
# Cell 8: Game Predictor Class
# This class handles making predictions using the trained models.

class GamePredictor:
    def __init__(self, models, scaler, processor):
        self.baseline_model, self.xgb_model, self.lgb_model, self.nn_model = models
        self.scaler = scaler
        self.processor = processor
        self.config = NotebookConfig()

        # Cache team data
        self.nba_teams = teams.get_teams()
        self.team_abbrev_to_id = {team['abbreviation']: team['id'] for team in self.nba_teams}

        # Define core feature columns
        self.core_features = [
            'FG_PCT', 'FG3_PCT', 'FT_PCT', 'AST',
            'REB', 'TOV', 'PLUS_MINUS', 'pts_per_min',
            'ast_to_tov', 'fg_efficiency'
        ]

    def prepare_sample_game_data(self, team_abbrev, opponent_abbrev, game_date=None, is_home=True):
        """Create properly formatted sample game data."""
        if game_date is None:
            game_date = datetime.now()

        # Get team IDs
        team_id = self.team_abbrev_to_id.get(team_abbrev)
        opponent_id = self.team_abbrev_to_id.get(opponent_abbrev)

        if not team_id or not opponent_id:
            raise ValueError(f"Invalid team abbreviation: {team_abbrev if not team_id else opponent_abbrev}")

        # Create matchup string
        matchup = f"{team_abbrev} vs. {opponent_abbrev}" if is_home else f"{team_abbrev} @ {opponent_abbrev}"

        # Create sample game data with all required fields
        sample_game = pd.DataFrame({
            'GAME_DATE': [game_date],
            'TEAM_ID': [team_id],
            'TEAM_ABBREVIATION': [team_abbrev],
            'TEAM_NAME': [next((team['full_name'] for team in self.nba_teams if team['id'] == team_id), '')],
            'OPPONENT_TEAM_ID': [opponent_id],
            'OPPONENT_ABBREV': [opponent_abbrev],
            'MATCHUP': [matchup],
            'WL': ['W'],  # Placeholder
            'MIN': [240],  # Standard game length
            'PTS': [100],  # Placeholder stats
            'FGM': [40],
            'FGA': [80],
            'FG_PCT': [0.500],
            'FG3M': [10],
            'FG3A': [25],
            'FG3_PCT': [0.400],
            'FTM': [10],
            'FTA': [15],
            'FT_PCT': [0.667],
            'OREB': [10],
            'DREB': [30],
            'REB': [40],
            'AST': [25],
            'STL': [8],
            'BLK': [5],
            'TOV': [12],
            'PF': [20],
            'PLUS_MINUS': [0]
        })

        return sample_game

    def predict_game(self, team_abbrev, opponent_abbrev, game_date=None, is_home=True):
        """Make ensemble prediction for a single game."""
        try:
            # Prepare properly formatted game data
            game_data = self.prepare_sample_game_data(team_abbrev, opponent_abbrev, game_date, is_home)

            # Process game data
            processed_data = self.processor.preprocess_game_data(game_data)
            if processed_data.empty:
                raise ValueError("No valid processed data")

            # Calculate derived features
            processed_data['pts_per_min'] = processed_data['PTS'] / processed_data['MIN']
            processed_data['ast_to_tov'] = processed_data['AST'] / (processed_data['TOV'] + 1)
            processed_data['fg_efficiency'] = processed_data['FG_PCT'] * processed_data['FGA']

            # Select only the core features used in training
            X = processed_data[self.core_features]

            # Scale features
            X_scaled = self.scaler.transform(X)

            # Get predictions from each model
            baseline_pred = self.baseline_model.predict_proba(X_scaled)[:, 1]
            xgb_pred = self.xgb_model.predict_proba(X_scaled)[:, 1]
            lgb_pred = self.lgb_model.predict_proba(X_scaled)[:, 1]
            nn_pred = self.nn_model.predict(X_scaled).ravel()

            # Ensemble prediction (weighted average)
            weights = [0.2, 0.3, 0.3, 0.2]  # Adjustable weights for each model
            ensemble_pred = (
                weights[0] * baseline_pred +
                weights[1] * xgb_pred +
                weights[2] * lgb_pred +
                weights[3] * nn_pred
            )

            return {
                'prediction': float(ensemble_pred[0]),
                'confidence': float(np.abs(ensemble_pred[0] - 0.5) * 2),
                'model_predictions': {
                    'baseline': float(baseline_pred[0]),
                    'xgboost': float(xgb_pred[0]),
                    'lightgbm': float(lgb_pred[0]),
                    'neural_network': float(nn_pred[0])
                }
            }

        except Exception as e:
            logging.error(f"Error in predict_game: {str(e)}")
            raise


2024-11-13 19:16:43,909 - INFO - Initial data shape: (1, 28)
2024-11-13 19:16:43,912 - INFO - Initial columns: Index(['GAME_DATE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
       'OPPONENT_TEAM_ID', 'OPPONENT_ABBREV', 'MATCHUP', 'WL', 'MIN', 'PTS',
       'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PLUS_MINUS'],
      dtype='object')
2024-11-13 19:16:43,917 - INFO - Created WIN column from WL
2024-11-13 19:16:43,920 - INFO - After date conversion shape: (1, 29)
2024-11-13 19:16:43,931 - INFO - After team abbreviation mapping shape: (1, 29)
2024-11-13 19:16:43,947 - INFO - After dropping missing opponents shape: (1, 34)
2024-11-13 19:16:43,964 - INFO - Adding fatigue features...
2024-11-13 19:16:43,978 - INFO - Adding advanced features...
2024-11-13 19:16:44,001 - INFO - Adding rolling stats...


[LightGBM] [Info] Number of positive: 57, number of negative: 43
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 350
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.570000 -> initscore=0.281851
[LightGBM] [Info] Start training from score 0.281851


2024-11-13 19:16:44,140 - INFO - Adding head-to-head features...
2024-11-13 19:16:44,149 - INFO - Adding streak features...
2024-11-13 19:16:44,159 - INFO - Adding time-based features...
2024-11-13 19:16:44,161 - INFO - Starting add_time_based_features
2024-11-13 19:16:44,164 - INFO - Initial shape: (1, 114)
2024-11-13 19:16:44,172 - INFO - Final shape after time-based features: (1, 117)
2024-11-13 19:16:44,192 - INFO - Final shape: (1, 117)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step


2024-11-13 19:16:44,369 - INFO - GamePredictor test passed. Prediction: {'prediction': 0.5901043924233957, 'confidence': 0.18020878484679148, 'model_predictions': {'baseline': 1.0, 'xgboost': 0.5150402784347534, 'lightgbm': 0.11864098990680226, 'neural_network': 1.0}}


In [181]:
# Cell 9: FastAPI Setup
# Setting up a local FastAPI application for making predictions via API.

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class GameData(BaseModel):
    TEAM_ABBREVIATION: str
    OPPONENT_ABBREV: str
    GAME_DATE: str  # Date in 'YYYY-MM-DD' format
    MATCHUP: str
    WL: str  # Placeholder

@app.post("/predict")
async def predict_game_endpoint(game_data: GameData):
    """API endpoint for predicting a game's outcome."""
    try:
        input_df = pd.DataFrame([game_data.dict()])
        predictor = GamePredictor(models, scaler, processor)
        prediction = predictor.predict_game(input_df)
        return prediction
    except Exception as e:
        return {"error": str(e)}

In [193]:
# Cell 10: Interactive Interface
# Creating an interactive widget interface for making predictions in the notebook.

def get_valid_team_abbreviations():
    """Get list of valid NBA team abbreviations."""
    return [
        'ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
        'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
        'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS'
    ]

def validate_team_abbreviation(abbrev):
    """Validate team abbreviation."""
    valid_teams = get_valid_team_abbreviations()
    if abbrev not in valid_teams:
        raise ValueError(
            f"Invalid team abbreviation: {abbrev}\n"
            f"Valid abbreviations are: {', '.join(valid_teams)}"
        )
    return True

class ModelManager:
    def __init__(self):
        self.config = NotebookConfig()
        self.data_manager = DataManager()
        self.processor = DataProcessor()
        self.trainer = ModelTrainer()

    def train_and_save_models(self):
        """Train and save all models."""
        try:
            # Fetch and process data
            logging.info("Fetching NBA data...")
            game_data = fetch_or_load_data()
            if game_data.empty:
                raise ValueError("No game data available.")

            # Preprocess data
            logging.info("Preprocessing game data...")
            processed_data = self.processor.preprocess_game_data(game_data)

            # Calculate core features
            processed_data['pts_per_min'] = processed_data['PTS'] / processed_data['MIN']
            processed_data['ast_to_tov'] = processed_data['AST'] / (processed_data['TOV'] + 1)
            processed_data['fg_efficiency'] = processed_data['FG_PCT'] * processed_data['FGA']

            # Select core features
            core_features = [
                'FG_PCT', 'FG3_PCT', 'FT_PCT', 'AST',
                'REB', 'TOV', 'PLUS_MINUS', 'pts_per_min',
                'ast_to_tov', 'fg_efficiency'
            ]

            X = processed_data[core_features]
            y = processed_data['WIN']

            # Split data
            train_size = int(0.7 * len(X))
            val_size = int(0.15 * len(X))

            X_train = X[:train_size]
            y_train = y[:train_size]
            X_val = X[train_size:train_size + val_size]
            y_val = y[train_size:train_size + val_size]
            X_test = X[train_size + val_size:]
            y_test = y[train_size + val_size:]

            # Train models
            logging.info("Training models...")
            models = self.trainer.train_models(X_train, X_val, X_test, y_train, y_val, y_test)
            scaler = self.trainer.scaler

            # Save models
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            self.save_models(models, scaler, timestamp)

            return models, scaler

        except Exception as e:
            logging.error(f"Error in train_and_save_models: {str(e)}")
            raise

    def save_models(self, models, scaler, timestamp):
        """Save all models and scaler."""
        try:
            # Create models directory if it doesn't exist
            os.makedirs(self.config.MODELS_DIR, exist_ok=True)

            # Save individual models
            logging.info("Saving baseline model...")
            joblib.dump(models[0], f'{self.config.MODELS_DIR}/baseline_model_{timestamp}.joblib')

            logging.info("Saving XGBoost model...")
            models[1].save_model(f'{self.config.MODELS_DIR}/xgb_model_{timestamp}.json')

            logging.info("Saving LightGBM model...")
            # Save LightGBM model directly using the booster
            if hasattr(models[2], '_Booster'):
                models[2]._Booster.save_model(f'{self.config.MODELS_DIR}/lgb_model_{timestamp}.txt')
            else:
                models[2].booster_.save_model(f'{self.config.MODELS_DIR}/lgb_model_{timestamp}.txt')

            logging.info("Saving Neural Network model...")
            models[3].save(f'{self.config.MODELS_DIR}/nn_model_{timestamp}.keras')

            logging.info("Saving scaler...")
            with open(f'{self.config.MODELS_DIR}/scaler_{timestamp}.pkl', 'wb') as f:
                pickle.dump(scaler, f)

            logging.info(f"Models saved with timestamp: {timestamp}")

        except Exception as e:
            logging.error(f"Error saving models: {str(e)}")
            logging.error(f"Full error: {traceback.format_exc()}")
            raise

    def load_most_recent_models(self):
        """Load the most recently saved models."""
        try:
            models_dir = Path(self.config.MODELS_DIR)
            if not models_dir.exists():
                logging.info(f"Models directory not found: {models_dir}")
                return

            # Get all model files
            model_files = list(models_dir.glob("baseline_model_*.joblib"))
            if not model_files:
                logging.info("No model files found")
                return

            # Extract timestamps and get the most recent one
            timestamps = []
            for file in model_files:
                try:
                    timestamp = file.stem.split('baseline_model_')[1]
                    timestamps.append(timestamp)
                except Exception as e:
                    logging.warning(f"Couldn't parse timestamp from file: {file}")
                    continue

            if not timestamps:
                logging.info("No valid timestamps found")
                return

            latest_timestamp = max(timestamps)
            logging.info(f"Found latest timestamp: {latest_timestamp}")

            # Construct full paths for all model files
            baseline_path = models_dir / f"baseline_model_{latest_timestamp}.joblib"
            xgb_path = models_dir / f"xgb_model_{latest_timestamp}.json"
            lgb_path = models_dir / f"lgb_model_{latest_timestamp}.txt"
            nn_path = models_dir / f"nn_model_{latest_timestamp}.keras"
            scaler_path = models_dir / f"scaler_{latest_timestamp}.pkl"

            # Verify all files exist
            required_files = [baseline_path, xgb_path, lgb_path, nn_path, scaler_path]
            for file in required_files:
                if not file.exists():
                    logging.error(f"Required model file not found: {file}")
                    return

            # Load models
            models = []

            # Load baseline model
            logging.info("Loading baseline model...")
            models.append(joblib.load(baseline_path))

            # Load XGBoost model
            logging.info("Loading XGBoost model...")
            xgb_model = xgb.XGBClassifier()
            xgb_model.load_model(str(xgb_path))
            models.append(xgb_model)

            # Load LightGBM model
            logging.info("Loading LightGBM model...")
            booster = lgb.Booster(model_file=str(lgb_path))
            lgb_model = lgb.LGBMClassifier(n_estimators=100)  # Initialize with default params
            lgb_model._Booster = booster  # Set the booster
            lgb_model._n_features = booster.num_feature()  # Set number of features
            lgb_model._n_classes = 2  # Binary classification
            lgb_model._classes = np.array([0, 1])  # Binary classes
            models.append(lgb_model)

            # Load Neural Network model
            logging.info("Loading Neural Network model...")
            nn_model = tf.keras.models.load_model(str(nn_path))
            models.append(nn_model)

            # Load scaler
            logging.info("Loading scaler...")
            with open(scaler_path, 'rb') as f:
                scaler = pickle.load(f)

            self.models = models
            self.scaler = scaler
            logging.info(f"Successfully loaded all models from timestamp: {latest_timestamp}")

        except Exception as e:
            logging.error(f"Error loading models: {str(e)}")
            logging.error(f"Full error: {traceback.format_exc()}")
            self.models = None
            self.scaler = None

    def predict_with_models(self, X):
        """Make predictions with all models."""
        if self.models is None or self.scaler is None:
            raise ValueError("Models not loaded")

        # Scale the input data
        X_scaled = self.scaler.transform(X)

        predictions = []
        for i, model in enumerate(self.models):
            try:
                if isinstance(model, lgb.LGBMClassifier):
                    pred = model.predict_proba(X_scaled)[:, 1]
                elif hasattr(model, 'predict_proba'):
                    pred = model.predict_proba(X_scaled)[:, 1]
                else:
                    pred = model.predict(X_scaled).ravel()
                predictions.append(pred)
            except Exception as e:
                logging.error(f"Error predicting with model {i}: {str(e)}")
                raise

        return np.mean(predictions, axis=0)

def create_prediction_interface():
    """Create and display the prediction interface."""
    # Create widgets
    home_team = widgets.Text(description='Home Team:', value='')
    away_team = widgets.Text(description='Away Team:', value='')
    train_button = widgets.Button(description='Train Models')
    predict_button = widgets.Button(description='Predict')
    output = widgets.Output()
    status_label = widgets.HTML(value="")  # Add status label

    # Initialize prediction interface
    interface = PredictionInterface()

    def update_status():
        """Update status label based on model availability."""
        if interface.models is not None and interface.scaler is not None:
            status_label.value = '<p style="color: green;">Models loaded and ready</p>'
        else:
            status_label.value = '<p style="color: red;">No models loaded - please train first</p>'

    update_status()  # Initial status update

    def on_train_button_clicked(b):
        with output:
            clear_output()
            try:
                model_manager = ModelManager()
                print("Training models... This may take a few minutes.")
                interface.models, interface.scaler = model_manager.train_and_save_models()
                print("Models trained and saved successfully!")
                update_status()
            except Exception as e:
                print(f"Error training models: {str(e)}")
                logging.error(f"Training error: {traceback.format_exc()}")

    def on_predict_button_clicked(b):
        with output:
            clear_output()
            try:
                if interface.models is None or interface.scaler is None:
                    print("Please train models first!")
                    return

                if not home_team.value or not away_team.value:
                    print("Please enter both home and away teams!")
                    return

                # Validate team abbreviations
                home_abbrev = home_team.value.upper()
                away_abbrev = away_team.value.upper()

                try:
                    validate_team_abbreviation(home_abbrev)
                    validate_team_abbreviation(away_abbrev)
                except ValueError as e:
                    print(str(e))
                    return

                predictor = GamePredictor(interface.models, interface.scaler, interface.processor)
                prediction = predictor.predict_game(
                    team_abbrev=home_abbrev,
                    opponent_abbrev=away_abbrev,
                    is_home=True
                )

                print(f"\nPrediction for {home_abbrev} vs {away_abbrev}:")
                print(f"Win Probability for {home_abbrev}: {prediction['prediction']:.2%}")
                print(f"Confidence: {prediction['confidence']:.2%}")
                print("\nModel Predictions:")
                for model, pred in prediction['model_predictions'].items():
                    print(f"{model.capitalize()}: {pred:.2%}")

            except Exception as e:
                print(f"Error during prediction: {str(e)}")
                logging.error(f"Prediction error: {traceback.format_exc()}")

    # Connect buttons to callbacks
    train_button.on_click(on_train_button_clicked)
    predict_button.on_click(on_predict_button_clicked)

    # Display widgets
    display(widgets.VBox([
        widgets.HTML("<h3>NBA Game Prediction</h3>"),
        status_label,
        widgets.HBox([home_team, away_team]),
        widgets.HBox([train_button, predict_button]),
        output
    ]))

# Create the interface when running this cell
create_prediction_interface()

2024-11-13 19:47:14,811 - INFO - Found latest timestamp: 20241113_193935
2024-11-13 19:47:14,879 - ERROR - Error loading models: can't set attribute 'booster_'
2024-11-13 19:47:14,881 - ERROR - Full error: Traceback (most recent call last):
  File "<ipython-input-192-a0b171efd0f6>", line 174, in load_most_recent_models
    lgb_model.booster_ = lgb.Booster(model_file=str(lgb_path))
AttributeError: can't set attribute 'booster_'



VBox(children=(HTML(value='<h3>NBA Game Prediction</h3>'), HTML(value='<p style="color: red;">No models loaded…

In [194]:
# Cell 11: Initialization of Components

# Initialize components
logging.info("Initializing components...")
data_manager = DataManager()
processor = DataProcessor()
trainer = ModelTrainer()

# Fetch data
logging.info("Fetching NBA data...")
game_data = fetch_or_load_data()
if game_data.empty:
    logging.error("No game data available.")
else:
    # Preprocess data
    logging.info("Preprocessing game data...")
    features = processor.preprocess_game_data(game_data)
    features = processor.prepare_features(features)

    # Split the dataset into features and target
    X = features.drop(['GAME_DATE', 'WIN'], axis=1)
    y = features['WIN']

    # Split data into training, validation, and test sets
    train_size = int(0.7 * len(features))
    val_size = int(0.15 * len(features))

    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size:train_size + val_size], y[train_size:train_size + val_size]
    X_test, y_test = X[train_size + val_size:], y[train_size + val_size:]

    # Train models
    logging.info("Training models...")
    models = trainer.train_models(X_train, X_val, X_test, y_train, y_val, y_test)
    scaler = trainer.scaler

    # Save models and scaler
    logging.info("Saving models...")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    joblib.dump(models[0], f'{NotebookConfig.MODELS_DIR}/baseline_model_{timestamp}.joblib')
    models[1].save_model(f'{NotebookConfig.MODELS_DIR}/xgb_model_{timestamp}.json')
    models[2].booster_.save_model(f'{NotebookConfig.MODELS_DIR}/lgb_model_{timestamp}.txt')
    models[3].save(f'{NotebookConfig.MODELS_DIR}/nn_model_{timestamp}.h5')
    with open(f'{NotebookConfig.MODELS_DIR}/scaler_{timestamp}.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    logging.info("Initialization and training completed.")

2024-11-13 19:56:25,910 - INFO - Initializing components...
2024-11-13 19:56:25,913 - INFO - Fetching NBA data...
2024-11-13 19:56:25,967 - INFO - Data loaded from /content/drive/MyDrive/nba_models/cache/nba_game_data.pkl
2024-11-13 19:56:25,985 - INFO - Cache is fresh, loading data from cache.
2024-11-13 19:56:25,988 - INFO - Preprocessing game data...
2024-11-13 19:56:25,990 - INFO - Initial data shape: (30000, 28)
2024-11-13 19:56:25,994 - INFO - Initial columns: Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')
2024-11-13 19:56:26,025 - INFO - Created WIN column from WL
2024-11-13 19:56:26,209 - INFO - After date conversion shape: (30000, 29)
2024-11-13 19:56:26,239 - INFO - After team abbreviation mapping shape: (30000, 29)
 

[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:07,428] Trial 0 finished with value: 1.0 and parameters: {'num_leaves': 25, 'learning_rate': 0.03689013247052946, 'n_estimators': 191, 'min_child_samples': 13, 'subsample': 0.747450879777221, 'colsample_bytree': 0.8603376241485899}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:08,909] Trial 1 finished with value: 1.0 and parameters: {'num_leaves': 9, 'learning_rate': 0.02988568047253621, 'n_estimators': 133, 'min_child_samples': 28, 'subsample': 0.7685098017527628, 'colsample_bytree': 0.6538595859049122}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:10,302] Trial 2 finished with value: 1.0 and parameters: {'num_leaves': 11, 'learning_rate': 0.0280301632583062, 'n_estimators': 157, 'min_child_samples': 42, 'subsample': 0.8578470410547561, 'colsample_bytree': 0.6159593155225976}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:11,420] Trial 3 finished with value: 1.0 and parameters: {'num_leaves': 16, 'learning_rate': 0.06619541461560273, 'n_estimators': 105, 'min_child_samples': 39, 'subsample': 0.8098672503627985, 'colsample_bytree': 0.6794067836741787}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:13,151] Trial 4 finished with value: 1.0 and parameters: {'num_leaves': 28, 'learning_rate': 0.05334097327273916, 'n_estimators': 127, 'min_child_samples': 14, 'subsample': 0.8014128094737025, 'colsample_bytree': 0.6844445990314464}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:16,805] Trial 5 finished with value: 1.0 and parameters: {'num_leaves': 27, 'learning_rate': 0.0326565902147186, 'n_estimators': 282, 'min_child_samples': 21, 'subsample': 0.6783886266080315, 'colsample_bytree': 0.8631040537325656}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:17,962] Trial 6 finished with value: 1.0 and parameters: {'num_leaves': 16, 'learning_rate': 0.023166370768851446, 'n_estimators': 101, 'min_child_samples': 13, 'subsample': 0.6131909422135272, 'colsample_bytree': 0.8460619430398518}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:19,710] Trial 7 finished with value: 1.0 and parameters: {'num_leaves': 23, 'learning_rate': 0.07844816725210269, 'n_estimators': 92, 'min_child_samples': 18, 'subsample': 0.634164953114548, 'colsample_bytree': 0.6111847477347211}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:22,441] Trial 8 finished with value: 1.0 and parameters: {'num_leaves': 19, 'learning_rate': 0.0809756084881905, 'n_estimators': 159, 'min_child_samples': 18, 'subsample': 0.8304493631890396, 'colsample_bytree': 0.8211078865020347}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:24,457] Trial 9 finished with value: 1.0 and parameters: {'num_leaves': 18, 'learning_rate': 0.09686803738994959, 'n_estimators': 129, 'min_child_samples': 34, 'subsample': 0.848294292083259, 'colsample_bytree': 0.7960331440500917}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:27,794] Trial 10 finished with value: 1.0 and parameters: {'num_leaves': 31, 'learning_rate': 0.048115373874781986, 'n_estimators': 226, 'min_child_samples': 27, 'subsample': 0.7116160388899516, 'colsample_bytree': 0.7553859138259016}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:29,432] Trial 11 finished with value: 1.0 and parameters: {'num_leaves': 8, 'learning_rate': 0.039701523696677024, 'n_estimators': 212, 'min_child_samples': 50, 'subsample': 0.7586350869335764, 'colsample_bytree': 0.8908084046747187}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:31,968] Trial 12 finished with value: 1.0 and parameters: {'num_leaves': 24, 'learning_rate': 0.01439293012813233, 'n_estimators': 202, 'min_child_samples': 25, 'subsample': 0.7534300257936692, 'colsample_bytree': 0.7024454264838752}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:32,646] Trial 13 finished with value: 1.0 and parameters: {'num_leaves': 12, 'learning_rate': 0.011903536621655712, 'n_estimators': 51, 'min_child_samples': 32, 'subsample': 0.7069389590853646, 'colsample_bytree': 0.7626750959880264}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:36,670] Trial 14 finished with value: 1.0 and parameters: {'num_leaves': 23, 'learning_rate': 0.0393321345910803, 'n_estimators': 254, 'min_child_samples': 11, 'subsample': 0.8871931539760917, 'colsample_bytree': 0.7224857562886556}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:40,625] Trial 15 finished with value: 1.0 and parameters: {'num_leaves': 32, 'learning_rate': 0.06176459924572494, 'n_estimators': 182, 'min_child_samples': 25, 'subsample': 0.7825155254873574, 'colsample_bytree': 0.6317998061037}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:42,238] Trial 16 finished with value: 1.0 and parameters: {'num_leaves': 13, 'learning_rate': 0.04237565468104472, 'n_estimators': 178, 'min_child_samples': 37, 'subsample': 0.7079770995065905, 'colsample_bytree': 0.6571294237433356}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:43,980] Trial 17 finished with value: 1.0 and parameters: {'num_leaves': 8, 'learning_rate': 0.021493633150135423, 'n_estimators': 237, 'min_child_samples': 46, 'subsample': 0.6555901290650435, 'colsample_bytree': 0.7902123217028545}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:44,853] Trial 18 finished with value: 1.0 and parameters: {'num_leaves': 26, 'learning_rate': 0.031884357862982535, 'n_estimators': 50, 'min_child_samples': 30, 'subsample': 0.77448766747249, 'colsample_bytree': 0.7271889624582539}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


[I 2024-11-13 19:58:46,653] Trial 19 finished with value: 1.0 and parameters: {'num_leaves': 21, 'learning_rate': 0.06028468788880089, 'n_estimators': 141, 'min_child_samples': 21, 'subsample': 0.7086412587526603, 'colsample_bytree': 0.8893459816637278}. Best is trial 0 with value: 1.0.


[LightGBM] [Info] Number of positive: 5174, number of negative: 5100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17782
[LightGBM] [Info] Number of data points in the train set: 10274, number of used features: 111
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503601 -> initscore=0.014406
[LightGBM] [Info] Start training from score 0.014406


2024-11-13 19:58:49,140 - INFO - LightGBM Evaluation:
2024-11-13 19:58:49,142 - INFO -   Accuracy: 1.0000
2024-11-13 19:58:49,144 - INFO -   F1 Score: 1.0000
2024-11-13 19:58:49,146 - INFO -   ROC AUC: 1.0000
2024-11-13 19:58:49,148 - INFO -   Precision: 1.0000
2024-11-13 19:58:49,150 - INFO -   Recall: 1.0000
2024-11-13 19:58:49,154 - INFO -   Confusion Matrix:
2024-11-13 19:58:49,156 - INFO -     TN: 1090, FP: 0
2024-11-13 19:58:49,157 - INFO -     FN: 0, TP: 1113
2024-11-13 19:58:49,158 - INFO - Training Neural Network model...
2024-11-13 19:58:49,159 - INFO - Creating neural network with input dimension: 111


Epoch 1/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.7436 - loss: 1.7870 - val_accuracy: 0.9696 - val_loss: 0.9578 - learning_rate: 0.0010
Epoch 2/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9634 - loss: 0.7348 - val_accuracy: 0.9932 - val_loss: 0.3813 - learning_rate: 0.0010
Epoch 3/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9775 - loss: 0.3556 - val_accuracy: 0.9964 - val_loss: 0.1933 - learning_rate: 0.0010
Epoch 4/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9805 - loss: 0.2068 - val_accuracy: 0.9945 - val_loss: 0.1249 - learning_rate: 0.0010
Epoch 5/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9885 - loss: 0.1334 - val_accuracy: 0.9959 - val_loss: 0.0897 - learning_rate: 0.0010
Epoch 6/200
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

2024-11-13 20:01:57,479 - INFO - Neural Network Evaluation:
2024-11-13 20:01:57,482 - INFO -   Accuracy: 1.0000
2024-11-13 20:01:57,483 - INFO -   F1 Score: 1.0000
2024-11-13 20:01:57,486 - INFO -   ROC AUC: 1.0000
2024-11-13 20:01:57,494 - INFO -   Precision: 1.0000
2024-11-13 20:01:57,501 - INFO -   Recall: 1.0000
2024-11-13 20:01:57,514 - INFO -   Confusion Matrix:
2024-11-13 20:01:57,518 - INFO -     TN: 1090, FP: 0
2024-11-13 20:01:57,523 - INFO -     FN: 0, TP: 1113
2024-11-13 20:01:57,528 - INFO - Saving models...
2024-11-13 20:01:57,786 - INFO - Initialization and training completed.


In [195]:
# Cell 12: Game Schedule and Predictions Formatter
# This class formats predictions for upcoming games.

class NBAGamePredictor:
    def __init__(self, models, scaler, processor):
        self.predictor = GamePredictor(models, scaler, processor)
        self.config = NotebookConfig()

    def fetch_upcoming_games(self):
        """Fetch upcoming NBA games and odds."""
        params = {
            'apiKey': self.config.ODDS_API_KEY,
            'regions': self.config.REGIONS,
            'markets': 'h2h',
            'oddsFormat': self.config.ODDS_FORMAT
        }

        try:
            response = requests.get(self.config.ODDS_API_URL, params=params)
            if response.status_code == 200:
                return response.json()
            else:
                logging.error(f"Error fetching odds: {response.status_code}")
                return None
        except Exception as e:
            logging.error(f"Error fetching upcoming games: {str(e)}")
            return None

    def format_predictions(self):
        """Format predictions for upcoming games."""
        games = self.fetch_upcoming_games()
        if not games:
            return "Unable to fetch upcoming games"

        output = "Model Predictions:\n"
        for game in games:
            home_team = game['home_team']
            away_team = game['away_team']

            # Get win probabilities
            home_pred = self.predictor.predict_game(pd.DataFrame([{
                'TEAM_ABBREVIATION': home_team,
                'OPPONENT_ABBREV': away_team,
                'GAME_DATE': datetime.now().strftime('%Y-%m-%d'),
                'MATCHUP': f"{home_team} vs. {away_team}",
                'WL': 'W'  # Placeholder
            }]))['prediction']

            output += f"{home_team} vs {away_team} - {home_team} Win Probability: {home_pred:.2%}\n"

        return output

# Test NBAGamePredictor class
if __name__ == "__main__":
    try:
        # Assume models, scaler, and processor are already initialized
        nba_predictor = NBAGamePredictor(models, scaler, processor)
        predictions_output = nba_predictor.format_predictions()
        print(predictions_output)
    except Exception as e:
        logging.error(f"Error running predictions: {str(e)}")

2024-11-13 20:02:05,459 - ERROR - Error running predictions: GamePredictor.predict_game() missing 1 required positional argument: 'opponent_abbrev'
