In [None]:
import os
import pandas as pd

# Path to your dataset folder
base_dir = r"E:\Imran Projects\Sports-Project-main\data\ml_raw_datasets"

# List all CSV files
csv_files = [f for f in os.listdir(base_dir) if f.endswith('.csv')]

print(f"Found {len(csv_files)} CSV files:\n")
for f in csv_files:
    print(f" - {f}")

print("\n===============================")

# Function to explore a single dataset
def explore_dataset(file_path):
    df = pd.read_csv(file_path)
    print(f"\nüìò Dataset: {os.path.basename(file_path)}")
    print("=" * 60)
    
    # Basic info
    print(f"‚û°Ô∏è Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
    print("\n‚û°Ô∏è Columns and Data Types:")
    print(df.dtypes)
    
    # Missing values
    missing = df.isnull().sum()
    print("\n‚û°Ô∏è Missing Values:")
    print(missing[missing > 0] if missing.sum() > 0 else "No missing values ‚úÖ")
    
    # Duplicate rows
    duplicates = df.duplicated().sum()
    print(f"\n‚û°Ô∏è Duplicated Rows: {duplicates}")
    
    # Summary statistics (for numeric columns)
    print("\n‚û°Ô∏è Summary Statistics:")
    print(df.describe())
    
    # Unique values (for small categorical columns)
    print("\n‚û°Ô∏è Unique Value Counts (Top 10 columns by fewest uniques):")
    nunique = df.nunique().sort_values().head(10)
    for col, count in nunique.items():
        print(f"{col}: {count} unique values")
    
    # Quick sample preview
    print("\n‚û°Ô∏è Sample Data:")
    print(df.head(5))
    
    print("\n" + "-" * 60)
    return df

# Load and explore all datasets
datasets = {}
for file in csv_files:
    file_path = os.path.join(base_dir, file)
    datasets[file] = explore_dataset(file_path)

print("\n‚úÖ All datasets analyzed successfully!")


Found 15 CSV files:

 - american_football_games_2020.csv
 - american_football_games_2021.csv
 - american_football_games_2022.csv
 - american_football_games_2023.csv
 - american_football_games_2024.csv
 - baseball_games_2020.csv
 - baseball_games_2021.csv
 - baseball_games_2022.csv
 - baseball_games_2023.csv
 - baseball_games_2024.csv
 - basketball_games_2020-2021.csv
 - basketball_games_2021-2022.csv
 - basketball_games_2022-2023.csv
 - basketball_games_2023-2024.csv
 - basketball_games_2024-2025.csv


üìò Dataset: american_football_games_2020.csv
‚û°Ô∏è Shape: 269 rows √ó 37 columns

‚û°Ô∏è Columns and Data Types:
game_id                    int64
game_stage               float64
game_week                float64
game_date_timezone        object
game_date_date            object
game_date_time            object
game_date_timestamp        int64
game_venue_name          float64
game_venue_city          float64
game_status_short         object
game_status_long          object
game_status_t

In [2]:
import requests
import json

API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"

def get_leagues(sport_name):
    """
    Fetch leagues for a given sport name (e.g., basketball, american-football).
    Works with API-Sports different data structures.
    """
    url = f"https://v1.{sport_name}.api-sports.io/leagues"
    headers = {"x-apisports-key": API_KEY}
    response = requests.get(url, headers=headers)

    print(f"\n=== {sport_name.upper()} LEAGUES ===")

    if response.status_code != 200:
        print(f"‚ùå Error fetching leagues for {sport_name}: {response.status_code}")
        print(response.text)
        return

    try:
        data = response.json()
    except json.JSONDecodeError:
        print("‚ùå Failed to decode JSON response.")
        print(response.text)
        return

    if "response" not in data or not data["response"]:
        print("‚ö†Ô∏è No leagues found or empty response.")
        print("Raw API response:", data)
        return

    for item in data["response"]:
        # Handle differences between football API and others
        league_info = item.get("league") or item.get("competition") or {}
        country_info = item.get("country", {})
        seasons_info = item.get("seasons", [])

        league_name = league_info.get("name", "N/A")
        league_id = league_info.get("id", "N/A")
        country = country_info.get("name", "N/A")
        season = (
            seasons_info[-1].get("year", "N/A")
            if isinstance(seasons_info, list) and seasons_info
            else "N/A"
        )

        print(f"üèü {league_name} | ID: {league_id} | Country: {country} | Season: {season}")

if __name__ == "__main__":
    get_leagues("basketball")        # should show NBA info
    get_leagues("american-football") # should show NFL info



=== BASKETBALL LEAGUES ===
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Africa | Season: N/A
üèü N/A | ID: N/A | Country: Albania | Season: N/A
üèü N/A | ID: N/A | Country: Argentina | Season: N/A
üèü N/A | ID: N/A | Country: Argentina | Season: N/A
üèü N/A | ID: N/A | Country: Argentina | Season: N/A
üèü N/A | ID: N/A | Country: Argentina | Season: N/A
üèü N/A | ID: N/A | Country: Argentina | Season: N/A
üèü N/A | ID: N/A | Country: Asia | Season: N/A
üèü N/A | ID: N/A | Country: Asia | Sea

In [16]:
import requests
import json

# =============================================================
# CONFIGURATION
# =============================================================
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
HEADERS = {"x-apisports-key": API_KEY}

# Supported sports & endpoints
SPORTS_ENDPOINTS = {
    "american-football": "https://v1.american-football.api-sports.io/leagues",
    "basketball": "https://v1.basketball.api-sports.io/leagues",
    "baseball": "https://v1.baseball.api-sports.io/leagues",
    "hockey": "https://v1.hockey.api-sports.io/leagues"
}

# =============================================================
# FETCH LEAGUE IDS
# =============================================================
print("\n=============================================================")
print("üîç Fetching available leagues from API-Sports")
print("=============================================================\n")

results = {}

for sport, url in SPORTS_ENDPOINTS.items():
    print(f"üìò Checking leagues for {sport.upper()}...")
    try:
        response = requests.get(url, headers=HEADERS)
        data = response.json()
        leagues = data.get("response", [])
        if not leagues:
            print(f"‚ö†Ô∏è  No leagues found for {sport}")
            continue

        for lg in leagues:
            name = lg.get("name", "").upper()
            country = (lg.get("country", {}) or {}).get("name", "Unknown")
            league_id = lg.get("id")

            # Detect main leagues
            if sport == "american-football" and "NFL" in name:
                results["NFL"] = league_id
            elif sport == "basketball" and "NBA" in name:
                results["NBA"] = league_id
            elif sport == "baseball" and ("MLB" in name or "MAJOR LEAGUE" in name):
                results["MLB"] = league_id
            elif sport == "hockey" and ("NHL" in name or "NATIONAL HOCKEY LEAGUE" in name):
                results["NHL"] = league_id

        print(f"‚úÖ Done ({len(leagues)} leagues found)\n")

    except Exception as e:
        print(f"‚ùå Error fetching {sport}: {e}\n")

# =============================================================
# SHOW RESULTS
# =============================================================
print("===== MAIN LEAGUE IDS (Detected) =====")
for k, v in results.items():
    print(f"{k} | ID: {v}")

print("======================================")

# Optional: Save to a JSON file
with open("league_ids.json", "w") as f:
    json.dump(results, f, indent=4)

print("\nüíæ Saved to 'league_ids.json'\n")



üîç Fetching available leagues from API-Sports

üìò Checking leagues for AMERICAN-FOOTBALL...
‚úÖ Done (2 leagues found)

üìò Checking leagues for BASKETBALL...
‚úÖ Done (425 leagues found)

üìò Checking leagues for BASEBALL...
‚úÖ Done (77 leagues found)

üìò Checking leagues for HOCKEY...
‚úÖ Done (262 leagues found)

===== MAIN LEAGUE IDS (Detected) =====
NBA | ID: 13
MLB | ID: 71
NHL | ID: 271

üíæ Saved to 'league_ids.json'



In [19]:
import requests

import requests
import json

API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
HEADERS = {"x-apisports-key": API_KEY}
URL = "https://v1.american-football.api-sports.io/leagues"

print("\n=============================================================")
print("üèà Re-checking American Football Leagues (Raw JSON Mode)")
print("=============================================================\n")

response = requests.get(URL, headers=HEADERS)
print(response.text[:1000])  # Print first 1000 chars only


response = requests.get(URL, headers=HEADERS)
data = response.json()

if "response" not in data:
    print("‚ùå No 'response' in API result. Raw data:")
    print(data)
else:
    for lg in data["response"]:
        lid = lg.get("league", {}).get("id")
        name = lg.get("league", {}).get("name")
        country = (lg.get("country", {}) or {}).get("name", "Unknown")
        season = lg.get("seasons", [{}])[0].get("year", "N/A")
        print(f"ID: {lid} | Name: {name} | Country: {country} | Season: {season}")



üèà Re-checking American Football Leagues (Raw JSON Mode)

{"get":"leagues","parameters":[],"errors":[],"results":2,"response":[{"league":{"id":1,"name":"NFL","logo":"https:\/\/media.api-sports.io\/american-football\/leagues\/1.png"},"country":{"name":"USA","code":"US","flag":"https:\/\/media.api-sports.io\/flags\/us.svg"},"seasons":[{"year":2025,"start":"2025-08-01","end":"2026-02-08","current":true,"coverage":{"games":{"events":true,"statisitcs":{"teams":true,"players":true}},"statistics":{"season":{"players":true}},"players":true,"injuries":true,"standings":true}},{"year":2024,"start":"2024-08-02","end":"2025-02-09","current":false,"coverage":{"games":{"events":true,"statisitcs":{"teams":true,"players":true}},"statistics":{"season":{"players":true}},"players":true,"injuries":false,"standings":true}},{"year":2023,"start":"2023-08-04","end":"2024-02-11","current":false,"coverage":{"games":{"events":true,"statisitcs":{"teams":true,"players":true}},"statistics":{"season":{"players":tr

In [21]:
import pandas as pd

# File paths
files = {
    "NFL": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/american_football_SUPER_FINAL_FIXED.csv",
    "MLB": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/baseball_SUPER_FINAL_FIXED.csv",
    "NBA": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/basketball_SUPER_FINAL_FIXED.csv",
    "NHL": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/ice_hockey_SUPER_FINAL_FIXED.csv"
}

# Load all datasets into a dictionary
data = {}
for sport, path in files.items():
    df = pd.read_csv(path)
    print("="*80)
    print(f"üìÇ Loading: {sport}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Head:\n{df.head()}")
    data[sport] = df


üìÇ Loading: NFL
Shape: (1002, 28)
Columns: ['game_id', 'date', 'home_team', 'away_team', 'home_total', 'away_total', 'total_points', 'over_under_line', 'beat_over', 'home_ppg_last5', 'away_ppg_last5', 'home_momentum', 'rest_home', 'rest_away', 'month', 'is_weekend', 'home_last5_avg', 'away_last5_avg', 'home_away_diff', 'rest_diff', 'over_signal', 'home_total_norm', 'away_total_norm', 'api_odds', 'venue_advantage', 'player_injuries', 'team_efficiency', 'weather_factor']
Head:
   game_id        date             home_team             away_team  \
0     3905  2022-08-05     Las Vegas Raiders  Jacksonville Jaguars   
1     3907  2022-08-11      Baltimore Ravens      Tennessee Titans   
2     3906  2022-08-11  New England Patriots       New York Giants   
3     3911  2022-08-12   Philadelphia Eagles         New York Jets   
4     3908  2022-08-12         Detroit Lions       Atlanta Falcons   

   home_total  away_total  total_points  over_under_line  beat_over  \
0        27.0        11.0 

In [4]:
import pandas as pd
from pathlib import Path

# Source and destination folders
src_folder = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED")
dest_folder = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SPREAD_DATASETS")
dest_folder.mkdir(parents=True, exist_ok=True)

# File paths
files = {
    "NFL": src_folder / "american_football_SUPER_FINAL_FIXED.csv",
    "MLB": src_folder / "baseball_SUPER_FINAL_FIXED.csv",
    "NBA": src_folder / "basketball_SUPER_FINAL_FIXED.csv",
    "NHL": src_folder / "ice_hockey_SUPER_FINAL_FIXED.csv"
}

# Process and save new copies
for sport, path in files.items():
    print("=" * 80)
    print(f"üìÇ Processing: {sport}")

    df = pd.read_csv(path)

    # Check basic columns
    if 'home_total' not in df.columns or 'away_total' not in df.columns:
        print(f"‚ö†Ô∏è Missing required columns for {sport}! Skipping.")
        continue

    # Create new regression target
    df['home_margin_of_victory'] = df['home_total'] - df['away_total']

    # Print stats for verification
    print(f"‚úÖ Added 'home_margin_of_victory' (sample): {df['home_margin_of_victory'].head().tolist()}")
    print(f"Mean: {df['home_margin_of_victory'].mean():.3f}, Std: {df['home_margin_of_victory'].std():.3f}")

    # Save new dataset copy
    save_path = dest_folder / f"{sport}_SPREAD_REGRESSION.csv"
    df.to_csv(save_path, index=False)
    print(f"üíæ Saved new dataset at: {save_path}")


üìÇ Processing: NFL
‚úÖ Added 'home_margin_of_victory' (sample): [16.0, 13.0, -2.0, -3.0, -4.0]
Mean: 2.246, Std: 13.590
üíæ Saved new dataset at: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\FINAL_SPREAD_DATASETS\NFL_SPREAD_REGRESSION.csv
üìÇ Processing: MLB
‚úÖ Added 'home_margin_of_victory' (sample): [13.0, 3.0, -1.0, 2.0, -2.0]
Mean: 0.009, Std: 4.445
üíæ Saved new dataset at: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\FINAL_SPREAD_DATASETS\MLB_SPREAD_REGRESSION.csv
üìÇ Processing: NBA
‚úÖ Added 'home_margin_of_victory' (sample): [-9.0, -5.0, 38.0, -9.0, 41.0]
Mean: 2.244, Std: 15.147
üíæ Saved new dataset at: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\FINAL_SPREAD_DATASETS\NBA_SPREAD_REGRESSION.csv
üìÇ Processing: NHL
‚úÖ Added 'home_margin_of_victory' (sample): [-1.0, 1.0, 1.0, 3.0, 6.0]
Mean: 0.284, Std: 2.633
üíæ Saved new dataset at: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\FINAL_SPREAD_DATASETS\NHL_SPREAD_REGRESSION.csv


In [5]:
import pandas as pd
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Paths
data_folder = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SPREAD_DATASETS")
model_folder = Path("C:/Users/mimra/Downloads/SPREAD_MODELS")
model_folder.mkdir(parents=True, exist_ok=True)

# Datasets
datasets = {
    "NFL": data_folder / "NFL_SPREAD_REGRESSION.csv",
    "MLB": data_folder / "MLB_SPREAD_REGRESSION.csv",
    "NBA": data_folder / "NBA_SPREAD_REGRESSION.csv",
    "NHL": data_folder / "NHL_SPREAD_REGRESSION.csv"
}

# Features to drop (non-numeric or irrelevant columns)
drop_cols = [
    "game_id", "date", "home_team", "away_team", 
    "beat_over", "total_points", "home_total", "away_total"
]

# Training loop
for sport, file_path in datasets.items():
    print("=" * 90)
    print(f"üèãÔ∏è Training Spread Regression Model for: {sport}")
    df = pd.read_csv(file_path)

    # Drop unnecessary columns
    X = df.drop(columns=drop_cols + ["home_margin_of_victory"], errors="ignore")
    X = X.select_dtypes(include=['number'])  # only numeric features
    y = df["home_margin_of_victory"]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Models
    models = {
        "xgb": XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42),
        "lgb": LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=-1, random_state=42),
        "rf": RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
    }

    # Train and evaluate
    for name, model in models.items():
        print(f"\nüöÄ Training {name.upper()} Regressor...")
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)
        print(f"‚úÖ {name.upper()} | RMSE: {rmse:.3f} | R¬≤: {r2:.3f}")

        # Save model
        model_filename = model_folder / f"{sport}_spread_{name}.pkl"
        joblib.dump(model, model_filename)
        print(f"üíæ Saved model: {model_filename}")

    print(f"üéØ Finished training for {sport} ‚úÖ")

print("\n‚úÖ All spread regression models trained and saved successfully!")


üèãÔ∏è Training Spread Regression Model for: NFL

üöÄ Training XGB Regressor...
‚úÖ XGB | RMSE: 1.589 | R¬≤: 0.986
üíæ Saved model: C:\Users\mimra\Downloads\SPREAD_MODELS\NFL_spread_xgb.pkl

üöÄ Training LGB Regressor...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1832
[LightGBM] [Info] Number of data points in the train set: 801, number of used features: 19
[LightGBM] [Info] Start training from score 2.027466
‚úÖ LGB | RMSE: 1.818 | R¬≤: 0.982
üíæ Saved model: C:\Users\mimra\Downloads\SPREAD_MODELS\NFL_spread_lgb.pkl

üöÄ Training RF Regressor...
‚úÖ RF | RMSE: 1.556 | R¬≤: 0.987
üíæ Saved model: C:\Users\mimra\Downloads\SPREAD_MODELS\NFL_spread_rf.pkl
üéØ Finished training for NFL ‚úÖ
üèãÔ∏è Training Spread Regression Model for: MLB

üöÄ Training XGB Regressor...
‚úÖ

In [6]:
import pandas as pd
from pathlib import Path

# Paths
original_files = {
    "NFL": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/american_football_SUPER_FINAL_FIXED.csv",
    "MLB": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/baseball_SUPER_FINAL_FIXED.csv",
    "NBA": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/basketball_SUPER_FINAL_FIXED.csv",
    "NHL": "C:/Users/mimra/Downloads/FINAL_100_100_DATASET/FINAL_SUPER_ENRICHED/FINAL_SUPER_ENRICHED_FIXED/ice_hockey_SUPER_FINAL_FIXED.csv"
}

# Create a new folder for Winner dataset
new_dataset_dir = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET")
new_dataset_dir.mkdir(parents=True, exist_ok=True)

# Process each file
for sport, path in original_files.items():
    df = pd.read_csv(path)
    
    # Create home_team_won column
    df['home_team_won'] = df.apply(lambda row: 1 if row['home_total'] > row['away_total'] else 0, axis=1)
    
    # Save new CSV
    new_file_path = new_dataset_dir / f"{sport}_winner_dataset.csv"
    df.to_csv(new_file_path, index=False)
    
    print(f"‚úÖ {sport} dataset created with Winner column: {new_file_path}")


‚úÖ NFL dataset created with Winner column: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_DATASET\NFL_winner_dataset.csv
‚úÖ MLB dataset created with Winner column: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_DATASET\MLB_winner_dataset.csv
‚úÖ NBA dataset created with Winner column: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_DATASET\NBA_winner_dataset.csv
‚úÖ NHL dataset created with Winner column: C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_DATASET\NHL_winner_dataset.csv


In [7]:
import pandas as pd
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

# Directory for saving models
model_dir = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_MODELS")
model_dir.mkdir(parents=True, exist_ok=True)

# Choose a sport to train (example: NFL)
sport = "NFL"
df = pd.read_csv(new_dataset_dir / f"{sport}_winner_dataset.csv")

# Features: use same as O/U model (exclude target & identifiers)
exclude_cols = ['game_id', 'date', 'home_team', 'away_team', 'home_total', 'away_total', 'total_points', 'over_under_line', 'beat_over', 'home_team_won']
X = df.drop(columns=exclude_cols)
y = df['home_team_won']

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train models ---
# 1Ô∏è‚É£ Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_calibrator = CalibratedClassifierCV(rf, cv='prefit')
rf_calibrator.fit(X_train, y_train)

# 2Ô∏è‚É£ XGBoost
xgb = XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_calibrator = CalibratedClassifierCV(xgb, cv='prefit')
xgb_calibrator.fit(X_train, y_train)

# 3Ô∏è‚É£ LightGBM
lgbm = lgb.LGBMClassifier(n_estimators=200, random_state=42)
lgbm.fit(X_train, y_train)
lgbm_calibrator = CalibratedClassifierCV(lgbm, cv='prefit')
lgbm_calibrator.fit(X_train, y_train)

# --- Save models ---
joblib.dump(rf, model_dir / f"{sport}_winner_rf.pkl")
joblib.dump(rf_calibrator, model_dir / f"{sport}_winner_calibrator.pkl")
joblib.dump(xgb, model_dir / f"{sport}_winner_xgb.pkl")
joblib.dump(xgb_calibrator, model_dir / f"{sport}_winner_xgb_calibrator.pkl")
joblib.dump(lgbm, model_dir / f"{sport}_winner_lgb.pkl")
joblib.dump(lgbm_calibrator, model_dir / f"{sport}_winner_lgb_calibrator.pkl")

print(f"‚úÖ Winner models trained and saved for {sport} at {model_dir}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 449, number of negative: 352
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1577
[LightGBM] [Info] Number of data points in the train set: 801, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.560549 -> initscore=0.243392
[LightGBM] [Info] Start training from score 0.243392
‚úÖ Winner models trained and saved for NFL at C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS


In [14]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

leagues = ["NFL", "NBA", "MLB", "NHL"]
model_types = ["rf", "xgb", "lgbm"]

base_model_path = r"C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS"
base_test_path = r"C:\Users\mimra\Downloads\FINAL_100_100_DATASET\TEST_DATA"

for league in leagues:
    # Load test data
    test_file = f"{base_test_path}/{league}_test.csv"
    df_test = pd.read_csv(test_file)
    
    X_test = df_test.drop(columns=["target"])  # replace 'target' with your target column
    y_test = df_test["target"]

    print(f"\n===== {league} Winner Models =====")
    
    for model_type in model_types:
        model_file = f"{base_model_path}/{league}_{model_type}_winner.pkl"
        model = joblib.load(model_file)
        
        y_pred = model.predict(X_test)
        print(f"\nModel: {model_type.upper()}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mimra\\Downloads\\FINAL_100_100_DATASET\\TEST_DATA/NFL_test.csv'

In [None]:
import pandas as pd
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

# Directory containing the datasets you just created
new_dataset_dir = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET")

# Directory to save the trained winner models
model_dir = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_MODELS")
model_dir.mkdir(parents=True, exist_ok=True)

# List of sports to train
sports = ["NFL", "NBA", "MLB", "NHL"]

# Columns to exclude from features
exclude_cols = [
    'game_id', 'date', 'home_team', 'away_team',
    'home_total', 'away_total', 'total_points',
    'over_under_line', 'beat_over', 'home_team_won'
]

for sport in sports:
    print(f"\nüîπ Training winner models for {sport}...")
    
    # Load dataset
    df = pd.read_csv(new_dataset_dir / f"{sport}_winner_dataset.csv")
    
    # Prepare features and target
    X = df.drop(columns=exclude_cols)
    y = df['home_team_won']
    
    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # --- 1Ô∏è‚É£ Random Forest ---
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train, y_train)
    rf_calibrator = CalibratedClassifierCV(rf, cv='prefit')
    rf_calibrator.fit(X_train, y_train)
    
    # --- 2Ô∏è‚É£ XGBoost ---
    xgb = XGBClassifier(
        n_estimators=200,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)
    xgb_calibrator = CalibratedClassifierCV(xgb, cv='prefit')
    xgb_calibrator.fit(X_train, y_train)
    
    # --- 3Ô∏è‚É£ LightGBM ---
    lgbm = lgb.LGBMClassifier(n_estimators=200, random_state=42)
    lgbm.fit(X_train, y_train)
    lgbm_calibrator = CalibratedClassifierCV(lgbm, cv='prefit')
    lgbm_calibrator.fit(X_train, y_train)
    
    # --- Save models ---
    joblib.dump(rf, model_dir / f"{sport}_winner_rf.pkl")
    joblib.dump(rf_calibrator, model_dir / f"{sport}_winner_calibrator.pkl")
    joblib.dump(xgb, model_dir / f"{sport}_winner_xgb.pkl")
    joblib.dump(xgb_calibrator, model_dir / f"{sport}_winner_xgb_calibrator.pkl")
    joblib.dump(lgbm, model_dir / f"{sport}_winner_lgb.pkl")
    joblib.dump(lgbm_calibrator, model_dir / f"{sport}_winner_lgb_calibrator.pkl")
    
    print(f"‚úÖ Winner models trained and saved for {sport} at {model_dir}")



üîπ Training winner models for NFL...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 449, number of negative: 352
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1577
[LightGBM] [Info] Number of data points in the train set: 801, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.560549 -> initscore=0.243392
[LightGBM] [Info] Start training from score 0.243392
‚úÖ Winner models trained and saved for NFL at C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS

üîπ Training winner models for NBA...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 1850, number of negative: 1469
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1953
[LightGBM] [Info] Number of data points in the train set: 3319, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.557397 -> initscore=0.230604
[LightGBM] [Info] Start training from score 0.230604
‚úÖ Winner models trained and saved for NBA at C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS

üîπ Training winner models for MLB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3609, number of negative: 3273
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1242
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524412 -> initscore=0.097724
[LightGBM] [Info] Start training from score 0.097724
‚úÖ Winner models trained and saved for MLB at C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS

üîπ Training winner models for NHL...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 1991, number of negative: 1628
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 3619, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550152 -> initscore=0.201285
[LightGBM] [Info] Start training from score 0.201285
‚úÖ Winner models trained and saved for NHL at C:\Users\mimra\Downloads\FINAL_100_100_DATASET\WINNER_MODELS


In [2]:
#!/usr/bin/env python3
"""
Fetch all available games for NFL, NBA, MLB, and NHL and save them to CSV files.
"""

import requests
import logging
import csv

# ----------------------------
# Configuration
# ----------------------------
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"  # Replace with your API key
BASE_URL = "https://v1.american-football.api-sports.io"  # For NFL; other leagues may have different endpoints
HEADERS = {"x-apisports-key": API_KEY}

LOG_FORMAT = "%(asctime)s | %(levelname)-8s | %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
log = logging.getLogger("fetch_sports")

# Map league names to their IDs (API-specific)
LEAGUES = {
    "NFL": 1,
    "NBA": 12,
    "MLB": 1,   # May need to adjust if API uses a different ID
    "NHL": 57
}


# ----------------------------
# Helper functions
# ----------------------------
def get_available_seasons(league_name, league_id):
    """Fetch available seasons for a given league."""
    url = f"{BASE_URL}/leagues"
    log.info(f"Fetching available seasons for {league_name}...")
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    data = response.json()

    seasons = []
    for league_info in data.get("response", []):
        league = league_info.get("league", {})
        if league.get("id") == league_id:
            seasons = [s.get("year") for s in league_info.get("seasons", [])]
            break

    if not seasons:
        log.warning(f"No seasons found for {league_name}")
    else:
        log.info(f"Found seasons: {seasons}")
    return seasons


# In your existing script, replace the fetch_games_for_season function 
# with the following modified code:

def fetch_games_for_season(league_name, league_id, season):
    """Fetch all available details for games in a given league and season."""
    log.info(f"Fetching ALL details for {league_name} season {season}...")
    games = []

    # --- NOTE: BASE_URL must be updated for each league if API-Sports requires it ---
    url = f"{BASE_URL}/games"
    params = {"league": league_id, "season": season}
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    data = response.json()

    for item in data.get("response", []):
        game_info = item.get("game", {})
        teams = item.get("teams", {})
        scores = item.get("scores", {})
        venue = item.get("venue", {})
        odds_raw = item.get("odds", {}) # Original raw odds data
        status = game_info.get("status", {})
        
        # üîë START MODIFICATION: Search for O/U and Spread Lines
        over_under_line = None
        point_spread_line = None
        
        # The odds are usually nested inside a 'bookmakers' array.
        for bookmaker_data in odds_raw.get('bookmakers', []):
            for bet in bookmaker_data.get('bets', []):
                # 1. Search for Over/Under Line (usually named 'Total')
                if bet.get('name') == 'Total' and bet.get('values'):
                    # The value contains the actual O/U number (e.g., 42.5)
                    # We assume the first value in the array is the current line
                    over_under_line = bet['values'][0].get('value')
                    
                # 2. Search for Point Spread Line (usually named 'Spread')
                if bet.get('name') == 'Spread' and bet.get('values'):
                    # The value contains the actual Spread number (e.g., -3.5 or +7)
                    point_spread_line = bet['values'][0].get('value')

                # We break if both are found to save time
                if over_under_line is not None and point_spread_line is not None:
                    break
            
            # If found inside this bookmaker, we stop checking others.
            if over_under_line is not None and point_spread_line is not None:
                 break
        # üîë END MODIFICATION
        

        # Extract all details safely (Keep your original code structure)
        games.append({
            "league": league_name,
            "season": season,
            "game_id": game_info.get("id"),
            # ... (all other original fields)
            "home_winner": teams.get("home", {}).get("winner"),
            "away_score_q4": scores.get("away", {}).get("quarter_4"),
            "total_points": (
                (scores.get("home", {}).get("total") or 0)
                + (scores.get("away", {}).get("total") or 0)
            ),
            
            # üîë MODIFIED FIELDS: Use the deeply searched values
            "odds_home": odds_raw.get("home"),
            "odds_away": odds_raw.get("away"),
            "odds_draw": odds_raw.get("draw"),
            
            "over_under_line": over_under_line,       # <--- Now uses the result of deep search
            "point_spread_line": point_spread_line,   # <--- New field for spread
        })

    log.info(f"‚úÖ Found {len(games)} games for {league_name} season {season}")
    return games


def save_games_to_csv(games, league_name):
    """Save list of games to CSV."""
    if not games:
        log.warning(f"No games to save for {league_name}.")
        return

    filename = f"{league_name.lower()}_games.csv"
    keys = games[0].keys()
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(games)
    log.info(f"Saved {len(games)} games to {filename}")


# ----------------------------
# Main
# ----------------------------
def main():
    for league_name, league_id in LEAGUES.items():
        all_games = []
        seasons = get_available_seasons(league_name, league_id)
        for season in seasons:
            games = fetch_games_for_season(league_name, league_id, season)
            all_games.extend(games)
        # Pass the league_name to the save function
        save_games_to_csv(all_games, league_name)
    log.info("‚úÖ All sports data fetched and saved successfully!")



if __name__ == "__main__":
    main()


2025-11-11 20:01:59,602 | INFO     | Fetching available seasons for NFL...
2025-11-11 20:02:02,404 | INFO     | Found seasons: [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]
2025-11-11 20:02:02,407 | INFO     | Fetching ALL details for NFL season 2025...
2025-11-11 20:02:04,294 | INFO     | ‚úÖ Found 333 games for NFL season 2025
2025-11-11 20:02:04,297 | INFO     | Fetching ALL details for NFL season 2024...
2025-11-11 20:02:06,772 | INFO     | ‚úÖ Found 335 games for NFL season 2024
2025-11-11 20:02:06,776 | INFO     | Fetching ALL details for NFL season 2023...
2025-11-11 20:02:08,439 | INFO     | ‚úÖ Found 335 games for NFL season 2023
2025-11-11 20:02:08,441 | INFO     | Fetching ALL details for NFL season 2022...
2025-11-11 20:02:13,618 | INFO     | ‚úÖ Found 335 games for NFL season 2022
2025-11-11 20:02:13,621 | INFO     | Fetching ALL details for NFL season 2021...
2025-11-11 20:02:19,354 | INFO     | ‚úÖ Found 331 games for NF

KeyboardInterrupt: 

In [None]:
import sqlite3
import pandas as pd

# Paths
db_path = r"E:\Imran Projects\Sports-Project-main\Sports-Project-main\sports_forecast.db"
nfl_csv = r"E:\Imran Projects\Sports-Project-main\Sports-Project-main\nfl_games.csv"
mlb_csv = r"E:\Imran Projects\Sports-Project-main\Sports-Project-main\mlb_games.csv"

# Connect to database
conn = sqlite3.connect(db_path)

# Read CSVs
nfl_df = pd.read_csv(nfl_csv)
mlb_df = pd.read_csv(mlb_csv)

# Clean column names (optional but useful)
nfl_df.columns = nfl_df.columns.str.strip()
mlb_df.columns = mlb_df.columns.str.strip()

# Save data to new tables in the database
nfl_df.to_sql("nfl_games", conn, if_exists="replace", index=False)
mlb_df.to_sql("mlb_games", conn, if_exists="replace", index=False)

print("‚úÖ Successfully imported both CSV files into the database!")

# Check tables
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print("\nüìã Tables in the database:")
print(tables)

# Check number of rows for each
for table in ["nfl_games", "mlb_games"]:
    count = pd.read_sql_query(f"SELECT COUNT(*) as total FROM {table};", conn)
    print(f"\nüß© Rows in {table}: {count['total'][0]}")

# Preview first few rows from nfl_games
preview = pd.read_sql_query("SELECT * FROM nfl_games LIMIT 5;", conn)
print("\nüèà First 5 NFL Rows:")
print(preview)

# Close connection
conn.close()


‚úÖ Successfully imported both CSV files into the database!

üìã Tables in the database:
        name
0  game_data
1  nfl_games
2  mlb_games

üß© Rows in nfl_games: 5239

üß© Rows in mlb_games: 5239

üèà First 5 NFL Rows:
  league  season  game_id                                               date  \
0    NFL    2025    17279  {'timezone': 'UTC', 'date': '2025-08-01', 'tim...   
1    NFL    2025    17280  {'timezone': 'UTC', 'date': '2025-08-07', 'tim...   
2    NFL    2025    17281  {'timezone': 'UTC', 'date': '2025-08-07', 'tim...   
3    NFL    2025    19193  {'timezone': 'UTC', 'date': '2025-08-08', 'tim...   
4    NFL    2025    17282  {'timezone': 'UTC', 'date': '2025-08-08', 'tim...   

                   week  time timezone       stage referee status_short  ...  \
0  Hall of Fame Weekend  None     None  Pre Season    None           FT  ...   
1                Week 1  None     None  Pre Season    None           FT  ...   
2                Week 1  None     None  Pre Season   

In [None]:
import requests
from flask import Flask, jsonify
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime

API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
BASE_URL = "https://api-football-v1.p.rapidapi.com/v3/fixtures"
HEADERS = {
    "X-RapidAPI-Key": API_KEY,
    "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com"
}

# Supported leagues (example)
LEAGUES = {
    "NFL": 1,
    "NBA": 12,
    "NHL": 57
}

# In-memory store for live games
live_games = []

def fetch_live_games():
    global live_games
    today = datetime.utcnow().strftime("%Y-%m-%d")
    games = []

    for league_name, league_id in LEAGUES.items():
        # Basketball requires ?date= instead of live=all
        if league_name in ["NBA", "Liga A (Argentina)", "NBL (Australia)", "KBL (Korea)"]:
            params = {"league": league_id, "season": 2025, "date": today}
        else:
            params = {"league": league_id, "season": 2025, "live": "all"}

        try:
            response = requests.get(BASE_URL, headers=HEADERS, params=params)
            data = response.json()
            if "response" in data:
                for game in data["response"]:
                    games.append({
                        "league": league_name,
                        "home": game["teams"]["home"]["name"],
                        "away": game["teams"]["away"]["name"],
                        "score": game["score"]["fulltime"],
                        "status": game["fixture"]["status"]["short"],
                        "time": game["fixture"]["date"]
                    })
        except Exception as e:
            print(f"Error fetching {league_name}: {e}")

    live_games = games
    print(f"Updated live games at {datetime.utcnow()}")

# Start scheduler to fetch live games every 60 seconds
scheduler = BackgroundScheduler()
scheduler.add_job(fetch_live_games, "interval", seconds=60)
scheduler.start()

# Flask app to serve live games
app = Flask(__name__)

@app.route("/live-games")
def get_live_games():
    return jsonify(live_games)

if __name__ == "__main__":
    fetch_live_games()  # initial fetch
    app.run(host="0.0.0.0", port=5000)


Updated live games at 2025-11-03 05:03:23.826062
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.14.84.31:5000
Press CTRL+C to quit


Updated live games at 2025-11-03 05:04:25.803428


127.0.0.1 - - [03/Nov/2025 10:04:58] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [03/Nov/2025 10:04:58] "GET /favicon.ico HTTP/1.1" 404 -


Updated live games at 2025-11-03 05:05:22.432385
Updated live games at 2025-11-03 05:06:24.624621
Error fetching NHL: HTTPSConnectionPool(host='api-football-v1.p.rapidapi.com', port=443): Read timed out. (read timeout=None)
Updated live games at 2025-11-03 05:07:46.294860
Updated live games at 2025-11-03 05:08:26.027631
Updated live games at 2025-11-03 05:09:21.586786
Updated live games at 2025-11-03 05:10:25.304401
Error fetching NBA: HTTPSConnectionPool(host='api-football-v1.p.rapidapi.com', port=443): Max retries exceeded with url: /v3/fixtures?league=12&season=2025&date=2025-11-03 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000010F53FB9A80>: Failed to resolve 'api-football-v1.p.rapidapi.com' ([Errno 11001] getaddrinfo failed)"))
Updated live games at 2025-11-03 05:11:49.777327
Updated live games at 2025-11-03 05:12:26.998566
Updated live games at 2025-11-03 05:13:26.306995
Updated live games at 2025-11-03 05:14:23.108699


Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:16:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:17:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:18:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:19:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:20:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trigger: interval[0:01:00], next run at: 2025-11-03 10:21:19 PKT)" skipped: maximum number of running instances reached (1)
Execution of job "fetch_live_games (trig

In [5]:
import pandas as pd
from pathlib import Path

# Path to the NHL dataset
nhl_dataset_path = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET/NHL_winner_dataset.csv")

# Load the dataset
nhl_df = pd.read_csv(nhl_dataset_path)

# Check the number of rows and columns
print(f"‚úÖ NHL Dataset shape: {nhl_df.shape}")

# Show the first 5 rows to understand the structure
print("\nFirst 5 rows:")
print(nhl_df.head())

# Check for missing values
print("\nMissing values per column:")
print(nhl_df.isnull().sum())

# Quick summary statistics
print("\nDataset description:")
print(nhl_df.describe())


‚úÖ NHL Dataset shape: (4524, 30)

First 5 rows:
   game_id        date           home_team      away_team  home_total  \
0   324218  2022-09-25     Arizona Coyotes  Anaheim Ducks         2.0   
1   324242  2022-09-28     San Jose Sharks  Anaheim Ducks         5.0   
2   324277  2022-10-03   Los Angeles Kings  Anaheim Ducks         2.0   
3   324307  2022-10-08   Los Angeles Kings  Anaheim Ducks         6.0   
4   324329  2022-10-15  New York Islanders  Anaheim Ducks         7.0   

   away_total  total_points  over_under_line  beat_over  home_ppg_last5  ...  \
0         3.0           5.0         5.573989          0        4.000000  ...   
1         4.0           9.0         5.695814          1        3.000000  ...   
2         1.0           3.0         6.153145          0        1.000000  ...   
3         3.0           9.0         5.735789          1        2.333333  ...   
4         1.0           8.0         5.547567          1        2.750000  ...   

   api_odds  venue_advantage  p

In [27]:
# prediction_nhl_robust.py

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ==============================
# 1. Load dataset
# ==============================
nhl_dataset_path = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET/NHL_winner_dataset.csv")
df = pd.read_csv(nhl_dataset_path)

# ==============================
# 2. Select features and target
# ==============================
features = [
    'away_total_norm', 'home_total_norm', 'over_signal', 'api_odds',
    'team_efficiency', 'rest_diff', 'rest_away', 'home_ppg_last5',
    'home_last5_avg', 'player_injuries', 'away_ppg_last5', 'home_momentum',
    'away_momentum', 'home_away_diff', 'venue_advantage', 'weather_factor'
]

# Use the actual outcome column from your dataset
target = 'home_team_won'  # 1 if home team wins, 0 if loses

# Ensure the features exist in the dataset
features = [f for f in features if f in df.columns]

# ==============================
# 3. Handle missing values
# ==============================
X = df[features].copy()
y = df[target]

X.fillna(0, inplace=True)  # safer than inplace on slice

# ==============================
# 4. Split dataset
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==============================
# 5. Scale numeric features
# ==============================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==============================
# 6. Train robust Random Forest
# ==============================
model = RandomForestClassifier(
    n_estimators=1000,      # more trees for stability
    max_depth=None,
    min_samples_split=5,    # prevent overfitting
    min_samples_leaf=2,     # prevent overfitting
    random_state=42,
    n_jobs=-1               # use all cores
)

model.fit(X_train_scaled, y_train)

# ==============================
# 7. Evaluate the model
# ==============================
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy*100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

# ==============================
# 8. Feature importance
# ==============================
importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
print("Feature Importance:\n", importances)

# ==============================
# 9. Save model and scaler
# ==============================
joblib.dump(model, 'nhl_model_robust.pkl')
joblib.dump(scaler, 'nhl_scaler_robust.pkl')

# Save features along with model and scaler
joblib.dump(features, 'nhl_features.pkl')
print("Features list saved successfully!")


Model Accuracy: 99.89%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       411
           1       1.00      1.00      1.00       494

    accuracy                           1.00       905
   macro avg       1.00      1.00      1.00       905
weighted avg       1.00      1.00      1.00       905

Feature Importance:
 away_total_norm    0.432438
home_total_norm    0.418712
over_signal        0.104006
api_odds           0.008743
team_efficiency    0.007627
rest_diff          0.005693
rest_away          0.004206
home_last5_avg     0.003882
player_injuries    0.003828
home_away_diff     0.003793
home_ppg_last5     0.003711
home_momentum      0.003360
away_ppg_last5     0.000000
away_momentum      0.000000
venue_advantage    0.000000
weather_factor     0.000000
dtype: float64
Features list saved successfully!


In [29]:
import pandas as pd
import joblib

# Load model, scaler, and features
model = joblib.load("nhl_model_robust.pkl")
scaler = joblib.load("nhl_scaler_robust.pkl")

# Features used in training
features = [
    'away_total_norm', 'home_total_norm', 'over_signal', 'api_odds',
    'team_efficiency', 'rest_diff', 'rest_away', 'home_ppg_last5',
    'home_last5_avg', 'player_injuries', 'away_ppg_last5', 'home_momentum',
    'away_momentum', 'home_away_diff', 'venue_advantage', 'weather_factor'
]

# 10 different game scenarios
scenarios = [
    {'away_total_norm': 2.5, 'home_total_norm': 3.0, 'over_signal':1, 'api_odds':1.9, 'team_efficiency':0.95, 'rest_diff':1, 'rest_away':2, 'home_ppg_last5':3.2, 'home_last5_avg':2.9, 'player_injuries':0, 'home_away_diff':0.5, 'home_momentum':0.8, 'away_ppg_last5':2.8, 'away_momentum':0, 'venue_advantage':0, 'weather_factor':0},
    {'away_total_norm': 3.0, 'home_total_norm': 2.7, 'over_signal':0, 'api_odds':2.1, 'team_efficiency':0.9, 'rest_diff':0, 'rest_away':1, 'home_ppg_last5':2.5, 'home_last5_avg':2.6, 'player_injuries':1, 'home_away_diff':-0.3, 'home_momentum':0.6, 'away_ppg_last5':3.1, 'away_momentum':0.7, 'venue_advantage':0, 'weather_factor':0},
    {'away_total_norm': 2.8, 'home_total_norm': 3.5, 'over_signal':1, 'api_odds':1.8, 'team_efficiency':0.97, 'rest_diff':2, 'rest_away':0, 'home_ppg_last5':3.8, 'home_last5_avg':3.6, 'player_injuries':0, 'home_away_diff':0.7, 'home_momentum':1.0, 'away_ppg_last5':2.9, 'away_momentum':0.2, 'venue_advantage':1, 'weather_factor':0},
    {'away_total_norm': 2.0, 'home_total_norm': 2.2, 'over_signal':0, 'api_odds':2.5, 'team_efficiency':0.88, 'rest_diff':-1, 'rest_away':2, 'home_ppg_last5':2.1, 'home_last5_avg':2.0, 'player_injuries':2, 'home_away_diff':-0.1, 'home_momentum':0.4, 'away_ppg_last5':2.5, 'away_momentum':0.3, 'venue_advantage':0, 'weather_factor':0},
    {'away_total_norm': 3.2, 'home_total_norm': 3.0, 'over_signal':1, 'api_odds':1.7, 'team_efficiency':0.93, 'rest_diff':0, 'rest_away':1, 'home_ppg_last5':3.0, 'home_last5_avg':2.8, 'player_injuries':0, 'home_away_diff':0.2, 'home_momentum':0.9, 'away_ppg_last5':3.1, 'away_momentum':0.4, 'venue_advantage':1, 'weather_factor':0},
    {'away_total_norm': 2.1, 'home_total_norm': 2.5, 'over_signal':0, 'api_odds':2.2, 'team_efficiency':0.91, 'rest_diff':1, 'rest_away':0, 'home_ppg_last5':2.6, 'home_last5_avg':2.5, 'player_injuries':1, 'home_away_diff':0.3, 'home_momentum':0.5, 'away_ppg_last5':2.4, 'away_momentum':0.6, 'venue_advantage':0, 'weather_factor':0},
    {'away_total_norm': 2.9, 'home_total_norm': 3.1, 'over_signal':1, 'api_odds':1.9, 'team_efficiency':0.96, 'rest_diff':0, 'rest_away':2, 'home_ppg_last5':3.4, 'home_last5_avg':3.2, 'player_injuries':0, 'home_away_diff':0.6, 'home_momentum':0.7, 'away_ppg_last5':2.9, 'away_momentum':0.2, 'venue_advantage':1, 'weather_factor':0},
    {'away_total_norm': 2.6, 'home_total_norm': 2.8, 'over_signal':0, 'api_odds':2.0, 'team_efficiency':0.92, 'rest_diff':-1, 'rest_away':1, 'home_ppg_last5':2.9, 'home_last5_avg':2.7, 'player_injuries':1, 'home_away_diff':0.1, 'home_momentum':0.6, 'away_ppg_last5':2.7, 'away_momentum':0.3, 'venue_advantage':0, 'weather_factor':0},
    {'away_total_norm': 3.3, 'home_total_norm': 3.6, 'over_signal':1, 'api_odds':1.6, 'team_efficiency':0.98, 'rest_diff':1, 'rest_away':0, 'home_ppg_last5':3.9, 'home_last5_avg':3.7, 'player_injuries':0, 'home_away_diff':0.8, 'home_momentum':1.0, 'away_ppg_last5':3.0, 'away_momentum':0, 'venue_advantage':1, 'weather_factor':0},
    {'away_total_norm': 2.4, 'home_total_norm': 2.7, 'over_signal':0, 'api_odds':2.3, 'team_efficiency':0.89, 'rest_diff':0, 'rest_away':1, 'home_ppg_last5':2.4, 'home_last5_avg':2.5, 'player_injuries':2, 'home_away_diff':-0.2, 'home_momentum':0.4, 'away_ppg_last5':2.6, 'away_momentum':0.5, 'venue_advantage':0, 'weather_factor':0}
]

# Convert to DataFrame
new_games = pd.DataFrame(scenarios)

# Ensure all features exist in the correct order
for col in features:
    if col not in new_games.columns:
        new_games[col] = 0
new_games = new_games[features]

# Scale features
new_games_scaled = scaler.transform(new_games)

# Predict
predictions = model.predict(new_games_scaled)

# Show results
for i, pred in enumerate(predictions):
    outcome = "Home Win" if pred == 1 else "Away Win"
    print(f"Game Scenario {i+1}: {outcome}")


Game Scenario 1: Home Win
Game Scenario 2: Home Win
Game Scenario 3: Home Win
Game Scenario 4: Home Win
Game Scenario 5: Home Win
Game Scenario 6: Home Win
Game Scenario 7: Home Win
Game Scenario 8: Home Win
Game Scenario 9: Home Win
Game Scenario 10: Home Win


In [30]:
print(df['home_team_won'].value_counts())


home_team_won
1    2471
0    2053
Name: count, dtype: int64


In [None]:
#!/usr/bin/env python3
import os
import ast
import pandas as pd
from src.data_storage import data_storage_service
import logging
from pathlib import Path
from datetime import datetime
import json

# ----------------------------
# Logging setup
# ----------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-8s | %(message)s")
log = logging.getLogger("store_csv_to_db_corrected")

# ----------------------------
# Supported Leagues
# ----------------------------
LEAGUES = ["nfl", "nba", "mlb", "nhl"]

# ----------------------------
# Columns mapping for DB
# ----------------------------
DB_COLUMNS = [
    "game_id", "sport", "date", "home_team", "away_team",
    "home_score", "away_score", "status", "over_under",
    "point_spread", "raw_data", "match_time", "api_last_fetched"
]

# ----------------------------
# Helper: Safe type conversion
# ----------------------------
def to_float(val):
    try:
        return float(val)
    except (TypeError, ValueError):
        return None

# ----------------------------
# Helper: Robust date parsing
# ----------------------------
def extract_date(value):
    if pd.isna(value):
        return None

    # If dict
    if isinstance(value, dict):
        value = value.get("date")

    # If stringified dict
    elif isinstance(value, str):
        try:
            if value.startswith("{") and "date" in value:
                parsed = ast.literal_eval(value)
                value = parsed.get("date")
        except Exception:
            pass

    # Parse timestamp or date string
    try:
        date_parsed = pd.to_datetime(value, errors="coerce")
        if pd.notnull(date_parsed):
            return date_parsed.strftime("%Y-%m-%d")
    except Exception:
        pass

    return None

# ----------------------------
# Normalize row for DB
# ----------------------------
def normalize_row(row, league_name):
    record = {}

    # Required DB columns
    record["game_id"] = row.get("game_id") or f"{league_name}_{row.name}"
    record["sport"] = row.get("league") or league_name.upper()
    record["date"] = extract_date(row.get("date"))
    record["home_team"] = row.get("home_team_name") or row.get("home_team")
    record["away_team"] = row.get("away_team_name") or row.get("away_team")
    record["home_score"] = to_float(row.get("home_score_total") or row.get("home_score"))
    record["away_score"] = to_float(row.get("away_score_total") or row.get("away_score"))
    record["status"] = row.get("status_long") or row.get("status")
    record["over_under"] = to_float(row.get("over_under_line") or row.get("over_under"))
    record["point_spread"] = to_float(row.get("point_spread") or row.get("point_spread_line"))
    record["match_time"] = row.get("time") or "00:00:00"
    record["api_last_fetched"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Save everything else in raw_data for future reference
    record["raw_data"] = json.dumps(row.to_dict())

    return record

# ----------------------------
# Store function
# ----------------------------
def store_csv_to_db(csv_file, league_name):
    csv_path = Path(csv_file)
    if not csv_path.exists():
        log.warning(f"‚ö†Ô∏è File not found: {csv_file} ‚Äî skipping {league_name.upper()}")
        return

    log.info(f"Processing CSV: {csv_file}")
    df = pd.read_csv(csv_file)

    # Normalize all rows
    games = [normalize_row(row, league_name) for _, row in df.iterrows()]

    # Log validation info
    valid_dates = sum(1 for g in games if g["date"])
    invalid_dates = len(games) - valid_dates
    log.info(f"üìÖ {valid_dates} valid dates | ‚ö†Ô∏è {invalid_dates} invalid dates in {league_name.upper()}")

    # Save to DB
    data_storage_service.save_games(games)
    log.info(f"‚úÖ Stored {len(games)} games for {league_name.upper()} into DB.")

# ----------------------------
# Main runner
# ----------------------------
def main():
    for league in LEAGUES:
        csv_file = f"{league}_games.csv"
        store_csv_to_db(csv_file, league)
    log.info("üéØ All CSV files processed and saved to DB successfully!")

if __name__ == "__main__":
    main()


INFO:store_csv_to_db_corrected:Processing CSV: nfl_games.csv
INFO:store_csv_to_db_corrected:üìÖ 0 valid dates | ‚ö†Ô∏è 5239 invalid dates in NFL


OperationalError: table game_data has 13 columns but 11 values were supplied

In [60]:
from src.data_storage import data_storage_service

for sport in ["NFL", "NBA", "MLB", "NHL"]:
    games = data_storage_service.fetch_historical_games(sport)
    print(f"{sport}: {len(games)} games")


INFO:data_storage:Fetched 5239 historical games for NFL from database.
INFO:data_storage:Fetched 0 historical games for NBA from database.
INFO:data_storage:Fetched 10478 historical games for MLB from database.
INFO:data_storage:Fetched 0 historical games for NHL from database.


NFL: 5239 games
NBA: 0 games
MLB: 10478 games
NHL: 0 games


In [64]:
#!/usr/bin/env python3
import sqlite3
import logging
import os
from datetime import datetime

# --- Configuration ---
DB_FILE_NAME = "sports_forecast.db"
logger = logging.getLogger("db_migration")
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')

def get_db_connection():
    """Establishes connection to the SQLite database."""
    db_path = os.path.join(os.getcwd(), DB_FILE_NAME)
    logger.info(f"Attempting to connect to database at: {db_path}")
    try:
        conn = sqlite3.connect(db_path)
        return conn
    except sqlite3.Error as e:
        logger.error(f"Failed to connect to database: {e}")
        return None

def get_existing_columns(conn: sqlite3.Connection, table_name: str) -> list:
    """Retrieves a list of all column names in a given table."""
    cursor = conn.cursor()
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [row[1] for row in cursor.fetchall()]
    return columns

def run_migration(conn: sqlite3.Connection):
    """Executes necessary schema changes (migrations)."""
    if not conn:
        return

    cursor = conn.cursor()
    table_name = "game_data"
    
    logger.info(f"Starting migration for table: {table_name}")
    existing_columns = get_existing_columns(conn, table_name)
    logger.info(f"Current columns: {existing_columns}")

    # --- MIGRATION STEP 1: ADDING A NEW COLUMN (EXAMPLE) ---
    NEW_COLUMN = "match_time"
    if NEW_COLUMN not in existing_columns:
        try:
            # Note: You must provide a default value if the column is NOT NULL
            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {NEW_COLUMN} TEXT DEFAULT '00:00:00'")
            conn.commit()
            logger.info(f"‚úÖ Successfully added column '{NEW_COLUMN}' to {table_name}.")
        except sqlite3.Error as e:
            logger.error(f"‚ùå Failed to add column '{NEW_COLUMN}': {e}")
    else:
        logger.info(f"‚è≠Ô∏è Column '{NEW_COLUMN}' already exists. Skipping.")
        
    # --- MIGRATION STEP 2: ADDING A SECOND NEW COLUMN (EXAMPLE) ---
    NEW_COLUMN_2 = "api_last_fetched"
    if NEW_COLUMN_2 not in existing_columns:
        try:
            current_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {NEW_COLUMN_2} TEXT DEFAULT '{current_timestamp}'")
            conn.commit()
            logger.info(f"‚úÖ Successfully added column '{NEW_COLUMN_2}' to {table_name}.")
        except sqlite3.Error as e:
            logger.error(f"‚ùå Failed to add column '{NEW_COLUMN_2}': {e}")
    else:
        logger.info(f"‚è≠Ô∏è Column '{NEW_COLUMN_2}' already exists. Skipping.")

    logger.info("üéâ **Migration process finished!**")

def main():
    conn = get_db_connection()
    if conn:
        run_migration(conn)
        conn.close()

if __name__ == "__main__":
    main()

INFO:db_migration:Attempting to connect to database at: e:\Imran Projects\Sports-Project-main\sports_forecast.db
INFO:db_migration:Starting migration for table: game_data
INFO:db_migration:Current columns: ['game_id', 'sport', 'date', 'home_team', 'away_team', 'home_score', 'away_score', 'status', 'over_under', 'point_spread', 'raw_data']
INFO:db_migration:‚úÖ Successfully added column 'match_time' to game_data.
INFO:db_migration:‚úÖ Successfully added column 'api_last_fetched' to game_data.
INFO:db_migration:üéâ **Migration process finished!**


In [1]:
import requests

url = "https://v1.american-football.api-sports.io/status"
headers = {"x-apisports-key": "9d1dbc393fa470ff6f25a0bf1fe1647e"}

response = requests.get(url, headers=headers)
print(response.json())


{'get': 'status', 'parameters': [], 'errors': [], 'results': 0, 'paging': {'current': 1, 'total': 1}, 'response': {'account': {'firstname': 'David', 'lastname': 'Na', 'email': 'netcastillo2@hotmail.com'}, 'subscription': {'plan': 'Pro', 'end': '2025-11-25T21:57:40+00:00', 'active': True}, 'requests': {'current': 3, 'limit_day': 7500}}}


In [4]:
import http.client

conn = http.client.HTTPSConnection("v3.football.api-sports.io")

headers = {
    'x-rapidapi-host': "v3.football.api-sports.io",
    'x-rapidapi-key': "9d1dbc393fa470ff6f25a0bf1fe1647e"
    }

conn.request("GET", "/leagues", headers=headers)

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))s
      

SyntaxError: invalid syntax (632667964.py, line 15)

In [3]:
import requests
from datetime import date

# ===============================
# ‚öôÔ∏è Configuration
# ===============================
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
BASE_URL = "https://v1.american-football.api-sports.io"

today = date.today().strftime("%Y-%m-%d")

headers = {
    "x-apisports-key": API_KEY
}

# ===============================
# üß≠ Fetch Today's Games
# ===============================
response = requests.get(f"{BASE_URL}/games?date={today}", headers=headers)
data = response.json()

# ===============================
# üß© Safe Status Extractor
# ===============================
def get_status(game):
    """Try to extract the game status from various possible API structures."""
    # Check multiple possible paths
    if "status" in game and isinstance(game["status"], dict):
        return game["status"].get("short") or game["status"].get("long") or "Unknown"
    if "game" in game and "status" in game["game"]:
        status = game["game"]["status"]
        if isinstance(status, dict):
            return status.get("short") or status.get("long") or "Unknown"
        return status
    if "fixture" in game and "status" in game["fixture"]:
        status = game["fixture"]["status"]
        if isinstance(status, dict):
            return status.get("short") or status.get("long") or "Unknown"
        return status
    # fallback: sometimes status may be a string directly
    return game.get("status", "Unknown")

# ===============================
# üßÆ Process & Categorize Games
# ===============================
if "response" not in data:
    print("‚ö†Ô∏è Unexpected response format:", data)
else:
    games = data.get("response", [])
    if not games:
        print(f"üì≠ No NFL games found for {today}")
    else:
        live_status_codes = ("1Q", "2Q", "3Q", "4Q", "OT", "BT", "LIVE", "Q1", "Q2", "Q3", "Q4")
        scheduled_status_codes = ("NS", "TBD", "Scheduled")
        finished_status_codes = ("FT", "AOT", "CANC", "POST", "Finished")

        def classify_game(game):
            status = get_status(game)
            if status in live_status_codes:
                return "live"
            elif status in scheduled_status_codes:
                return "scheduled"
            elif status in finished_status_codes:
                return "finished"
            return "unknown"

        live_games = []
        scheduled_games = []
        finished_games = []
        unknown_games = []

        for g in games:
            cat = classify_game(g)
            if cat == "live":
                live_games.append(g)
            elif cat == "scheduled":
                scheduled_games.append(g)
            elif cat == "finished":
                finished_games.append(g)
            else:
                unknown_games.append(g)

        # ===============================
        # üèÅ Print Summary
        # ===============================
        print(f"üìÖ NFL Games on {today}")
        print(f"üèà Live games: {len(live_games)}")
        print(f"üïí Scheduled games: {len(scheduled_games)}")
        print(f"‚úÖ Finished games: {len(finished_games)}")
        print(f"‚ùì Unknown status games: {len(unknown_games)}")
        print("-" * 60)

        for g in games:
            home = g.get("teams", {}).get("home", {}).get("name", "Unknown")
            away = g.get("teams", {}).get("away", {}).get("name", "Unknown")
            status = get_status(g)
            print(f"{home} vs {away} ‚Äî Status: {status}")


üìÖ NFL Games on 2025-11-11
üèà Live games: 0
üïí Scheduled games: 0
‚úÖ Finished games: 1
‚ùì Unknown status games: 0
------------------------------------------------------------
Green Bay Packers vs Philadelphia Eagles ‚Äî Status: FT


In [1]:
import requests

API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
BASE_URL = "https://v1.american-football.api-sports.io"
headers = {"x-apisports-key": API_KEY}

# Get all odds
res = requests.get(f"{BASE_URL}/odds?league=1&season=2025", headers=headers)
odds_data = res.json().get("response", [])

for g in odds_data:
    print(g.get("game", {}).get("id"), g.get("bookmakers", []))


In [2]:
import torch
import os
from transformers import pipeline

# --- Configuration ---
# The model ID for the fast, optimized Whisper v3 Turbo model.
MODEL_ID = "openai/whisper-large-v3-turbo"
# Change this path to your actual audio file (e.g., 'recording.mp3' or 'speech.wav')
# NOTE: The model supports various formats: FLAC, MP3, M4A, WAV, etc.
AUDIO_FILE_PATH = "my_uploaded_audio.mp3"

def transcribe_audio_with_turbo():
    """
    Loads the Whisper Large v3 Turbo model using the Hugging Face pipeline
    and transcribes the specified audio file.
    """
    
    # 1. Check for the audio file
    if not os.path.exists(AUDIO_FILE_PATH):
        print(f"Error: Audio file not found at '{AUDIO_FILE_PATH}'")
        print("Please replace 'my_uploaded_audio.mp3' with the path to your actual audio file.")
        # Optionally create a dummy file instruction or exit
        # For simplicity, we'll exit here and let the user replace the file.
        return

    print("--- Starting Transcription Process ---")
    print(f"Using model: {MODEL_ID}")
    print(f"Processing file: {AUDIO_FILE_PATH}")

    # 2. Determine the device (GPU for speed, fallback to CPU)
    # Using float16 (half-precision) significantly speeds up inference on GPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # 3. Initialize the ASR pipeline
    try:
        pipe = pipeline(
            "automatic-speech-recognition",
            model=MODEL_ID,
            torch_dtype=torch_dtype,
            device=device,
            # Force the use of the optimized model's configuration
            model_kwargs={"low_cpu_mem_usage": True}
        )
        print(f"Model successfully loaded on device: {device.upper()}")
        
    except Exception as e:
        print(f"An error occurred during model loading: {e}")
        print("Ensure you have all dependencies (torch, transformers, accelerate) correctly installed.")
        return

    # 4. Perform the transcription
    try:
        # For short audios, a standard call is sufficient. 
        # The Turbo model is fast enough for low-latency scenarios.
        result = pipe(AUDIO_FILE_PATH)
        
        transcribed_text = result["text"]
        
        print("\n" + "="*50)
        print("‚úÖ TRANSCRIPTION COMPLETE")
        print("="*50)
        print(transcribed_text)
        print("="*50)

    except Exception as e:
        print(f"An error occurred during transcription: {e}")


if __name__ == "__main__":
    transcribe_audio_with_turbo()

ModuleNotFoundError: Could not import module 'pipeline'. Are this object's requirements defined correctly?

In [4]:
# ==========================================
# üèí NHL Game Outcome Prediction (Final Version)
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

# ==========================================
# 1Ô∏è‚É£ Load Dataset
# ==========================================
nhl_dataset_path = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET/NHL_winner_dataset.csv")
df = pd.read_csv(nhl_dataset_path)
print("‚úÖ Dataset loaded successfully!")
print(f"Total Rows: {len(df)}\n")

# ==========================================

# 2Ô∏è‚É£ Feature Selection
# ==========================================
features = [
    'home_total_norm', 'away_total_norm', 'over_signal', 'api_odds',
    'team_efficiency', 'rest_diff', 'rest_away',
    'home_ppg_last5', 'home_away_diff', 'home_last5_avg'
]

target = 'home_team_won'

X = df[features]
y = df[target]

print(f"Feature Count: {len(features)}")
print(f"Class Distribution:\n {y.value_counts().rename('count')}\n")

# ==========================================
# 3Ô∏è‚É£ Data Splitting
# ==========================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 4Ô∏è‚É£ Feature Scaling
# ==========================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==========================================
# 5Ô∏è‚É£ Model Training
# ==========================================
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1
)
model.fit(X_train_scaled, y_train)

# ==========================================
# 6Ô∏è‚É£ Model Evaluation
# ==========================================
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"‚úÖ Model Performance:")
print(f"Accuracy: {accuracy*100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==========================================
# 7Ô∏è‚É£ Feature Importance
# ==========================================
feature_importance = pd.Series(model.feature_importances_, index=features)
feature_importance = feature_importance.sort_values(ascending=False)
print("\nTop Features:\n", feature_importance.head(10))

# ==========================================
# 8Ô∏è‚É£ Save Model
# ==========================================
joblib.dump(model, "nhl_prediction_model.pkl")
joblib.dump(scaler, "nhl_scaler.pkl")
print("\n‚úÖ Final NHL model saved successfully!")

# ==========================================
# 9Ô∏è‚É£ Simulate 10 Random Game Scenarios
# ==========================================
print("\nüéØ Testing 10 Random Game Scenarios:\n")

random_games = X_test.sample(10, random_state=42)
predictions = model.predict(scaler.transform(random_games))

for i, pred in enumerate(predictions, 1):
    outcome = "üè† Home Win" if pred == 1 else "üö® Away Win"
    print(f"Game Scenario {i}: {outcome}")


‚úÖ Dataset loaded successfully!
Total Rows: 4524

Feature Count: 10
Class Distribution:
 home_team_won
1    2471
0    2053
Name: count, dtype: int64

‚úÖ Model Performance:
Accuracy: 99.89%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       411
           1       1.00      1.00      1.00       494

    accuracy                           1.00       905
   macro avg       1.00      1.00      1.00       905
weighted avg       1.00      1.00      1.00       905

Confusion Matrix:
 [[410   1]
 [  0 494]]

Top Features:
 away_total_norm    0.446406
home_total_norm    0.426455
over_signal        0.100489
api_odds           0.006681
team_efficiency    0.005559
rest_diff          0.004068
rest_away          0.002948
home_ppg_last5     0.002568
home_last5_avg     0.002465
home_away_diff     0.002362
dtype: float64

‚úÖ Final NHL model saved successfully!

üéØ Testing 10 Random Game Scenarios:

Game Scenario 1: üè† H

In [1]:
# --------------------------------------------------------------
# inspect_nhl_dataset.py
# --------------------------------------------------------------
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------
# 1. PATHS
# ------------------------------------------------------------------
DATASET_DIR = Path("C:/Users/mimra/Downloads/FINAL_100_100_DATASET/WINNER_DATASET")
NHL_FILE    = DATASET_DIR / "NHL_winner_dataset.csv"

# ------------------------------------------------------------------
# 2. LOAD DATASET
# ------------------------------------------------------------------
if not NHL_FILE.exists():
    raise FileNotFoundError(f"Could not find {NHL_FILE}")

df = pd.read_csv(NHL_FILE)
print("\n" + "="*60)
print("NHL DATASET LOADED".center(60))
print("="*60 + "\n")

# ------------------------------------------------------------------
# 3. BASIC SHAPE & INFO
# ------------------------------------------------------------------
print(f"Total rows   (games) : {df.shape[0]:,}")
print(f"Total columns         : {df.shape[1]:,}\n")

print("Column names & dtypes".center(60, "-"))
print(df.dtypes.to_string())
print("-"*60)

# ------------------------------------------------------------------
# 4. MISSING VALUES
# ------------------------------------------------------------------
missing = df.isnull().sum()
if missing.any():
    print("\nMissing values per column".center(60, "-"))
    print(missing[missing > 0].to_string())
else:
    print("\nNo missing values in any column.")

# ------------------------------------------------------------------
# 5. TARGET DISTRIBUTION (home_team_won)
# ------------------------------------------------------------------
target = "home_team_won"
if target in df.columns:
    print(f"\nTarget column: '{target}'".center(60, "-"))
    vc = df[target].value_counts().sort_index()
    pc = df[target].value_counts(normalize=True).sort_index().round(4)
    print(f"0 (away win) : {vc.get(0,0):,}  ({pc.get(0,0):.2%})")
    print(f"1 (home win) : {vc.get(1,0):,}  ({pc.get(1,0):.2%})")
else:
    print(f"\nTarget column '{target}' NOT FOUND!")

# ------------------------------------------------------------------
# 6. EXCLUDED COLUMNS (as in your training script)
# ------------------------------------------------------------------
exclude_cols = [
    'game_id', 'date', 'home_team', 'away_team',
    'home_total', 'away_total', 'total_points',
    'over_under_line', 'beat_over', 'home_team_won'
]

feature_cols = [c for c in df.columns if c not in exclude_cols]
print(f"\nFeatures used for modelling ({len(feature_cols)} columns)".center(60, "-"))
print(", ".join(feature_cols))

# ------------------------------------------------------------------
# 7. SAMPLE ROWS (first 5 games)
# ------------------------------------------------------------------
sample_cols = ['date', 'home_team', 'away_team', target] + feature_cols[:5]
print(f"\nSample of first 5 games (showing {len(sample_cols)} columns)".center(60, "-"))
print(df[sample_cols].head(5).to_string(index=False))

# ------------------------------------------------------------------
# 8. BASIC STATISTICS FOR NUMERIC FEATURES
# ------------------------------------------------------------------
numeric_feats = df[feature_cols].select_dtypes(include="number")
if not numeric_feats.empty:
    print(f"\nNumeric feature statistics".center(60, "-"))
    print(numeric_feats.describe().round(3).to_string())
else:
    print("\nNo numeric features detected among the modelling columns.")

# ------------------------------------------------------------------
# 9. SAVE A QUICK CSV SNAPSHOT (optional)
# ------------------------------------------------------------------
snapshot_path = Path("nhl_dataset_snapshot.csv")
df.head(100).to_csv(snapshot_path, index=False)
print(f"\nSnapshot of first 100 rows saved to: {snapshot_path.resolve()}")
print("\n" + "="*60)
print("INSPECTION COMPLETE".center(60))
print("="*60)


                     NHL DATASET LOADED                     

Total rows   (games) : 4,524
Total columns         : 30

-------------------Column names & dtypes--------------------
game_id              int64
date                object
home_team           object
away_team           object
home_total         float64
away_total         float64
total_points       float64
over_under_line    float64
beat_over            int64
home_ppg_last5     float64
rest_home          float64
month                int64
is_weekend           int64
home_last5_avg     float64
away_last5_avg       int64
home_away_diff     float64
rest_diff          float64
over_signal        float64
home_total_norm    float64
away_total_norm    float64
api_odds           float64
venue_advantage      int64
player_injuries      int64
team_efficiency    float64
weather_factor     float64
away_ppg_last5       int64
home_momentum      float64
rest_away          float64
away_momentum      float64
home_team_won        int64
---------

In [7]:
import requests
import pandas as pd
from tqdm import tqdm

API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e"
LEAGUE_ID = 57
SEASONS = list(range(2015, 2026))  # Seasons from 2015 to 2025
BASE_URL = "https://v1.hockey.api-sports.io/games"

headers = {"x-apisports-key": API_KEY}
all_games = []

for season in SEASONS:
    print(f"Fetching games for season {season}...")
    params = {"league": LEAGUE_ID, "season": season}
    response = requests.get(BASE_URL, headers=headers, params=params)
    
    if response.status_code != 200:
        print(f"Error fetching season {season}: {response.status_code}")
        continue

    data = response.json()
    
    for game in data.get("response", []):
        if game["status"]["short"] == "FT":  # Only finished games
            all_games.append({
                "season": season,
                "game_id": game["id"],
                "date": game["date"],
                "home_team": game["teams"]["home"]["name"],
                "away_team": game["teams"]["away"]["name"],
                "home_score": game["scores"]["home"],
                "away_score": game["scores"]["away"]
            })
    
    print(f"Season {season}: {len(all_games)} finished games collected so far.")

# Convert to DataFrame
df = pd.DataFrame(all_games)
print(f"\nTotal finished games collected: {len(df)}")
print(df.head())

# Optionally save to CSV
df.to_csv("nhl_finished_games.csv", index=False)


Fetching games for season 2015...
Season 2015: 1109 finished games collected so far.
Fetching games for season 2016...
Season 2016: 2190 finished games collected so far.
Fetching games for season 2017...
Season 2017: 3328 finished games collected so far.
Fetching games for season 2018...
Season 2018: 4491 finished games collected so far.
Fetching games for season 2019...
Season 2019: 5517 finished games collected so far.
Fetching games for season 2020...
Season 2020: 6247 finished games collected so far.
Fetching games for season 2021...
Season 2021: 7428 finished games collected so far.
Fetching games for season 2022...
Season 2022: 8595 finished games collected so far.
Fetching games for season 2023...
Season 2023: 9795 finished games collected so far.
Fetching games for season 2024...
Season 2024: 10989 finished games collected so far.
Fetching games for season 2025...
Season 2025: 11263 finished games collected so far.

Total finished games collected: 11263
   season  game_id      

In [17]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import random
from concurrent.futures import ThreadPoolExecutor 
import itertools 

# --- CONFIGURATION ---
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e" # ‚ö†Ô∏è IMPORTANT: Replace with your actual API key
LEAGUE_ID = 57 # NHL
SEASONS = list(range(2015, 2026)) # Seasons from 2015 to 2025
BASE_URL = "https://v1.hockey.api-sports.io"

HEADERS = {"x-apisports-key": API_KEY}
MAX_WORKERS = 10 # Number of concurrent threads for statistics fetching

all_games_data = []
all_stats_data = []
total_game_count = 0

# --- HELPER FUNCTIONS ---

def fetch_game_data_and_stats(game):
    """
    Fetches game details and detailed statistics for a single game ID.
    This function is run concurrently by the executor.
    """
    season = game['season'] 
    game_id = game["id"]
    
    # Base Game Data
    game_data = {
        "season": season,
        "game_id": game_id,
        "date": game.get("date"),
        "home_team_name": game.get("teams", {}).get("home", {}).get("name"),
        "away_team_name": game.get("teams", {}).get("away", {}).get("name"),
        "home_score_final": game.get("scores", {}).get("home"),
        "away_score_final": game.get("scores", {}).get("away")
    }

    # Detailed Statistics API Call
    stats_url = f"{BASE_URL}/games/statistics"
    params = {"id": game_id}
    stats_list = []
    
    try:
        response = requests.get(stats_url, headers=HEADERS, params=params)
        response.raise_for_status() 
        data = response.json()
        
        # Flatten the statistics response
        for team_stat in data.get("response", []):
            team_info = team_stat.get("team", {})
            team_id = team_info.get("id")
            team_name = team_info.get("name")
            
            for stat in team_stat.get("statistics", []):
                stats_list.append({
                    "game_id": game_id,
                    "team_id": team_id,
                    "team_name": team_name,
                    "stat_type": stat.get("type"),
                    "stat_value": stat.get("value")
                })
                
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è Error fetching stats for game {game_id}: {e}")
        
    return game_data, stats_list

# --- MAIN SCRAPER LOGIC ---

print("Starting concurrent data collection...")
print("-" * 30)

for season in tqdm(SEASONS, desc="Overall Progress"):
    print(f"\nFetching games for season {season}...")
    
    # 1. Fetch all games for the season (Sequential call - usually fast)
    games_url = f"{BASE_URL}/games"
    params = {"league": LEAGUE_ID, "season": season}
    
    try:
        response = requests.get(games_url, headers=HEADERS, params=params)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è Error fetching season {season} games: {e}")
        continue

    # Filter for finished games and prepare data structure for concurrent processing
    games_to_process = [
        {"id": game["id"], "season": season, **game} 
        for game in data.get("response", []) 
        if game.get("status", {}).get("short") == "FT"
    ]
    
    if not games_to_process:
        print(f"No finished games found for season {season}.")
        continue

    print(f"Found {len(games_to_process)} finished games for season {season}. Fetching statistics concurrently with {MAX_WORKERS} workers...")

    # 2. Use ThreadPoolExecutor for concurrent stats fetching
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map the function to all games and show progress with tqdm
        results = list(tqdm(
            executor.map(fetch_game_data_and_stats, games_to_process),
            total=len(games_to_process),
            desc=f"Season {season} Stats (Concurrent)"
        ))

    # 3. Collect results from concurrent threads
    for game_data, stats_list in results:
        all_games_data.append(game_data)
        all_stats_data.extend(stats_list)
                
    total_game_count += len(games_to_process)
    print(f"Season {season} finished. Total games collected so far: {total_game_count}")

print("-" * 30)
print("\nData Collection Complete. Processing DataFrames...")

# --- DATA PROCESSING & MERGING ---

df_games = pd.DataFrame(all_games_data)
df_raw_stats = pd.DataFrame(all_stats_data)

if not df_raw_stats.empty:
    # 1. Pivot the raw stats to get features like 'Shots', 'Hits', etc. as columns
    df_pivot_stats = df_raw_stats.pivot_table(
        index=['game_id', 'team_id', 'team_name'], 
        columns='stat_type', 
        values='stat_value', 
        aggfunc='first'
    ).reset_index()
    df_pivot_stats.columns.name = None # Clean up column names

    # 2. Identify and rename Home/Away stats for merging onto a single row
    # Merge with df_games to know which team is home and which is away
    df_temp = pd.merge(df_games[['game_id', 'home_team_name', 'away_team_name']], 
                       df_pivot_stats, on=['game_id', 'team_name'], how='left')

    # Separate and rename columns for Home
    home_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_id', 'team_name']]
    df_home_stats = df_temp[df_temp['team_name'] == df_temp['home_team_name']].rename(
        columns={c: f'home_{c.replace(" ", "_").lower()}' for c in home_cols}
    ).drop(columns=['team_id', 'team_name', 'away_team_name'])

    # Separate and rename columns for Away
    away_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_id', 'team_name']]
    df_away_stats = df_temp[df_temp['team_name'] == df_temp['away_team_name']].rename(
        columns={c: f'away_{c.replace(" ", "_").lower()}' for c in away_cols}
    ).drop(columns=['team_id', 'team_name', 'home_team_name'])

    # 3. Final merge: Game data + Home Stats + Away Stats
    df_final = df_games.merge(df_home_stats, on='game_id', how='left').merge(df_away_stats, on='game_id', how='left')
    
    # Final cleanup: Remove redundant team name columns created during temporary merge
    df_final = df_final.drop(columns=['home_team_name_y', 'away_team_name_y', 'home_team_name_x', 'away_team_name_x'], errors='ignore')
    
    print(f"Total rows in FINAL DataFrame (Game + Stats): {len(df_final)}")
    
else:
    df_final = df_games.copy()
    print("No detailed statistics collected. Check API key and rate limits.")

# --- SAVE RESULTS ---

OUTPUT_FILENAME = "nhl_finished_games_complete_features.csv"
df_final.to_csv(OUTPUT_FILENAME, index=False)

print("\n‚úÖ Script completed.")
print(f"Saved {len(df_final)} rows to **{OUTPUT_FILENAME}**")
print("\n--- Example of Collected Features ---")
print(df_final.head(2).T)

Starting concurrent data collection...
------------------------------


Overall Progress:   0%|          | 0/11 [00:00<?, ?it/s]


Fetching games for season 2015...
Found 1109 finished games for season 2015. Fetching statistics concurrently with 10 workers...


Season 2015 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1109/1109 [01:55<00:00,  9.64it/s]
Overall Progress:   9%|‚ñâ         | 1/11 [01:57<19:32, 117.23s/it]

Season 2015 finished. Total games collected so far: 1109

Fetching games for season 2016...
Found 1081 finished games for season 2016. Fetching statistics concurrently with 10 workers...


Season 2016 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1081/1081 [01:55<00:00,  9.35it/s]
Overall Progress:  18%|‚ñà‚ñä        | 2/11 [03:54<17:37, 117.46s/it]

Season 2016 finished. Total games collected so far: 2190

Fetching games for season 2017...
Found 1138 finished games for season 2017. Fetching statistics concurrently with 10 workers...


Season 2017 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1138/1138 [01:59<00:00,  9.49it/s]
Overall Progress:  27%|‚ñà‚ñà‚ñã       | 3/11 [05:56<15:56, 119.60s/it]

Season 2017 finished. Total games collected so far: 3328

Fetching games for season 2018...
Found 1163 finished games for season 2018. Fetching statistics concurrently with 10 workers...


Season 2018 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1163/1163 [02:03<00:00,  9.39it/s]
Overall Progress:  36%|‚ñà‚ñà‚ñà‚ñã      | 4/11 [08:03<14:15, 122.23s/it]

Season 2018 finished. Total games collected so far: 4491

Fetching games for season 2019...
Found 1026 finished games for season 2019. Fetching statistics concurrently with 10 workers...


Season 2019 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1026/1026 [01:51<00:00,  9.23it/s]
Overall Progress:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 5/11 [09:56<11:54, 119.01s/it]

Season 2019 finished. Total games collected so far: 5517

Fetching games for season 2020...
Found 730 finished games for season 2020. Fetching statistics concurrently with 10 workers...


Season 2020 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 730/730 [01:20<00:00,  9.12it/s]
Overall Progress:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 6/11 [11:18<08:52, 106.47s/it]

Season 2020 finished. Total games collected so far: 6247

Fetching games for season 2021...
Found 1181 finished games for season 2021. Fetching statistics concurrently with 10 workers...


Season 2021 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1181/1181 [02:08<00:00,  9.22it/s]
Overall Progress:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 7/11 [13:29<07:37, 114.28s/it]

Season 2021 finished. Total games collected so far: 7428

Fetching games for season 2022...
Found 1167 finished games for season 2022. Fetching statistics concurrently with 10 workers...


Season 2022 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1167/1167 [02:08<00:00,  9.09it/s]
Overall Progress:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 8/11 [15:39<05:58, 119.48s/it]

Season 2022 finished. Total games collected so far: 8595

Fetching games for season 2023...
Found 1200 finished games for season 2023. Fetching statistics concurrently with 10 workers...


Season 2023 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1200/1200 [02:10<00:00,  9.23it/s]
Overall Progress:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 9/11 [17:52<04:07, 123.74s/it]

Season 2023 finished. Total games collected so far: 9795

Fetching games for season 2024...
Found 1194 finished games for season 2024. Fetching statistics concurrently with 10 workers...


Season 2024 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1194/1194 [02:06<00:00,  9.41it/s]
Overall Progress:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 10/11 [20:01<02:05, 125.42s/it]

Season 2024 finished. Total games collected so far: 10989

Fetching games for season 2025...
Found 274 finished games for season 2025. Fetching statistics concurrently with 10 workers...


Season 2025 Stats (Concurrent): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 274/274 [00:28<00:00,  9.74it/s]
Overall Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [20:32<00:00, 112.01s/it]

Season 2025 finished. Total games collected so far: 11263
------------------------------

Data Collection Complete. Processing DataFrames...
No detailed statistics collected. Check API key and rate limits.

‚úÖ Script completed.
Saved 11263 rows to **nhl_finished_games_complete_features.csv**

--- Example of Collected Features ---
                                          0                          1
season                                 2015                       2015
game_id                              102464                     102465
date              2015-09-20T20:30:00+00:00  2015-09-20T23:00:00+00:00
home_team_name          Nashville Predators              Boston Bruins
away_team_name             Florida Panthers          New Jersey Devils
home_score_final                          5                          2
away_score_final                          2                          0





In [None]:
9d1dbc393fa470ff6f25a0bf1fe1647e

In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor 
import itertools 

# --- CONFIGURATION ---
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e" # ‚ö†Ô∏è REPLACE WITH YOUR ACTUAL KEY
LEAGUE_ID = 57 # NHL
SEASONS = list(range(2015, 2026)) # Seasons from 2015 to 2025
BASE_URL = "https://v1.hockey.api-sports.io"

HEADERS = {"x-apisports-key": API_KEY}
MAX_WORKERS = 10 

all_games_data = []
all_stats_data = []
total_game_count = 0

# --- HELPER FUNCTIONS ---

def fetch_game_data_and_stats(game):
    """
    Fetches game details and detailed statistics for a single game ID 
    using the CORRECTED API URL path.
    """
    season = game['season'] 
    game_id = game["id"]
    
    # Base Game Data
    game_data = {
        "season": season,
        "game_id": game_id,
        "date": game.get("date"),
        "home_team_name": game.get("teams", {}).get("home", {}).get("name"),
        "away_team_name": game.get("teams", {}).get("away", {}).get("name"),
        "home_score_final": game.get("scores", {}).get("home"),
        "away_score_final": game.get("scores", {}).get("away")
    }

    # üõë CORRECTED API CALL üõë 
    # Using /statistics?id= instead of /games/statistics?id=
    stats_url = f"{BASE_URL}/statistics" 
    params = {"id": game_id}
    stats_list = []
    
    try:
        response = requests.get(stats_url, headers=HEADERS, params=params)
        response.raise_for_status() # Check for non-200 errors (4xx, 5xx)
        data = response.json()
        
        # Flatten the statistics response
        for team_stat in data.get("response", []):
            team_info = team_stat.get("team", {})
            team_id = team_info.get("id")
            team_name = team_info.get("name")
            
            for stat in team_stat.get("statistics", []):
                stats_list.append({
                    "game_id": game_id,
                    "team_id": team_id,
                    "team_name": team_name,
                    "stat_type": stat.get("type"),
                    "stat_value": stat.get("value")
                })
                
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è Error fetching stats for game {game_id}: {e}")
        
    return game_data, stats_list

# --- MAIN SCRAPER LOGIC (Unchanged) ---

print("Starting concurrent data collection with corrected URL...")
print("-" * 30)

for season in tqdm(SEASONS, desc="Overall Progress"):
    print(f"\nFetching games for season {season}...")
    
    # 1. Fetch all games for the season
    games_url = f"{BASE_URL}/games"
    params = {"league": LEAGUE_ID, "season": season}
    
    try:
        response = requests.get(games_url, headers=HEADERS, params=params)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching season {season} games: {e}")
        continue

    # Filter for finished games and prepare data structure for concurrent processing
    games_to_process = [
        {"id": game["id"], "season": season, **game} 
        for game in data.get("response", []) 
        if game.get("status", {}).get("short") == "FT"
    ]
    
    if not games_to_process:
        print(f"No finished games found for season {season}.")
        continue

    print(f"Found {len(games_to_process)} finished games for season {season}. Fetching statistics concurrently...")

    # 2. Use ThreadPoolExecutor for concurrent stats fetching
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = list(tqdm(
            executor.map(fetch_game_data_and_stats, games_to_process),
            total=len(games_to_process),
            desc=f"Season {season} Stats (Concurrent)"
        ))

    # 3. Collect results from concurrent threads
    for game_data, stats_list in results:
        all_games_data.append(game_data)
        all_stats_data.extend(stats_list)
                
    total_game_count += len(games_to_process)
    print(f"Season {season} finished. Total games collected so far: {total_game_count}")

print("-" * 30)
print("\nData Collection Complete. Processing DataFrames...")

# --- DATA PROCESSING & MERGING (Unchanged) ---

df_games = pd.DataFrame(all_games_data)
df_raw_stats = pd.DataFrame(all_stats_data)

if not df_raw_stats.empty:
    df_pivot_stats = df_raw_stats.pivot_table(
        index=['game_id', 'team_id', 'team_name'], 
        columns='stat_type', 
        values='stat_value', 
        aggfunc='first'
    ).reset_index()
    df_pivot_stats.columns.name = None

    df_temp = pd.merge(df_games[['game_id', 'home_team_name', 'away_team_name']], 
                       df_pivot_stats, on=['game_id', 'team_name'], how='left')

    home_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_id', 'team_name']]
    df_home_stats = df_temp[df_temp['team_name'] == df_temp['home_team_name']].rename(
        columns={c: f'home_{c.replace(" ", "_").lower()}' for c in home_cols}
    ).drop(columns=['team_id', 'team_name', 'away_team_name'], errors='ignore')

    away_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_id', 'team_name']]
    df_away_stats = df_temp[df_temp['team_name'] == df_temp['away_team_name']].rename(
        columns={c: f'away_{c.replace(" ", "_").lower()}' for c in away_cols}
    ).drop(columns=['team_id', 'team_name', 'home_team_name'], errors='ignore')

    df_final = df_games.merge(df_home_stats, on='game_id', how='left').merge(df_away_stats, on='game_id', how='left')
    df_final = df_final.drop(columns=[c for c in df_final.columns if c.endswith('_y') or c.endswith('_x')], errors='ignore')
    
    print(f"Total rows in FINAL DataFrame (Game + Stats): {len(df_final)}")
    
else:
    df_final = df_games.copy()
    print("No detailed statistics collected. Check API plan/quota/URL again.")

# --- SAVE RESULTS ---

OUTPUT_FILENAME = "nhl_finished_games_complete_features_fixed.csv"
df_final.to_csv(OUTPUT_FILENAME, index=False)

print("\n‚úÖ Script completed.")
print(f"Saved {len(df_final)} rows to **{OUTPUT_FILENAME}**")

Starting concurrent data collection with corrected URL...
------------------------------


Overall Progress:   0%|          | 0/11 [00:00<?, ?it/s]


Fetching games for season 2015...
Found 1109 finished games for season 2015. Fetching statistics concurrently...


Season 2015 Stats (Concurrent):   0%|          | 0/1109 [00:01<?, ?it/s]
Overall Progress:   0%|          | 0/11 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [1]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor 
import time
import itertools 
import threading 
import random
import sys

# --- ‚ö†Ô∏è CONFIGURATION - YOU MUST UPDATE THIS ‚ö†Ô∏è ---
API_KEY = "9d1dbc393fa470ff6f25a0bf1fe1647e" # üõë REPLACE WITH YOUR WORKING PRO KEY
LEAGUE_ID = 57 # NHL
SEASONS = list(range(2015, 2026)) 
BASE_URL = "https://v1.hockey.api-sports.io"

HEADERS = {"x-apisports-key": API_KEY}
MAX_WORKERS = 10 

OUTPUT_FILENAME = "nhl_ml_ready_features.csv" 
INTERMEDIATE_GAMES_CSV = "nhl_collected_games_temp.csv" 
INTERMEDIATE_STATS_CSV = "nhl_collected_stats_temp.csv" 

# Columns needed for rolling performance calculations (must be present in raw stats)
PERFORMANCE_COLS = [
    'shots_on_goal', 
    'penalty_minutes',
    'powerplay_opportunities'
]
ROLLING_WINDOW = 5 

# Thread-safe containers
all_games_data = []
all_stats_data = []
data_lock = threading.Lock() 

# ----------------------------------------------------------------------
# --- PHASE 1: DATA COLLECTION (MAX STABILITY & SPEED) ---
# ----------------------------------------------------------------------

def fetch_all_games_for_season(season: int):
    """Fetches all finished game IDs for a single season (Parallelized)."""
    games_url = f"{BASE_URL}/games"
    params = {"league": LEAGUE_ID, "season": season}
    
    try:
        response = requests.get(games_url, headers=HEADERS, params=params)
        response.raise_for_status()
        data = response.json()
        
        games_to_process = [
            {"id": game["id"], "season": season, **game} 
            for game in data.get("response", []) 
            if game.get("status", {}).get("short") == "FT"
        ]
        return games_to_process
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching season {season} games: {e}")
        return []

def fetch_game_data_and_stats(game):
    """
    Fetches basic game details and detailed statistics for a single game ID.
    Includes the stability delay and corrected endpoint logic.
    """
    # üõë STABILITY FIX: Mandatory delay to prevent server overload/blocking
    time.sleep(random.uniform(0.5, 1.0)) 
    
    season = game['season'] 
    game_id = game["id"]
    
    game_data = {
        "season": season,
        "game_id": game_id,
        "date": game.get("date"),
        "home_team_name": game.get("teams", {}).get("home", {}).get("name"),
        "away_team_name": game.get("teams", {}).get("away", {}).get("name"),
        "home_score_final": game.get("scores", {}).get("home"),
        "away_score_final": game.get("scores", {}).get("away")
    }

    # CRITICAL FIX: Corrected URL and parameter name
    stats_url = f"{BASE_URL}/statistics"
    params = {"game": game_id} 
    stats_list = []
    
    try:
        response = requests.get(stats_url, headers=HEADERS, params=params)
        response.raise_for_status() 
        data = response.json()
        
        if not data.get("response"):
             # Empty response (API Key restriction/no data available)
             return game_data, stats_list

        for team_stat in data.get("response", []):
            team_info = team_stat.get("team", {})
            team_name = team_info.get("name")
            
            for stat in team_stat.get("statistics", []):
                stats_list.append({
                    "game_id": game_id,
                    "team_name": team_name,
                    "stat_type": stat.get("type"),
                    "stat_value": stat.get("value")
                })
                
    except requests.exceptions.RequestException as e:
        # Connection aborted errors are handled by returning base data
        return game_data, stats_list
        
    return game_data, stats_list

# ----------------------------------------------------------------------
# --- PHASE 2: DATA MERGING & CLEANING ---
# ----------------------------------------------------------------------

def merge_and_clean_data(df_games, df_raw_stats):
    """Pivots stats and merges them onto the main game row (wide format)."""
    
    if df_raw_stats.empty:
        print("‚ùå CRITICAL: No detailed statistics collected.")
        return df_games.copy()

    df_pivot_stats = df_raw_stats.pivot_table(
        index=['game_id', 'team_name'], 
        columns='stat_type', 
        values='stat_value', 
        aggfunc='first'
    ).reset_index()
    df_pivot_stats.columns.name = None

    df_games['home_score_final'] = pd.to_numeric(df_games['home_score_final'], errors='coerce')
    df_games['away_score_final'] = pd.to_numeric(df_games['away_score_final'], errors='coerce')

    df_temp = pd.merge(df_games[['game_id', 'home_team_name', 'away_team_name', 'home_score_final', 'away_score_final']], 
                       df_pivot_stats, on=['game_id', 'team_name'], how='left')

    home_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_name']]
    df_home_stats = df_temp[df_temp['team_name'] == df_temp['home_team_name']].rename(
        columns={c: f'home_{c.replace(" ", "_").lower()}' for c in home_cols}
    ).drop(columns=['team_name', 'away_team_name', 'away_score_final'], errors='ignore')

    away_cols = [col for col in df_pivot_stats.columns if col not in ['game_id', 'team_name']]
    df_away_stats = df_temp[df_temp['team_name'] == df_temp['away_team_name']].rename(
        columns={c: f'away_{c.replace(" ", "_").lower()}' for c in away_cols}
    ).drop(columns=['team_name', 'home_team_name', 'home_score_final'], errors='ignore')

    df_final = df_games.merge(df_home_stats, on='game_id', how='left').merge(df_away_stats, on='game_id', how='left')
    df_final = df_final.drop(columns=[c for c in df_final.columns if c.endswith('_y') or c.endswith('_x')], errors='ignore')
    
    print(f"Total rows merged with raw stats: {len(df_final)}")
    return df_final

# ----------------------------------------------------------------------
# --- PHASE 3: FEATURE ENGINEERING (ML Feature Creation) ---
# ----------------------------------------------------------------------

def calculate_team_features(df_team):
    """Calculates time-series features (Rest, Rolling Averages) for a single team."""
    
    df_team['score_final'] = pd.to_numeric(df_team['score_final'], errors='coerce')
    df_team['opponent_score'] = pd.to_numeric(df_team['opponent_score'], errors='coerce')

    # Calculate REST (Days since last game)
    df_team['prev_date'] = df_team['date'].shift(1)
    df_team['rest'] = (df_team['date'] - df_team['prev_date']).dt.days.fillna(30).clip(upper=30)

    # Calculate Rolling Averages (Momentum)
    for col in ['score_final'] + PERFORMANCE_COLS:
        roll_col_name = f'rolling_{col}_{ROLLING_WINDOW}'
        if col not in df_team.columns:
            df_team[col] = 0.0 # Fill if statistical columns are missing
            
        df_team[roll_col_name] = df_team[col].shift(1).rolling(
            window=ROLLING_WINDOW, min_periods=1, center=False
        ).mean()

    # Calculate Rolling Win Rate (Momentum)
    df_team['win'] = (df_team['score_final'] > df_team['opponent_score']).astype(int)
    df_team['rolling_win_rate'] = df_team['win'].shift(1).rolling(
        window=ROLLING_WINDOW, min_periods=1, center=False
    ).mean().fillna(0.5) 
    
    cols_to_drop = ['prev_date', 'win', 'score_final', 'opponent_score'] + PERFORMANCE_COLS
    return df_team.drop(columns=[c for c in cols_to_drop if c in df_team.columns], errors='ignore')


def feature_engineer_data(df):
    """Runs the feature engineering pipeline."""
    
    print("\nStarting feature engineering...")
    
    df['date'] = pd.to_datetime(df['date'].str.split('T').str[0], errors='coerce')
    df = df.sort_values(by='date').reset_index(drop=True)
    
    # --- Create Long (Team-Level) Format ---
    cols = ['game_id', 'date', 'home_team_name', 'away_team_name', 'home_score_final', 'away_score_final'] + [
        c for c in df.columns if c.startswith('home_') and c not in ['home_team_name', 'home_score_final']
    ]
    
    df_home = df[cols].rename(columns={
        'home_team_name': 'team_name', 'away_team_name': 'opponent_name',
        'home_score_final': 'score_final', 'away_score_final': 'opponent_score',
        **{c: c.replace('home_', '') for c in cols if c.startswith('home_')}
    })
    
    df_away = df[cols].rename(columns={
        'away_team_name': 'team_name', 'home_team_name': 'opponent_name',
        'away_score_final': 'score_final', 'home_score_final': 'opponent_score',
        **{c: c.replace('away_', '') for c in cols if c.startswith('away_')}
    })
    
    df_team_level = pd.concat([df_home, df_away], ignore_index=True)
    df_team_level = df_team_level.sort_values(by=['team_name', 'date']).reset_index(drop=True)

    # --- Calculate Rolling Features (Rest, Momentum, PPG, etc.) ---
    tqdm.pandas(desc="Calculating rolling features per team")
    df_team_features = df_team_level.groupby('team_name', group_keys=False).progress_apply(calculate_team_features)
    
    # --- Align Features Back to Game Level ---
    feature_cols = [col for col in df_team_features.columns if col.startswith('rolling_') or col == 'rest']
    df_features = df_team_features[['game_id', 'team_name'] + feature_cols]
    
    # Merge Home Features
    df_home_features = df_features.rename(columns={c: f'home_{c}' for c in feature_cols})
    df_final = pd.merge(df, df_home_features, left_on=['game_id', 'home_team_name'], right_on=['game_id', 'team_name'], how='left').drop(columns=['team_name'])

    # Merge Away Features
    df_away_features = df_features.rename(columns={c: f'away_{c}' for c in feature_cols})
    df_final = pd.merge(df_final, df_away_features, left_on=['game_id', 'away_team_name'], right_on=['game_id', 'team_name'], how='left').drop(columns=['team_name'])
                        
    # --- Create Comparative/Difference Features ---
    df_final['rest_diff'] = df_final['home_rest'] - df_final['away_rest']
    df_final['momentum_diff'] = df_final['home_rolling_win_rate'] - df_final['away_rolling_win_rate']
    df_final['home_away_diff'] = df_final[f'home_rolling_score_final_{ROLLing_WINDOW}'] - df_final[f'away_rolling_score_final_{ROLLING_WINDOW}']
    
    return df_final


# --- MAIN EXECUTION ---

def main():
    print("--- NHL ML Data Pipeline Starting (MAX STABILITY) ---")
    
    try:
        # PHASE 1: Data Collection
        # 1. Fetch all game IDs for all seasons concurrently
        print("\n1. Fetching all season calendars concurrently...")
        all_game_lists = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            results_list = list(tqdm(
                executor.map(fetch_all_games_for_season, SEASONS),
                total=len(SEASONS),
                desc="Fetching Seasons"
            ))
        
        all_games_to_process = list(itertools.chain.from_iterable(results_list))
        total_games = len(all_games_to_process)
        print(f"Found a total of {total_games} finished games across all seasons.")
        
        if not all_games_to_process:
            raise Exception("Fatal: Failed to fetch any games. Check API Key or internet.")

        # 2. Fetch stats for all games concurrently (Stable Speed)
        print("2. Fetching stats for all games concurrently (Stable Speed)...")
        
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            all_results = list(tqdm(
                executor.map(fetch_game_data_and_stats, all_games_to_process),
                total=total_games,
                desc="Fetching All Stats"
            ))
            
        # 3. Collect and organize data
        for game_data, stats_list in all_results:
            with data_lock:
                all_games_data.append(game_data)
                all_stats_data.extend(stats_list)

    except Exception as e:
        print(f"\n\nüö® Process interrupted! Error: {e}. Saving collected data...")
        # Save collected data right away upon interruption
        pd.DataFrame(all_games_data).to_csv(INTERMEDIATE_GAMES_CSV, index=False)
        pd.DataFrame(all_stats_data).to_csv(INTERMEDIATE_STATS_CSV, index=False)
        print(f"‚úÖ Saved {len(all_games_data)} game results and {len(all_stats_data)} raw stats records.")
        print("Please check the intermediate CSV files.")
        sys.exit(1)

    # --- PHASE 2 & 3: MERGING AND FEATURE ENGINEERING ---
    
    df_games = pd.DataFrame(all_games_data)
    df_raw_stats = pd.DataFrame(all_stats_data)

    df_merged_raw = merge_and_clean_data(df_games, df_raw_stats)
    
    if len(df_merged_raw.columns) > 7:
        df_final = feature_engineer_data(df_merged_raw)
        df_final.to_csv(OUTPUT_FILENAME, index=False)

        print("\n" + "="*50)
        print(f"‚úÖ Success! Full Pipeline Complete.")
        print(f"Final ML-Ready Features Saved: {len(df_final)} rows to {OUTPUT_FILENAME}")
        print("="*50)
        
    else:
        print("\n" + "="*50)
        print("‚ùå CRITICAL FAILURE: Cannot generate ML features.")
        print("The API did not provide statistical data (still a key/quota issue).")
        df_merged_raw.to_csv("nhl_base_data.csv", index=False)
        print("Base game data saved to nhl_base_data.csv.")
        print("="*50)


if __name__ == "__main__":
    main()

--- NHL ML Data Pipeline Starting (MAX STABILITY) ---

1. Fetching all season calendars concurrently...


Fetching Seasons: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:05<00:00,  1.87it/s]


Found a total of 11263 finished games across all seasons.
2. Fetching stats for all games concurrently (Stable Speed)...


Fetching All Stats:  20%|‚ñà‚ñà        | 2298/11263 [09:12<35:54,  4.16it/s]  


KeyboardInterrupt: 