In [7]:
pip install requests beautifulsoup4 pandas numpy tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import logging

In [9]:
# Set up logging with more detailed format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Scraping Statistics

In [10]:
import requests
import logging
import time
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# Optional: configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class NBADataScraper:
    def __init__(self):
        self.base_url = "https://www.basketball-reference.com"
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }

    def get_page_content(self, url):
        """Fetch page content with error handling and rate limiting."""
        try:
            logging.info(f"Fetching URL: {url}")
            time.sleep(3)
            resp = requests.get(url, headers=self.headers)
            resp.raise_for_status()
            return BeautifulSoup(resp.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            logging.error(f"Unexpected error fetching {url}: {e}")
            return None

    def get_mvp_winner(self, year):
        """Extract the MVP winner’s name for a given season."""
        url  = f"{self.base_url}/awards/awards_{year}.html"
        soup = self.get_page_content(url)
        if not soup:
            return None

        table = soup.find('table', {'id': 'mvp'})
        if not table:
            logging.error(f"No MVP table for {year}")
            return None

        row = table.find('tbody').find('tr')
        if not row:
            logging.error(f"No MVP winner row for {year}")
            return None

        return row.find('td', {'data-stat': 'player'}).text.strip()

    def get_mvp_shares(self, year):
        """
        Extract the full MVP voting table for `year` and return
        a dict mapping player → share (float).
        """
        url  = f"{self.base_url}/awards/awards_{year}.html"
        soup = self.get_page_content(url)
        if not soup:
            return {}

        table = soup.find('table', {'id': 'mvp'})
        if not table:
            logging.error(f"No MVP table for {year}")
            return {}

        df = pd.read_html(str(table))[0]
        # flatten if pandas created a MultiIndex
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(0)

        df['Player'] = df['Player'].str.replace(r"\*+", "", regex=True).str.strip()
        df['Share']  = pd.to_numeric(df['Share'], errors='coerce').fillna(0.0)

        return dict(zip(df['Player'], df['Share']))

    def get_basic_stats(self, year):
        """Extract per‑game stats for `year`."""
        url  = f"{self.base_url}/leagues/NBA_{year}_per_game.html"
        soup = self.get_page_content(url)
        if not soup:
            return None

        table = soup.find('table', {'id': 'per_game_stats'})
        if not table:
            logging.error(f"No basic stats table for {year}")
            return None

        df = pd.read_html(str(table))[0]
        # flatten any MultiIndex
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(0)

        df = df[df['Player'].notna()]
        df = df[~df['Player'].str.contains('Player')]

        col_map = {
            'Tm': 'Team', 'Pos': 'Position', 'G': 'Games', 'MP': 'Minutes',
            'PTS': 'Points', 'TRB': 'Rebounds', 'AST': 'Assists',
            'STL': 'Steals', 'BLK': 'Blocks', 'TOV': 'Turnovers',
            'FG%': 'FG_Pct', '3P%': '3P_Pct', 'FT%': 'FT_Pct'
        }
        for old, new in col_map.items():
            if old in df.columns:
                df = df.rename(columns={old: new})

        wanted = [
            'Player','Team','Position','Season','Games','Minutes','Points',
            'Rebounds','Assists','Steals','Blocks','Turnovers',
            'FG_Pct','3P_Pct','FT_Pct'
        ]
        available = [c for c in wanted if c in df.columns]
        return df[available]

    def get_advanced_stats(self, year):
        """Extract advanced stats for `year`."""
        url  = f"{self.base_url}/leagues/NBA_{year}_advanced.html"
        soup = self.get_page_content(url)
        if not soup:
            return None

        table = soup.find('table', {'class': 'stats_table'})
        if not table:
            logging.error(f"No advanced stats table for {year}")
            return None

        df = pd.read_html(str(table))[0]
        # flatten any MultiIndex
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(0)

        df = df[df['Player'].notna()]
        df = df[~df['Player'].str.contains('Player')]

        col_map = {
            'PER':'Player_Efficiency_Rating','WS':'Win_Shares',
            'BPM':'Box_Plus_Minus','USG%':'Usage_Rate','VORP':'Value_Over_Replacement',
            'WS/48':'Win_Shares_Per_48'
        }
        for old, new in col_map.items():
            if old in df.columns:
                df = df.rename(columns={old: new})

        wanted = [
            'Player','Player_Efficiency_Rating','Win_Shares',
            'Box_Plus_Minus','Usage_Rate','Value_Over_Replacement',
            'Win_Shares_Per_48'
        ]
        available = [c for c in wanted if c in df.columns]
        return df[available]

    def scrape_season(self, year):
        """Combine basic, advanced, MVP‑winner and vote‐share into one DF."""
        logging.info(f"Scraping season {year}…")

        winner       = self.get_mvp_winner(year)
        basic        = self.get_basic_stats(year)
        advanced     = self.get_advanced_stats(year)
        shares_lookup= self.get_mvp_shares(year)

        if winner is None or basic is None or advanced is None:
            logging.error(f"Skipping {year}: missing data")
            return None

        df = pd.merge(basic, advanced, on='Player', how='left')
        df['Season'] = year
        df['MVP']    = (df['Player'] == winner).astype(int)
        df['Share']  = df['Player'].map(shares_lookup).fillna(0.0)

        logging.info(f"Finished {year}: {len(df)} players")
        return df

    def scrape_all_seasons(self, start_year=1981, end_year=2024):
        """Loop through seasons and concatenate into one DataFrame."""
        all_seasons = []
        for yr in tqdm(range(start_year, end_year+1), desc="Seasons"):
            season_df = self.scrape_season(yr)
            if season_df is not None:
                all_seasons.append(season_df)

        if not all_seasons:
            logging.error("No seasons scraped!")
            return None

        final = pd.concat(all_seasons, ignore_index=True)
        final = final.fillna(0)
        num_cols = [
            'Minutes','Points','Rebounds','Assists','Steals','Blocks','Turnovers',
            'FG_Pct','3P_Pct','FT_Pct','Player_Efficiency_Rating','Win_Shares',
            'Box_Plus_Minus','Usage_Rate','Value_Over_Replacement',
            'Win_Shares_Per_48','Share'
        ]
        for c in num_cols:
            if c in final.columns:
                final[c] = pd.to_numeric(final[c], errors='coerce')

        return final


In [11]:
def main():
    scraper = NBADataScraper()
    final_df = scraper.scrape_all_seasons()
    
    if final_df is not None:
        # Save to CSV
        output_file = 'nba_mvp_data_NEW_TESTTTT.csv'
        final_df.to_csv(output_file, index=False)
        logging.info(f"Data successfully saved to {output_file}")
        
        # Print some basic statistics about the dataset
        logging.info(f"\nDataset Statistics:")
        logging.info(f"Total number of player-seasons: {len(final_df)}")
        logging.info(f"Number of MVP winners: {final_df['MVP'].sum()}")
        logging.info(f"Seasons covered: {final_df['Season'].min()} to {final_df['Season'].max()}")
    else:
        logging.error("Failed to create the dataset")

In [12]:
if __name__ == "__main__":
    main() 

Seasons:   0%|          | 0/44 [00:00<?, ?it/s]2025-05-10 20:18:41,908 - INFO - Scraping season 1981…
2025-05-10 20:18:41,912 - INFO - Fetching URL: https://www.basketball-reference.com/awards/awards_1981.html
2025-05-10 20:18:45,051 - INFO - Fetching URL: https://www.basketball-reference.com/leagues/NBA_1981_per_game.html
  df = pd.read_html(str(table))[0]
2025-05-10 20:18:48,847 - INFO - Fetching URL: https://www.basketball-reference.com/leagues/NBA_1981_advanced.html
  df = pd.read_html(str(table))[0]
2025-05-10 20:18:52,338 - INFO - Fetching URL: https://www.basketball-reference.com/awards/awards_1981.html
  df = pd.read_html(str(table))[0]
2025-05-10 20:18:55,468 - INFO - Finished 1981: 546 players
Seasons:   2%|▏         | 1/44 [00:13<09:43, 13.56s/it]2025-05-10 20:18:55,469 - INFO - Scraping season 1982…
2025-05-10 20:18:55,469 - INFO - Fetching URL: https://www.basketball-reference.com/awards/awards_1982.html
2025-05-10 20:18:58,838 - INFO - Fetching URL: https://www.basketball

KeyboardInterrupt: 

In [None]:
data = pd.read_csv('nba_mvp_data_NEW_TESTTTT.csv')
data.head()

In [None]:
data.fillna(0, inplace=True)

In [None]:
mvplist = data.loc[data.groupby('Season')['Share'].idxmax(), ['Player','Season']]
mvplist = mvplist.assign(is_mvp=True)

data = data.merge(mvplist, on=['Player','Season'], how='left')
data['is_mvp'] = data['is_mvp'].fillna(False).astype(bool)

print(data['is_mvp'].value_counts())

In [None]:
# Narrow data set to those who played at least 30 games, averaged more than 10 minutes, and scored at least 5 points

data = data[(data['Games'] >= 30) & (data['Minutes'] > 10) & (data['Points'] > 5)]

In [None]:
data[data['MVP'] == 1]

# Determining Predictors

We can use a correlation matrix to see how impactful each column in the table is relative to determining MVP

In [None]:
# Drop unecessary columns 

mvpfactors = data.copy()
mvpfactors

# list the players who have won the MVP award
mvpfactors[mvpfactors['MVP'] == 1]

In [None]:
pip install matplotlib


In [None]:
import matplotlib.pyplot as plt

In [None]:
#Modeling different stats and their correlation to winning MVP
# - First, looking at Value Over Replacement to Award Shares
plt.figure(figsize = (10,6))
plt.scatter(data['Value_Over_Replacement'], data['Share'])
plt.title('MVP Award Shares vs. Value Over Replacement')
plt.xlabel('Vorp')
plt.ylabel('Award Shares')
plt.show()

In [None]:
mvpfactors.drop(columns=['Share', 'Team', 'Player', 'MVP', 'Season', 'Position'], inplace=True)


In [None]:

corrmatrix = mvpfactors.corr()


# get top 20 factors that impact mvp
# mvp is right now yes(1) or no(0)
top_20_factors = corrmatrix['is_mvp'].abs().sort_values(ascending=False).head(20).index

top_20_factors

corrmatrix



In [None]:
pip install seaborn

In [None]:
import seaborn as sns

In [None]:
sorted_corr_matrix = corrmatrix.loc[top_20_factors, top_20_factors]

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))  # Adjust size for clarity
sns.heatmap(
    sorted_corr_matrix,
    vmin=-1, vmax=1,
    cmap="ocean",
    center=0,
    annot=True,
    fmt=".2f",
    annot_kws={"fontsize": 8},
    linewidths=0.5,
    linecolor="white",
    cbar=True,
    cbar_kws={"orientation": "vertical"},
    square=True,
    xticklabels=True,
    yticklabels=True,
    ax=ax
)

In [None]:
# generate predictors by taking the top 20 factors

predictors = sorted_corr_matrix.index[:21]
predictors = predictors[predictors != 'is_mvp']
predictors

# Linear Regression

In [None]:
pip install scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score
from sklearn.metrics import classification_report, accuracy_score


# ─── 2) Split train/test by season ────────────────────────────────────────────
test_years = [2021, 2022, 2023, 2024]

train = data[~data['Season'].isin(test_years)]
test  = data[ data['Season'].isin(test_years)]

X_train = train[predictors]
X_test  = test[predictors]

y_train_mvp   = train['MVP']    # binary 0/1
y_test_mvp    = test['MVP']
y_train_share = train['Share']  # continuous 0–1
y_test_share  = test['Share']

print(f"Training on {len(train)} rows; testing on {len(test)} rows.\n")

# ─── 3) Logistic Regression for MVP (0/1) ─────────────────────────────────────
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train_mvp)

print("— Logistic Regression (MVP) —")
for yr in test_years:
    sub = test[test['Season'] == yr]
    y_true = sub['MVP']
    y_proba = log_model.predict_proba(sub[predictors])[:,1]
    auc = roc_auc_score(y_true, y_proba)
    # top‑1 hit?
    top1    = int(sub.iloc[y_proba.argmax()]['MVP'])
    print(f"{yr}  ROC‑AUC: {auc:.3f}, Top‑1 Acc: {top1}")

# overall classification report on all four seasons combined:
y_pred_all = log_model.predict(X_test)
y_proba_all = log_model.predict_proba(X_test)[:,1]
print("\nCombined classification report:")
print(classification_report(y_test_mvp, y_pred_all))
print(f"Overall accuracy: {accuracy_score(y_test_mvp, y_pred_all):.3f}")
print(f"Overall ROC‑AUC : {roc_auc_score(y_test_mvp, y_proba_all):.3f}")

# ─── 4) Linear Regression for MVP Vote Share ─────────────────────────────────────
lin_model = LinearRegression()
lin_model.fit(X_train, y_train_share)

y_pred_share = lin_model.predict(X_test)
mse = mean_squared_error(y_test_share, y_pred_share)
r2  = r2_score(y_test_share, y_pred_share)

print("\n— Linear Regression (Vote Share) —")
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print(f"Test R² : {r2:.4f}")

# ─── 5) Feature Importance ───────────────────────────────────────────────────
coef_log   = pd.Series(log_model.coef_[0], index=predictors)
coef_lin   = pd.Series(lin_model.coef_,    index=predictors)

print("\nTop 10 logistic coefficients (|β| for MVP):")
print(coef_log.abs().sort_values(ascending=False).head(10))

print("\nTop 10 linear coefficients (|β| for Share):")
print(coef_lin.abs().sort_values(ascending=False).head(10))


In [None]:
test_years= [2021, 2022, 2023, 2024]
test= data[data['Season'].isin(test_years)].copy()
test['predicted_share'] = lin_model.predict(test[predictors])

# 2) For each year, sort & show top 10
for yr in test_years:
    sub = test[test['Season'] == yr]
    top10 = (
        sub[['Player','Share','predicted_share']]
        .sort_values('predicted_share', ascending=False)
        .head(10)
        .reset_index(drop=True)
    )
    print(f"\n=== Top 10 Predicted Vote Shares for {yr} ===")
    print(top10.to_string(index=False))


In [None]:
sub2023 = test[test['Season']==2023]
proba2023 = log_model.predict_proba(sub2023[predictors])[:,1]
print(sub2023.assign(proba=proba2023)
              .sort_values('proba', ascending=False)
              .head(5)[['Player','proba','MVP']])

print('------')

sub2024 = test[test['Season']==2024]
proba2024 = log_model.predict_proba(sub2024[predictors])[:,1]
print(sub2024.assign(proba=proba2024)
              .sort_values('proba', ascending=False)
              .head(5)[['Player','proba','MVP']])



In [None]:
for k in (1,3,5):
    hits = []
    for yr in test_years:
        sub = test[test['Season']==yr]
        proba = model.predict_proba(sub[predictors])[:,1]
        topk = sub.iloc[proba.argsort()[-k:]]['MVP']
        hits.append(topk.any())
    print(f"Top-{k} accuracy:", sum(hits)/len(hits))


# Ridge Regression

In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# 1) Re‑build your train/test split
test_years = [2021, 2022, 2023, 2024]
train = data[~data['Season'].isin(test_years)]
test  = data[ data['Season'].isin(test_years)]

X_train, y_train = train[predictors], train['Share']
X_test,  y_test  = test[predictors],  test['Share']

# 2) Set up Leave‑One‑Season‑Out
logo   = LeaveOneGroupOut()
groups = train['Season'].values

# 3) Grid‑search over alpha with group CV
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
grid = GridSearchCV(
    Ridge(), 
    param_grid,
    scoring='neg_mean_squared_error',
    cv=logo.split(X_train, y_train, groups),
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)

print("Best α (by L-OO-CV MSE):", grid.best_params_['alpha'])

# 4) Fit final Ridge with that α
ridge = Ridge(alpha=grid.best_params_['alpha'])
ridge.fit(X_train, y_train)

# 5) Evaluate on your 2021–24 hold‑out
y_pred = ridge.predict(X_test)
print("Hold‑out MSE:", mean_squared_error(y_test, y_pred))
print("Hold‑out R² :", r2_score(y_test, y_pred))

# 6) Top features
coef = pd.Series(ridge.coef_, index=predictors).abs().sort_values(ascending=False)
print("\nTop 10 features by |coef|:\n", coef.head(10))

# 7) Per‑year “Top‑10 predicted share” breakdown
for yr in test_years:
    sub = test[test['Season'] == yr].copy()
    sub['predicted_share'] = ridge.predict(sub[predictors])
    top10 = sub[['Player','Share','predicted_share']]\
            .sort_values('predicted_share', ascending=False)\
            .head(10)\
            .reset_index(drop=True)
    print(f"\n=== Top 10 Ridge‑predicted Vote Shares for {yr} ===")
    print(top10.to_string(index=False))


In [None]:
# 1) Add back Player, Season, and actual vote share
results = test.copy().loc[:, ['Player','Season','Share']].rename(columns={'Share':'Award_Shares'})

# 2) Predict vote shares for the 2022 slice only
slice_2022 = results[results['Season']==2024].copy()
slice_2022['predictions'] = ridge.predict(test.loc[slice_2022.index, predictors])

# 3) Compute the true rank (Rk) by Award_Shares
slice_2022 = slice_2022.sort_values('Award_Shares', ascending=False)
slice_2022['Rank'] = range(1, len(slice_2022)+1)

# 4) Compute the predicted rank (Predicted_Rk)
slice_2022 = slice_2022.sort_values('predictions', ascending=False)
slice_2022['Predicted_Rank'] = range(1, len(slice_2022)+1)

# 5) Re‑sort by actual vote share to display
combination_2022 = slice_2022.sort_values('Rank').reset_index(drop=True)

print(combination_2022.head(10).to_string(index=False))


# Random Forest + GradientBoostingRegressor

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import classification_report, roc_auc_score, mean_squared_error, r2_score

# ─── 1) RandomForest for MVP classification ────────────────────────────────
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'   # helps with the 1-vs-many imbalance
)
rf_clf.fit(X_train, y_train_mvp)

y_pred_rf    = rf_clf.predict(X_test)
y_proba_rf   = rf_clf.predict_proba(X_test)[:,1]

print("\n— RandomForestClassifier (MVP) —")
print(classification_report(y_test_mvp, y_pred_rf))
print("ROC‑AUC:", roc_auc_score(y_test_mvp, y_proba_rf))

# ─── 3) RandomForest for vote‑share regression ─────────────────────────────
rf_reg = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)
rf_reg.fit(X_train, y_train_share)

y_pred_rf_reg = rf_reg.predict(X_test)

print("\n— RandomForestRegressor (Share) —")
print("MSE:", mean_squared_error(y_test_share, y_pred_rf_reg))
print("R² :", r2_score(y_test_share, y_pred_rf_reg))



In [None]:


# ─── 2) GradientBoosting for MVP classification ───────────────────────────
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gb_clf.fit(X_train, y_train_mvp)

y_pred_gb  = gb_clf.predict(X_test)
y_proba_gb = gb_clf.predict_proba(X_test)[:,1]

print("\n— GradientBoostingClassifier (MVP) —")
print(classification_report(y_test_mvp, y_pred_gb))
print("ROC‑AUC:", roc_auc_score(y_test_mvp, y_proba_gb))


# ─── 4) GradientBoosting for share regression ────────────────────────
gb_reg = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gb_reg.fit(X_train, y_train_share)

y_pred_gb_reg = gb_reg.predict(X_test)

print("\n— GradientBoostingRegressor (Share) —")
print("MSE:", mean_squared_error(y_test_share, y_pred_gb_reg))
print("R² :", r2_score(y_test_share, y_pred_gb_reg))


In [None]:
# 1) Filter to the 2022 season
slice_2022 = test[test['Season'] == 2022].copy()

# 2) Grab the top 10 by actual vote share
top10 = slice_2022.nlargest(10, 'Share').copy()

# 3) Compute RF and GB predictions
top10['predicted_rf'] = rf_reg.predict(top10[predictors])
top10['predicted_gb'] = gb_reg.predict(top10[predictors])

# 4) Display the comparison
print(top10[['Player', 'Share', 'predicted_rf', 'predicted_gb']].to_string(index=False))


# Summary


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score, roc_auc_score,
    mean_squared_error, r2_score
)

# 1) List your models
models = [
    ('Logistic Regression',           log_model, 'clf'),
    ('Random Forest Classifier',      rf_clf,    'clf'),
    ('Gradient Boosting Classifier',  gb_clf,    'clf'),
    ('Linear Regression',             lin_model, 'reg'),
    ('Ridge Regression',              ridge,     'reg'),
    ('Random Forest Regressor',       rf_reg,    'reg'),
    ('Gradient Boosting Regressor',   gb_reg,    'reg'),
]

# 2) Compute metrics
rows = []
for name, model, mtype in models:
    r = {'Model': name}
    if mtype == 'clf':
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1]
        r.update({
            'Precision': precision_score(y_test_mvp, y_pred, zero_division=0),
            'Recall':    recall_score(   y_test_mvp, y_pred, zero_division=0),
            'F1':        f1_score(       y_test_mvp, y_pred, zero_division=0),
            'Accuracy':  accuracy_score( y_test_mvp, y_pred),
            'ROC_AUC':   roc_auc_score(  y_test_mvp, y_proba),
            'MSE':       np.nan,
            'RMSE':      np.nan,
            'R2':        np.nan,
        })
    else:
        y_pred = model.predict(X_test)
        mse  = mean_squared_error(y_test_share, y_pred)
        r.update({
            'Precision': np.nan,
            'Recall':    np.nan,
            'F1':        np.nan,
            'Accuracy':  np.nan,
            'ROC_AUC':   np.nan,
            'MSE':       mse,
            'RMSE':      np.sqrt(mse),
            'R2':        r2_score(y_test_share, y_pred),
        })
    rows.append(r)

# 3) Build and display DataFrame
df_summary = pd.DataFrame(rows)
# order columns
df_summary = df_summary[[
    'Model','Precision','Recall','F1','Accuracy','ROC_AUC','MSE','RMSE','R2'
]]
display(df_summary)




In [23]:
class NBADataScraper2025:
    def __init__(self):
        self.base_url = "https://www.basketball-reference.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
    def get_page_content(self, url):
        """Fetch page content with error handling and rate limiting"""
        try:
            logging.info(f"Fetching URL: {url}")
            time.sleep(3)  # Rate limiting
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error for {url}: {str(e)}")
            return None
        except Exception as e:
            logging.error(f"Unexpected error fetching {url}: {str(e)}")
            return None

    def get_basic_stats(self, year):
        """Extract basic statistics for a given season"""
        url = f"{self.base_url}/leagues/NBA_{year}_per_game.html"
        soup = self.get_page_content(url)
        if not soup:
            logging.error(f"Failed to get basic stats page for {year}")
            return None
            
        try:
            stats_table = soup.find('table', {'id': 'per_game_stats'})
            if not stats_table:
                logging.error(f"Could not find basic stats table for {year}")
                return None
                
            # Convert table to DataFrame using StringIO
            df = pd.read_html(str(stats_table))[0]
            
            # Print available columns for debugging
            logging.info(f"Available columns for {year}: {df.columns.tolist()}")
            
            # Clean up the DataFrame
            df = df[df['Player'].notna()]  # Remove rows where Player is NaN
            df = df[~df['Player'].str.contains('Player')]  # Remove header rows
            
            # Map column names to handle different naming conventions
            column_mapping = {
                'Tm': 'Team',
                'Pos': 'Position',
                'G': 'Games',
                'MP': 'Minutes',
                'PTS': 'Points',
                'TRB': 'Rebounds',
                'AST': 'Assists',
                'STL': 'Steals',
                'BLK': 'Blocks',
                'TOV': 'Turnovers',
                'FG%': 'FG_Pct',
                '3P%': '3P_Pct',
                'FT%': 'FT_Pct'
            }
            
            # Rename columns if they exist
            for old_col, new_col in column_mapping.items():
                if old_col in df.columns:
                    df = df.rename(columns={old_col: new_col})
            
            # Select relevant columns (using new names)
            columns = ['Player', 'Team', 'Position', 'Season', 'Games', 'Minutes', 'Points', 
                      'Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers', 
                      'FG_Pct', '3P_Pct', 'FT_Pct']
            
            # Only select columns that exist in the DataFrame
            available_columns = [col for col in columns if col in df.columns]
            if not available_columns:
                logging.error(f"No matching columns found for {year}")
                return None
                
            df = df[available_columns]
            logging.info(f"Successfully extracted basic stats for {year}")
            return df
        except Exception as e:
            logging.error(f"Error extracting basic stats for {year}: {str(e)}")
            return None

    def get_advanced_stats(self, year):
        """Extract advanced statistics for a given season"""
        url = f"{self.base_url}/leagues/NBA_{year}_advanced.html"
        soup = self.get_page_content(url)
        if not soup:
            logging.error(f"Failed to get advanced stats page for {year}")
            return None
            
        try:
            # Look for the table with class 'stats_table'
            stats_table = soup.find('table', {'class': 'stats_table'})
            if not stats_table:
                logging.error(f"Could not find advanced stats table for {year}")
                return None
                
            # Convert table to DataFrame using StringIO
            df = pd.read_html(str(stats_table))[0]
            
            # Print available columns for debugging
            logging.info(f"Available advanced columns for {year}: {df.columns.tolist()}")
            
            # Clean up the DataFrame
            df = df[df['Player'].notna()]  # Remove rows where Player is NaN
            df = df[~df['Player'].str.contains('Player')]  # Remove header rows
            
            # Map column names to handle different naming conventions
            column_mapping = {
                'PER': 'Player_Efficiency_Rating',
                'WS': 'Win_Shares',
                'BPM': 'Box_Plus_Minus',
                'USG%': 'Usage_Rate',
                'VORP': 'Value_Over_Replacement',
                'WS/48': 'Win_Shares_Per_48'
            }
            
            # Rename columns if they exist
            for old_col, new_col in column_mapping.items():
                if old_col in df.columns:
                    df = df.rename(columns={old_col: new_col})
            
            # Select relevant columns (using new names)
            columns = ['Player', 'Player_Efficiency_Rating', 'Win_Shares', 
                      'Box_Plus_Minus', 'Usage_Rate', 'Value_Over_Replacement', 
                      'Win_Shares_Per_48']
            
            # Only select columns that exist in the DataFrame
            available_columns = [col for col in columns if col in df.columns]
            if not available_columns:
                logging.error(f"No matching advanced columns found for {year}")
                return None
                
            df = df[available_columns]
            logging.info(f"Successfully extracted advanced stats for {year}")
            return df
        except Exception as e:
            logging.error(f"Error extracting advanced stats for {year}: {str(e)}")
            return None

    def scrape_season(self, year):
        """Scrape all data for a given season"""
        logging.info(f"Starting to scrape data for {year} season...")
        
        # Get basic stats
        basic_stats = self.get_basic_stats(year)
        if basic_stats is None:
            logging.error(f"Failed to get basic stats for {year}")
            return None
            
        # Get advanced stats
        advanced_stats = self.get_advanced_stats(year)
        if advanced_stats is None:
            logging.error(f"Failed to get advanced stats for {year}")
            return None
            
        try:
            # Merge basic and advanced stats
            merged_stats = pd.merge(basic_stats, advanced_stats, on='Player', how='left')
            
            # Add Season column
            merged_stats['Season'] = year
            
            logging.info(f"Successfully merged all data for {year}")
            return merged_stats
        except Exception as e:
            logging.error(f"Error merging data for {year}: {str(e)}")
            return None

In [24]:
def main():
    scraper = NBADataScraper2025()
    year = 2025  # Only scrape 2024-2025 season
    season_data = scraper.scrape_season(year)
    
    if season_data is not None:
        # Reorder columns to put Season first
        cols = season_data.columns.tolist()
        cols.remove('Season')
        cols = ['Season'] + cols
        season_data = season_data[cols]
        
        # Clean up the DataFrame
        season_data = season_data.fillna(0)  # Fill missing values with 0
        
        # Ensure all numeric columns are float
        numeric_columns = ['Minutes', 'Points', 'Rebounds', 'Assists', 'Steals', 
                         'Blocks', 'Turnovers', 'FG_Pct', '3P_Pct', 'FT_Pct', 
                         'Player_Efficiency_Rating', 'Win_Shares', 'Box_Plus_Minus', 
                         'Usage_Rate', 'Value_Over_Replacement', 'Win_Shares_Per_48']
        
        for col in numeric_columns:
            if col in season_data.columns:
                season_data[col] = pd.to_numeric(season_data[col], errors='coerce')
        
        # Save to CSV
        output_file = 'nba_2025_season.csv'
        season_data.to_csv(output_file, index=False)
        logging.info(f"Data successfully saved to {output_file}")
        
        # Print some basic statistics about the dataset
        logging.info(f"\nDataset Statistics:")
        logging.info(f"Total number of players: {len(season_data)}")
        logging.info(f"Season: {season_data['Season'].iloc[0]}")
    else:
        logging.error("Failed to create the dataset")

In [25]:
if __name__ == "__main__":
    main() 

2025-05-10 20:31:04,098 - INFO - Starting to scrape data for 2025 season...
2025-05-10 20:31:04,099 - INFO - Fetching URL: https://www.basketball-reference.com/leagues/NBA_2025_per_game.html
  df = pd.read_html(str(stats_table))[0]
2025-05-10 20:31:07,988 - INFO - Available columns for 2025: ['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards']
2025-05-10 20:31:07,990 - INFO - Successfully extracted basic stats for 2025
2025-05-10 20:31:07,991 - INFO - Fetching URL: https://www.basketball-reference.com/leagues/NBA_2025_advanced.html
  df = pd.read_html(str(stats_table))[0]
2025-05-10 20:31:12,116 - INFO - Available advanced columns for 2025: ['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'D