In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Baseline MVP classifier: an unweighted logistic regression on per-player
# season statistics, evaluated on a stratified 20% hold-out split.

# Load the dataset. The unfiltered frame keeps its own name so the season
# filter below can be changed or re-run without re-reading the CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
raw_df = pd.read_csv(r'/Users/manishkavuri/Downloads/nba_player_stats_with_scores.csv')

# Exclude the 2022-23 season (since we don't have MVP labels for it)
df = raw_df[raw_df["season"] != "2022-23"]

# Define the features for the model
features = ["PPG", "APG", "RPG", "SPG", "winShares", "per", "usagePercent"]
X = df[features]
y = df["MVP"]  # Target variable (0 = Non-MVP, 1 = MVP)

# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the MVP / non-MVP ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features (important for logistic regression).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model.
# zero_division=0: an undefined score (e.g. precision when the model predicts
# no MVPs at all) is reported as 0 instead of 1, so a model that never flags
# an MVP cannot appear to have perfect precision.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Display evaluation results
print("\n🔍 Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))



🔍 Logistic Regression Model Evaluation:
Accuracy: 0.9750
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       276
           1       0.00      0.00      0.00         4

    accuracy                           0.97       280
   macro avg       0.49      0.49      0.49       280
weighted avg       0.97      0.97      0.97       280



In [3]:
# Sum the MVP column of the filtered frame to get the number of MVP rows
# (the column is the 0/1 target used by the model above).
mvp_labels = df["MVP"]
mvp_count = mvp_labels.sum()
print(f"Total MVPs in the dataset: {mvp_count}")


Total MVPs in the dataset: 19


In [4]:
# How many MVP rows landed in each side of the train/test split?
train_mvp_count, test_mvp_count = y_train.sum(), y_test.sum()

print(f"MVPs in Training Set: {train_mvp_count}")
print(f"MVPs in Test Set: {test_mvp_count}")


MVPs in Training Set: 15
MVPs in Test Set: 4


In [5]:
# Re-check of the per-split MVP counts.
# NOTE(review): this repeats the computation of the preceding cell on the
# same y_train / y_test — candidate for removal.
train_mvp_count = y_train.sum()
print(f"MVPs in Training Set: {train_mvp_count}")

test_mvp_count = y_test.sum()
print(f"MVPs in Test Set: {test_mvp_count}")


MVPs in Training Set: 15
MVPs in Test Set: 4


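Based on the initial logistic model, it was evident that the severe class imbalance (only 19 MVPs in the dataset, 4 of them in the test set) makes accuracy misleading: the model reaches 0.975 accuracy while precision, recall, and F1 for the MVP class are all 0, i.e. it fails to identify a single MVP. The next cell retrains the model with class weighting to compensate for this imbalance.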

In [6]:


# Class-weighted MVP classifier: same features, split and scaling as the
# baseline model, but with class_weight="balanced" to counter the rarity of
# the MVP class.

# Define features and target variable
features = ["PPG", "APG", "RPG", "SPG", "winShares", "per", "usagePercent"]
X = df[features]
y = df["MVP"]  # Target variable (0 = Non-MVP, 1 = MVP)

# Split data into training and testing sets.
# stratify=y keeps the MVP / non-MVP ratio the same in both splits; the same
# random_state as the baseline cell is used so both models are scored on the
# same hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with Class Weighting
log_reg = LogisticRegression(class_weight="balanced", random_state=42)  # Adjusts for class imbalance
log_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model.
# zero_division=0: an undefined score (e.g. precision when the model predicts
# no MVPs at all) is reported as 0 instead of 1, so it cannot inflate the
# results for the rare MVP class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Display evaluation results
print("\n🔍 Logistic Regression with Class Weighting Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))



🔍 Logistic Regression with Class Weighting Evaluation:
Accuracy: 0.8607
Precision: 0.0732
Recall: 0.7500
F1-Score: 0.1333

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       276
           1       0.07      0.75      0.13         4

    accuracy                           0.86       280
   macro avg       0.53      0.81      0.53       280
weighted avg       0.98      0.86      0.91       280

