In [21]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [28]:
import pandas as pd

# Load the raw export. The path is relative, so the CSV has to sit in the
# notebook's working directory; otherwise read_csv raises FileNotFoundError.
file_path = 'sportsref_download.csv'
df = pd.read_csv(file_path)

# Preview the first rows to check the column layout. Leaving the frame as the
# cell's last expression lets Jupyter render it as a table instead of the
# plain-text dump that print() produces.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'sportsref_download.csv'

In [None]:
# Fit a default-configured logistic regression on the training split.
# NOTE(review): X_train, y_train and X_test are not created in any cell visible
# here — confirm the split/scaling cell still exists and runs before this one.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows. Column 1 of predict_proba holds the probability of
# the positive class (Finalist = 1); predict returns the hard class labels.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)


In [None]:
# Headline metrics on the held-out set: the three label-based scores compare
# y_test with the hard predictions, AUC compares it with the probabilities.
accuracy, precision, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)
auc_score = roc_auc_score(y_test, y_prob)

# Report all four scores, one per line, rounded to two decimals.
print("\n".join(
    f"{metric_name}: {metric_value:.2f}"
    for metric_name, metric_value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("AUC Score", auc_score),
    )
))

# Confusion matrix as an annotated heatmap (rows are labelled "Actual",
# columns "Predicted").
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=['Non-Finalist', 'Finalist'],
    yticklabels=['Non-Finalist', 'Finalist'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC curve on its own figure; the dashed red diagonal from (0, 0) to (1, 1)
# is drawn as a reference line where TPR equals FPR.
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
# Pair each feature name with its fitted weight (first row of model.coef_) and
# order the table from the largest to the smallest signed coefficient.
# NOTE(review): X is not defined in any visible cell — presumably the feature
# DataFrame the model was trained on; confirm its column order matches the
# training matrix.
coefficients = model.coef_[0]
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)

print("Feature Importance (Logistic Regression Coefficients):", feature_importance, sep="\n")

# Horizontal bar chart of the same table: one bar per feature, bar length is
# the coefficient.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Coefficient', y='Feature')
plt.title('Feature Importance Based on Coefficients')
plt.show()


In [None]:
# Weights for the two composite scores: one built from team-level metrics,
# one from individual-level metrics.
team_weights = {
    'Net Rating': 0.6,
    'Offensive Rating': 0.4,
}
individual_weights = {
    'PER': 0.5,
    'Win Shares': 0.5,
}

# Each composite is a weighted sum of two existing columns, stored back on df
# so that later cells can read 'Team Score' and 'Individual Score'.
# NOTE(review): the inputs are combined on their raw scales; if the columns
# differ in magnitude the weights will not reflect their relative contribution
# — confirm whether standardizing first was intended.
df['Team Score'] = (
    team_weights['Net Rating'] * df['Net Rating']
    + team_weights['Offensive Rating'] * df['Offensive Rating']
)
df['Individual Score'] = (
    individual_weights['PER'] * df['PER']
    + individual_weights['Win Shares'] * df['Win Shares']
)

# Correlation of each composite with the 0/1 finalist flag.
team_correlation, individual_correlation = (
    df[score_name].corr(df['Conference Finalist'])
    for score_name in ('Team Score', 'Individual Score')
)

print(f"Correlation of Team Score with Finalist Status: {team_correlation:.2f}")
print(f"Correlation of Individual Score with Finalist Status: {individual_correlation:.2f}")

# One box plot per composite, split by finalist status; both figures share the
# same size and axis wording.
for score_column in ('Team Score', 'Individual Score'):
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Conference Finalist', y=score_column, data=df)
    plt.title(f'{score_column} by Conference Finalist Status')
    plt.xlabel('Conference Finalist (0 = No, 1 = Yes)')
    plt.ylabel(score_column)
    plt.show()


In [None]:
# Two-sample t-tests comparing finalists (flag == 1) with non-finalists
# (flag == 0) on each composite score. Reads the 'Team Score' and
# 'Individual Score' columns, so the cell that creates them must run first.
# NOTE(review): ttest_ind is called with its defaults (pooled-variance test,
# NaNs propagate); if the two groups differ a lot in size or spread,
# equal_var=False (Welch) may be more appropriate — confirm.

# Build each group mask once and select with .loc (avoids chained indexing).
finalist_mask = df['Conference Finalist'] == 1
non_finalist_mask = df['Conference Finalist'] == 0

# T-test for team scores
team_ttest = ttest_ind(
    df.loc[finalist_mask, 'Team Score'],
    df.loc[non_finalist_mask, 'Team Score'],
)
print(f"T-Test for Team Scores: p-value = {team_ttest.pvalue:.4f}")

# T-test for individual scores
individual_ttest = ttest_ind(
    df.loc[finalist_mask, 'Individual Score'],
    df.loc[non_finalist_mask, 'Individual Score'],
)
print(f"T-Test for Individual Scores: p-value = {individual_ttest.pvalue:.4f}")
