## The value the model seeks to predict is the `mvp_share` column, which represents the result of the MVP voting for each season.

### mvp_share $= \frac{\text{PtsWon}}{\text{PtsMax}}$

In [2]:
import os
# Switch to the project directory: the relative paths used below (CSV inputs,
# best_model.pkl) resolve against it. Set the MVP_PROJECT_DIR environment variable
# to run on another machine; the default keeps the original author's location so
# existing runs behave exactly as before.
os.chdir(os.environ.get('MVP_PROJECT_DIR', '/Users/wyattscott/Documents/DS5110/Project_Final_Files'))
# Imported after the chdir — presumably so helper_functions.py is picked up from the
# project directory (TODO confirm it is not installed elsewhere on the path).
from helper_functions import print_importances, print_dict_imps, avg_imps, percent_formatter, plot_comparison_for_season

import numpy as np
import pandas as pd
import seaborn as sns

import plotly.graph_objs as go
from ipywidgets import interact, widgets
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import joblib
# Load the best model from Models.ipynb
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load a best_model.pkl that was produced by a trusted run of Models.ipynb.
best_model = joblib.load('best_model.pkl')

In [3]:
# Load the data
TARGET_COL = 'mvp_share'       # regression target
STRATIFY_COL = 'Rank'          # only used to stratify the train/test split
ID_COLS = ['Season', 'name']   # identify a player-season in the test file

# The model's input columns are exactly the columns of df_selected.csv, in that order.
df_selected = pd.read_csv('df_selected.csv')
features = list(df_selected.columns)

# Training data: features plus the target and the stratification column.
df_train = pd.read_csv('df_clean.csv', usecols=features + [TARGET_COL, STRATIFY_COL])
labels = df_train.pop(TARGET_COL)
stratify = df_train.pop(STRATIFY_COL)
# read_csv ignores the order of `usecols` and returns columns in file order, so pin
# the column order explicitly. Every later use of `features` (relabelling the
# predictions, weighting by feature importances) relies on this order.
df_train = df_train[features]

# Test data: the same features, in the same order, plus the identifying columns.
df_test = pd.read_csv('df_last.csv', usecols=features + ID_COLS)
df_test = df_test[features + ID_COLS].rename(columns={'name': 'Name'})

In [4]:
# Hold out 20% of the rows, stratified on `stratify`, with a fixed seed so the
# split is reproducible.
split_parts = train_test_split(
    df_train,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=28,
    stratify=stratify,
)

# Unwrap every piece of the split to its underlying array.
X_train, X_test, y_train, y_test = (part.values for part in split_parts)

#### Refit the best model from `Models.ipynb` on the training split

In [5]:
# Refit the loaded estimator on this notebook's training split; any fit stored in
# the pickle is replaced by this one.
best_model.fit(X_train, y_train)

In [6]:
# Score the fitted model on the held-out split.
y_pred = best_model.predict(X_test)

# Mean squared error and R-squared of the predictions against the true shares.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

for label, score in (("Test MSE:", mse), ("Test R-squared:", r2)):
    print(label, score)

Test MSE: 0.0014443088499275286
Test R-squared: 0.7169794565142685


---

## Testing

In [7]:
# Predict the MVP share of every player in each test season and collect the
# per-season results, each sorted from highest to lowest predicted share.
dfs_n_last = []
for season_n in df_test['Season'].unique():
    df_n = df_test[df_test['Season'] == season_n]
    names_n = df_n["Name"].values
    # Only the model inputs remain once the identifying columns are removed.
    df_n = df_n.drop(columns=['Season', 'Name'])
    feature_n = df_n.to_numpy()

    prediction = best_model.predict(feature_n)
    # Label the matrix with the frame's own column names: read_csv returns columns
    # in file order, which is not guaranteed to match the order of `features`.
    df_curr = pd.DataFrame(data=feature_n, columns=df_n.columns)
    df_curr['Season'] = season_n
    df_curr['name'] = names_n
    df_curr['predicted'] = prediction * 100  # fraction -> percent
    # Sort before appending; sorting afterwards would only rebind the local name
    # and the ordering would never reach df_pred.
    df_curr = df_curr.sort_values(by=['predicted'], ascending=False, ignore_index=True)
    dfs_n_last.append(df_curr)

df_pred = pd.concat(dfs_n_last, ignore_index=True)

### Compare Against True Share

In [8]:
# Columns to load from the full data set: everything in df_pred except the model
# output (removed by name rather than by position), plus the true voting share.
keep = [col for col in df_pred.columns if col != 'predicted']
keep.append('mvp_share')
df_full = pd.read_csv('mvp_data_edit.csv', usecols=keep)
# Merge df_pred with df_full on "name" and "Season" columns; a left join keeps every
# predicted row even when no true share is found for it.
merged_df = pd.merge(df_pred, df_full[['name', 'Season', 'mvp_share']],
                     on=['name', 'Season'], how='left')
# Rename the 'mvp_share' column to 'actual' and express it in percent, matching
# the scale of 'predicted'.
merged_df = merged_df.rename(columns={'mvp_share': 'actual'})
merged_df['actual'] *= 100

In [9]:
# Start from a copy of the merged predictions, keep the predicted seasons
# (2018-22), then take the 7 rows with the highest actual share in each season.
interactive_df = (
    merged_df.copy()
    .loc[lambda frame: frame['Season'] >= 2018]
    .groupby('Season')
    .apply(lambda season_rows: season_rows.nlargest(7, 'actual'))
    .reset_index(drop=True)
)

# Importance weights from the fitted model, rescaled to sum to 1.
feature_importances = best_model.feature_importances_
normalized_importances = feature_importances / np.sum(feature_importances)

# Index = importance-weighted sum of each row's feature values.
# NOTE(review): assumes the model was fit on columns in `features` order — confirm.
index_values = interactive_df[features].to_numpy() @ normalized_importances
interactive_df['index'] = index_values

In [None]:
# Define custom colors for 'predicted' and 'actual'
# (hex forms of the rgb(229, 114, 0) / rgb(35, 45, 75) bar colors used in display_bar_chart).
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
custom_palette = {'predicted': '#E57200', 'actual': '#232D4B'}

In [None]:
# Collect the distinct seasons present in merged_df.
# NOTE(review): this cell only computes the array; the per-season plot cells below
# pass each season explicitly and do not iterate over it.
unique_seasons = merged_df['Season'].unique()

In [None]:
# 2022 season: comparison plot from the helper_functions module (presumably predicted vs. actual share — see the helper).
plot_comparison_for_season(merged_df, 2022)

In [None]:
# 2021 season: comparison plot from the helper_functions module (presumably predicted vs. actual share — see the helper).
plot_comparison_for_season(merged_df, 2021)

In [None]:
# 2020 season: comparison plot from the helper_functions module (presumably predicted vs. actual share — see the helper).
plot_comparison_for_season(merged_df, 2020)

In [None]:
# 2019 season: comparison plot from the helper_functions module (presumably predicted vs. actual share — see the helper).
plot_comparison_for_season(merged_df, 2019)

In [None]:
# 2018 season: comparison plot from the helper_functions module (presumably predicted vs. actual share — see the helper).
plot_comparison_for_season(merged_df, 2018)

In [None]:
df_results = (
    pd.read_csv('mvp_data_edit.csv')
    # Drop Wins and Conference because we combined in cleaning notebook
    .drop(columns=['conference', 'W'])
    # Keep seasons from 1980 onward (1980 included), as we do in training
    .loc[lambda frame: frame['Season'] >= 1980]
    # Explicit copy: new columns are assigned below, and assigning onto a filtered
    # slice can trigger pandas' SettingWithCopyWarning.
    .copy()
)
# Pull out the feature importances
feature_importances = best_model.feature_importances_
# Normalize feature importances so the weights sum to 1
normalized_importances = feature_importances / np.sum(feature_importances)
# Construct index: importance-weighted sum of each row's feature values.
# NOTE(review): assumes the model was fit on columns in `features` order — confirm.
index_values = np.dot(df_results[features].values, normalized_importances)
# Add index values as a new column to the DataFrame
df_results['index'] = index_values
# Rank the 'index' column within each season (highest index gets rank 1; ties share
# the average rank, pandas' default) and store the result in 'Ranked_Index'
df_results['Ranked_Index'] = df_results.groupby('Season')['index'].rank(ascending=False)

In [None]:
# Persist the results table, including the 'index' and 'Ranked_Index' columns;
# index=False leaves the pandas row index out of the file.
df_results.to_csv('results.csv', index=False)

---

In [11]:
# Independent copy of interactive_df; display_bar_chart and the dropdown widgets
# below read this global.
df = interactive_df.copy()

In [15]:
# Function to generate and display the bar chart
def display_bar_chart(player_name, year):
    """Build a grouped bar chart of predicted vs. actual values for one player-season.

    Rows are looked up in the notebook-level ``df`` by ``name`` and ``Season``.

    Parameters
    ----------
    player_name : value compared against ``df['name']``
    year : value compared against ``df['Season']``

    Returns
    -------
    The plotly figure, or ``None`` (after printing a message) when no row matches.
    """
    is_match = (df['name'] == player_name) & (df['Season'] == year)
    selection = df[is_match]
    if selection.empty:
        print("No data found for the selected player and year.")
        return

    # (column in df, legend label, bar color) for each of the two bar series.
    bar_specs = (
        ("predicted", "Predicted", 'rgb(229, 114, 0)'),
        ("actual", "Actual", 'rgb(35, 45, 75)'),
    )

    fig = go.Figure()
    for column, legend_label, bar_color in bar_specs:
        fig.add_trace(
            go.Bar(
                x=selection["name"],
                y=selection[column],
                name=legend_label,
                marker_color=bar_color,
            )
        )

    # Transparent legend background and border so the legend does not hide the bars.
    legend_style = dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    )
    fig.update_layout(
        barmode='group',
        xaxis_tickangle=0,
        title=f"Player: {player_name}, Year: {year}",
        xaxis=dict(title='Player'),
        yaxis=dict(title='Value'),
        legend=legend_style,
    )

    return fig

# Create interactive widgets for player name and year
player_name_widget = widgets.Dropdown(options=df['name'].unique(), description='Player:')
year_widget = widgets.Dropdown(options=df['Season'].unique(), description='Year:')

# Generate the interactive chart. The object returned by interact() exposes the
# underlying widget as `.widget`, which is read back below.
interactive_chart = interact(display_bar_chart, player_name=player_name_widget, year=year_widget)

# Export a static snapshot of the chart for the currently selected player/year.
# The dropdowns themselves are not part of the exported HTML.
# `.result` holds what display_bar_chart last returned, which is None when the
# selected player has no row for the selected season.
interactive_chart_fig = interactive_chart.widget.result
file_path = "interactive_chart.html"

if interactive_chart_fig is None:
    # Without this guard the export would fail with AttributeError on None.to_html.
    print(f"No figure available for the current selection; {file_path} was not written.")
else:
    interactive_chart_html = interactive_chart_fig.to_html(full_html=False, include_plotlyjs='cdn')
    # Save the HTML file; explicit UTF-8 so the output does not depend on the
    # platform's default text encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(interactive_chart_html)

interactive(children=(Dropdown(description='Player:', options=('James Harden', 'LeBron James', 'Anthony Davis'…