In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the historical playoff per-game table (fitted on below) and the 2023 team
# per-game table (predicted on below), then display the historical frame.
# NOTE(review): both paths are absolute and machine-specific, so the notebook only
# runs on the original author's machine until they are made relative/configurable.
playoff_csv = 'C:/Users/whisk/OneDrive/Documents/Bristol/Economics/Year 4/Data Science/slblundell.github.io/nba_project/data/playoff_per_game_2013-22.csv'
team_2023_csv = 'C:/Users/whisk/OneDrive/Documents/Bristol/Economics/Year 4/Data Science/slblundell.github.io/nba_project/data/team_per_game_2023.csv'

data = pd.read_csv(playoff_csv)
pred_data = pd.read_csv(team_2023_csv)
data

In [None]:
# Build the list of variables whose absolute correlation with playoff wins
# (Playoff_W, our dependent variable) is above the threshold below.
CORR_THRESHOLD = 0.25

# Select numeric/bool columns explicitly before correlating: from pandas 2.0 onwards
# DataFrame.corr() no longer silently drops non-numeric columns and raises on them
# instead. select_dtypes works the same way on older and newer pandas versions.
corr = data.select_dtypes(include=['number', 'bool']).corr().abs()

# Keep only the rows (variables) that clear the threshold against Playoff_W.
# Playoff_W itself is retained here (its self-correlation is 1) and is dropped
# later when the feature matrix is built.
corr = corr.loc[corr['Playoff_W'] > CORR_THRESHOLD]
variables = list(corr.index)

In [None]:
# Display the variables that passed the correlation filter.
variables

In [None]:
# Annotated heatmap of the pairwise correlations among the filtered variables.
filtered_corr = data[variables].corr()

plt.figure(figsize=(20, 10))
plt.title("Correlation between Filtered Variables")
sns.heatmap(data=filtered_corr, annot=True)

In [None]:
# Exporting correlation table for graph 3
# HOWEVER, given these values are for the 2013-2021 seasons, they will not change. 
# As such, no need to overwrite the CSV each time the model is run

# var_heatmap = variables.copy()
# var_heatmap.append('Adjusted Payroll')
# df_correlations = data[var_heatmap].corr().reset_index(level=0).rename(columns={"Playoff_W": "P_W", "Adjusted Payroll": "Pay"}).replace({"Playoff_W": "P_W", "Adjusted Payroll": "Pay"})

# df_correlations = df_correlations.melt('index', var_name='Variable', value_name='Correlation')
# df_correlations.to_csv('C:/Users/whisk/OneDrive/Documents/Bristol/Economics/Year 4/Data Science/slblundell.github.io/nba_project/data/correlations_playoff.csv')

In [None]:
# Correlation of adjusted payroll with playoff wins and with regular-season wins
# (Series.corr with its default method).
adjusted_payroll = data['Adjusted Payroll']
corr_pay = data['Playoff_W'].corr(adjusted_payroll)
corr_wins = data['W'].corr(adjusted_payroll)

print(f'Correlation between playoff wins and payrolls: {corr_pay}')
print(f'Correlation between regular season wins and payrolls: {corr_wins}')

In [None]:
# Drop MOV (margin of victory) because it raises multicollinearity concerns
# alongside SRS (Simple Rating System). Also drop Adjusted Payroll: payroll must
# not feed the linear regression, since the predicted playoff rating is later
# compared against payrolls.
remove = ['MOV', 'Adjusted Payroll']
variables = list(filter(lambda name: name not in remove, variables))
variables

In [None]:
# Build the feature matrix X (every selected variable except the target) and the
# target vector Y (playoff wins) from the historical data.
# The target is dropped with `columns=`: the positional axis argument of
# DataFrame.drop (`drop('Playoff_W', 1)`) was deprecated in pandas 1.3 and removed
# in pandas 2.0, where it raises a TypeError.
X = data[variables].drop(columns='Playoff_W')
Y = data['Playoff_W']

# Same feature columns taken from the 2023 table we want predictions for.
# pred_data must therefore contain every name in `variables`, including Playoff_W.
pred_X = pred_data[variables].drop(columns='Playoff_W')

X.head()

In [None]:
# Hold out 20% of the historical rows as a test set.
# The split is seeded so that the fitted coefficients, the reported metrics and
# the exported predictions are reproducible across Restart & Run All; without a
# random_state every run shuffles the rows differently.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=SEED
)

# Fit an ordinary least-squares linear regression on the training rows and
# predict playoff wins for the held-out rows.
model = LinearRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [None]:
# Report the fitted parameters and the held-out performance of the model.
# (Fixes the misspelled "erorr" label in the printed output.)
print('Coefficients: ', model.coef_)
print('Intercept:', model.intercept_)
print(f'Mean squared error: {mean_squared_error(Y_test, Y_pred):.2f}')
print(f'Coefficient of determination (R^2): {r2_score(Y_test, Y_pred):.2f}')

In [None]:
# Scatter plot of actual playoff wins (x) against predicted playoff wins (y) for
# the held-out test rows.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so `sns.scatterplot(Y_test, Y_pred)` raises a TypeError there.
sns.scatterplot(x=Y_test, y=Y_pred)

In [None]:
# Predict playoff wins for the 2022/23 teams with the fitted model.
Y_pred = model.predict(pred_X)

# Min-max normalise the predictions onto [0, 1]. This prevents teams being shown
# with "negative wins" (the 2022/23 season has yet to end) and aids interpretation.
# Done as a single vectorised NumPy expression instead of an element-by-element loop.
# NOTE: if every prediction were identical the range would be zero and the result NaN.
w_min, w_max = Y_pred.min(), Y_pred.max()
Y_pred = (Y_pred - w_min) / (w_max - w_min)

Y_pred

In [None]:
# Build the table of normalised predicted playoff wins (the so-called predicted
# playoff rating) per team for the 2022/23 season, highest rating first.
#
# The rating column is created directly from Y_pred (row i of Y_pred belongs to
# row i of pred_data) instead of being written cell by cell into a slice of
# pred_data. That avoids pandas' SettingWithCopyWarning, removes the hard-coded
# team count of 30, and avoids writing floats into the existing Playoff_W column.
# assign() raises if Y_pred and pred_data differ in length.
df_Y_pred = (
    pred_data[['Team']]
    .assign(**{'Predicted Playoff Rating': Y_pred})
    .sort_values(by='Predicted Playoff Rating', ascending=False)
    .reset_index(drop=True)
)
df_Y_pred

In [None]:
# Attach each team's payroll to the predicted-rating table for comparison.
payroll = pred_data[['Team', 'Payroll']]

# Any Payroll column left by a previous run of this cell is dropped first, so that
# re-running the cell does not yield Payroll_x / Payroll_y and break later cells.
df_Y_pred = (
    df_Y_pred
    .drop(columns='Payroll', errors='ignore')
    .merge(payroll, on='Team')
)
df_Y_pred

In [None]:
# Write the prediction/payroll table to CSV.
# NOTE(review): absolute, machine-specific output path.
predictions_csv = 'C:/Users/whisk/OneDrive/Documents/Bristol/Economics/Year 4/Data Science/slblundell.github.io/nba_project/data/model_predicted_wins.csv'
df_Y_pred.to_csv(predictions_csv)

In [None]:
# Correlation between the predicted playoff rating and team payroll.
predicted_rating = df_Y_pred['Predicted Playoff Rating']
team_payroll = df_Y_pred['Payroll']
corr_pred = predicted_rating.corr(team_payroll)
print(f'Correlation between predicted playoff rating and payrolls: {corr_pred}')

In [None]:
# Horizontal bar chart of each team's predicted playoff rating; the Axes is kept
# in `ax` so the figure can be retrieved from it afterwards.
ax = df_Y_pred.plot(
    kind='barh',
    x='Team',
    y='Predicted Playoff Rating',
    figsize=(10, 10),
)

In [None]:
# Retrieve the figure that owns the bar-chart axes and save it as a JPEG.
# NOTE(review): absolute, machine-specific output path.
bar_chart_path = 'C:/Users/whisk/OneDrive/Documents/Bristol/Economics/Year 4/Data Science/slblundell.github.io/nba_project/figures/win_rating_bar.jpeg'
fig = ax.get_figure()
fig.savefig(bar_chart_path)