In [431]:
# Uses machine learning to predict NBA season leaders and player rankings
# for future seasons, along with a general analysis of trends.

# The data comes from the CSV file created by NBA_webscraping.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# Path of the player-stats CSV that the cells below read
filename = 'nba_player_data.csv'


In [357]:
# Preview the CSV as a DataFrame; the result is not stored, only shown as this cell's output
pd.read_csv(filename)

Unnamed: 0,Year,Season_Type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF
0,2014-15,Regular%20Season,201566,1,Russell Westbrook,1610612760,OKC,67,34.4,9.4,...,0.835,1.9,5.4,7.3,8.6,2.1,0.2,4.4,28.1,27.7
1,2014-15,Regular%20Season,201935,2,James Harden,1610612745,HOU,81,36.8,8.0,...,0.868,0.9,4.7,5.7,7.0,1.9,0.7,4.0,27.4,27.2
2,2014-15,Regular%20Season,2544,3,LeBron James,1610612739,CLE,69,36.1,9.0,...,0.710,0.7,5.3,6.0,7.4,1.6,0.7,3.9,25.3,25.3
3,2014-15,Regular%20Season,203076,4,Anthony Davis,1610612740,NOP,68,36.1,9.4,...,0.805,2.5,7.7,10.2,2.2,1.5,2.9,1.4,24.4,30.3
4,2014-15,Regular%20Season,202326,5,DeMarcus Cousins,1610612758,SAC,59,34.1,8.4,...,0.782,3.1,9.5,12.7,3.6,1.5,1.7,4.3,24.1,27.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3740,2023-24,Regular%20Season,1630550,258,JT Thor,1610612766,CHA,35,14.8,1.3,...,0.444,1.1,1.7,2.8,0.4,0.2,0.5,0.3,3.3,4.4
3741,2023-24,Regular%20Season,1630192,259,Zeke Nnaji,1610612743,DEN,36,9.4,1.1,...,0.641,1.1,1.0,2.1,0.5,0.3,0.6,0.6,3.1,4.2
3742,2023-24,Regular%20Season,1629637,260,Jaxson Hayes,1610612747,LAL,36,9.5,1.2,...,0.536,0.4,1.2,1.7,0.3,0.2,0.3,0.5,2.8,3.9
3743,2023-24,Regular%20Season,1641748,261,Andre Jackson Jr.,1610612749,MIL,38,11.4,1.1,...,0.889,0.9,1.3,2.2,0.9,0.3,0.1,0.5,2.8,4.8


In [432]:

# Define functions to obtain column names of CSV and to collect data from specific columns
def column_data(fname, column_offsets):
    """Load the chosen columns of a comma-separated file as strings.

    Every line of the file is parsed, so a header row (if present) comes back
    as the first row of the result.

    Parameters
    ----------
    fname : str or path-like
        File handed to ``np.loadtxt``.
    column_offsets : int or sequence of int
        Zero-based positions of the columns to keep.

    Returns
    -------
    numpy.ndarray
        String array holding the selected columns.
    """
    return np.loadtxt(
        fname,
        usecols=column_offsets,
        delimiter=',',
        dtype=str,
    )


def getcolumns(fname):
    """Return the header row of a comma-separated file as a NumPy string array.

    Parameters
    ----------
    fname : str or path-like
        Path of the CSV file; only its first record is read.

    Returns
    -------
    numpy.ndarray
        One string per header field, in file order. An empty array is
        returned when the file contains no rows at all.
    """
    # newline='' leaves line-ending handling to the csv module, as its documentation asks
    with open(fname, newline='') as fid:
        csv_reader = csv.reader(fid, delimiter=',')
        # The [] default turns an empty file into an empty result instead of a bare StopIteration
        column_names = next(csv_reader, [])
    # dtype=str keeps the result a string array even when there are no fields
    return np.array(column_names, dtype=str)

# CSV to analyse (repeated here so this cell does not depend on the import cell's value)
filename = 'nba_player_data.csv'

# Header names as a NumPy array, and the full table parsed by pandas
nba_cols = getcolumns(filename)
data = pd.read_csv(filename)

# String copy of every column, header row included.
# Bound to `raw_table` instead of `input` so the built-in input() stays usable in this kernel.
raw_table = column_data(filename, range(len(nba_cols)))

# Positions of frequently used columns within the CSV header
header_names = nba_cols.tolist()
player_offset = header_names.index('PLAYER')
rank_offset = header_names.index('RANK')
year_offset = header_names.index('Year')
points_offset = header_names.index("PTS")

# Per-game stats used both as model inputs and as the predicted outputs
selected_features = ['PTS', 'AST', 'REB', 'STL', 'BLK']

# Base Ridge regressor (default settings) that later cells wrap in MultiOutputRegressor
ridge_model = Ridge()


In [None]:
# Use to switch the player that you want to analyze
target_player = "Russell Westbrook"

# One seed for the split and the forests so a rerun reproduces the same numbers
RANDOM_SEED = 42

target_data = data[data['PLAYER'] == target_player]
if target_data.empty:
    # Fail here with a readable message instead of deep inside train_test_split
    raise ValueError(f"No rows found for player {target_player!r}")

# NOTE(review): X and y are the same rows and the same columns, so every model
# fitted below learns to reproduce its own input rather than a later season.
# Confirm whether y was meant to hold the following season's stats.
X = target_data[selected_features]
y = target_data[['PTS', 'AST', 'REB', 'STL', 'BLK']]

# test_size alone is enough; the training share is its complement
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_SEED
)

# Baseline: one Ridge regressor per target column
multioutput_model = MultiOutputRegressor(ridge_model)
multioutput_model.fit(X_train, y_train)

random_forest_model = MultiOutputRegressor(RandomForestRegressor(random_state=RANDOM_SEED))

# GridSearchCV for hyperparameter tuning; the 'estimator__' prefix routes each
# setting to the RandomForestRegressor wrapped by MultiOutputRegressor
param_grid = {'estimator__n_estimators': [50, 100, 200],
              'estimator__max_depth': [None, 10, 20],
              'estimator__min_samples_split': [2, 5, 10]}

# Never ask for more folds than there are training rows (players with few seasons)
n_folds = min(5, len(X_train))
grid_search = GridSearchCV(random_forest_model, param_grid,
                           scoring='neg_mean_squared_error', cv=n_folds)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search (keys still carry the 'estimator__' prefix)
best_params = grid_search.best_params_

# The prefixed keys cannot be passed to RandomForestRegressor(**best_params); that raises
# TypeError. GridSearchCV refits the winning configuration on all of X_train by default,
# so the refitted estimator is taken directly.
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

In [None]:

# Fit a default Ridge regressor on the training split; fit() hands back the
# estimator itself, so it can be bound in the same statement
model = Ridge().fit(X_train, y_train)

# Show the fitted estimator as the cell output
model


In [None]:

# One record per player: the model output for that player's last row in `data`
all_player_predictions = []

# groupby(sort=False) keeps players in first-appearance order and yields each
# player's rows directly, so the frame is not re-filtered on every pass
for player_name, player_data in data.groupby('PLAYER', sort=False):
    player_predictions = multioutput_model.predict(player_data[selected_features])

    # predict() returns one row per input row, with one value per stat in the order
    # of selected_features. Zipping the names against the whole 2-D result would
    # pair each stat name with an entire row, so a single row is picked first.
    # NOTE(review): the last row is assumed to be the player's most recent season - confirm row order.
    predicted_stats = dict(zip(selected_features, player_predictions[-1]))
    predicted_stats['PLAYER'] = player_name
    all_player_predictions.append(predicted_stats)


# Convert the list of dictionaries to a DataFrame
predicted_stats_df = pd.DataFrame(all_player_predictions)

In [None]:
# Unfitted wrapper used only for the alpha search. It gets its own name so that
# `multioutput_model` is not rebound to an unfitted estimator, which would make any
# cell that predicts with it raise NotFittedError when run after this one.
ridge_search_model = MultiOutputRegressor(ridge_model)

# Candidate regularisation strengths: 10^-3 ... 10^3, one per decade
alphas = np.logspace(-3, 3, 7)
param_grid = {'estimator__alpha': alphas}

# Initialize GridSearchCV; the fold count is capped at the number of training rows
n_folds = min(5, len(X_train))
grid_search = GridSearchCV(ridge_search_model, param_grid,
                           scoring='neg_mean_squared_error', cv=n_folds)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_alpha = grid_search.best_params_['estimator__alpha']

# GridSearchCV refits the best configuration on the whole training set by default,
# so that estimator is reused instead of being trained a second time
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model with the best hyperparameters
# NOTE(review): r2_score is not well defined for fewer than two test rows - check len(y_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the results
print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

In [None]:
# Actual against predicted values; all target columns share the same axes
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_title("Actual vs. Predicted Values")
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
plt.show()

In [None]:
# Residual = actual minus predicted, plotted against the prediction
residuals = y_test - y_pred

fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
# Dashed red reference line at zero error
ax.axhline(y=0, color='r', linestyle='--')
ax.set_title("Residual Plot")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("Residuals")
plt.show()

In [None]:
def predict_player_stats(player, model, current_season_data):
    """Run ``model`` on a player's rows and return the predictions as a DataFrame.

    Parameters
    ----------
    player : str or None
        Player whose rows should be used. When ``current_season_data`` has a
        'PLAYER' column, only rows equal to this name are kept; pass ``None``
        to use every row.
    model : object
        Fitted estimator whose ``predict`` accepts the five stat columns and
        returns one row of five values per input row.
    current_season_data : pandas.DataFrame
        Frame containing at least 'PTS', 'AST', 'REB', 'STL' and 'BLK'.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns 'PTS', 'AST', 'REB', 'STL', 'BLK'
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a 'PLAYER' column is present but no row matches ``player``.
    """
    # Stats used both as model input and as labels of the output columns
    selected_features = ['PTS', 'AST', 'REB', 'STL', 'BLK']

    rows = current_season_data
    # Honour the `player` argument so an unfiltered frame does not silently
    # produce predictions for every player it contains
    if player is not None and 'PLAYER' in rows.columns:
        rows = rows[rows['PLAYER'] == player]
        if rows.empty:
            raise ValueError(f"No rows found for player {player!r}")

    predicted_stats_next_season = model.predict(rows[selected_features])

    # Wrap the raw array so each value is labelled with its stat name
    return pd.DataFrame(data=predicted_stats_next_season, columns=selected_features)


# Predict from every row of the target player's data using the tuned model
predictions_df = predict_player_stats(
    player=target_player,
    model=best_model,
    current_season_data=target_data,
)

# Only the final row is shown
print("Predicted Stats for the next Season:")
print(predictions_df.iloc[-1:])

In [None]:
def plot_predicted_stats(stats, real_career_df, predicted_stats, player_input,
                         next_season='2024-25', season_type='Regular%20Season'):
    """Plot one stat across a player's seasons, followed by the predicted value.

    Parameters
    ----------
    stats : str
        Column to plot (for example 'PTS'); also used as the y-axis label.
    real_career_df : pandas.DataFrame
        Historical rows with at least 'PLAYER', 'Year' and the ``stats`` column.
    predicted_stats : pandas.DataFrame
        Predictions containing a ``stats`` column; its last row is plotted.
    player_input : str
        Player whose rows are selected from ``real_career_df``.
    next_season : str, optional
        Tick label for the predicted point.
    season_type : str or None, optional
        Value of the 'Season_Type' column to keep. Ignored when it is ``None``,
        when the column is absent, or when no row of the player has that value.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    real_career = real_career_df[real_career_df['PLAYER'] == player_input]

    # Only account for regular season: de-duplicating on Year alone keeps whichever
    # row comes first, so the season type is filtered explicitly when possible
    if season_type is not None and 'Season_Type' in real_career.columns:
        matches_type = real_career['Season_Type'] == season_type
        if matches_type.any():
            real_career = real_career[matches_type]

    # One row per season, in chronological order ('YYYY-YY' labels sort as text)
    real_career = real_career.drop_duplicates(subset='Year').sort_values('Year')

    # X-axis labels are the seasons; Y-axis values are the requested stat
    xvalues = real_career['Year'].tolist()
    yvalues = real_career[stats].tolist()

    # Append the prediction; float() accepts NumPy scalars and plain Python numbers alike
    xvalues.append(next_season)
    yvalues.append(float(predicted_stats[stats].iloc[-1]))

    # Plot against integer positions and relabel the ticks with the season strings
    positions = range(len(xvalues))
    plt.plot(positions, yvalues)
    plt.xticks(positions, xvalues, rotation='vertical')
    plt.ylabel(stats)
    plt.xlabel("Season")
    plt.show()


# Stat to chart for the selected player
desired_stat = 'PTS'
plot_predicted_stats(
    stats=desired_stat,
    real_career_df=data,
    predicted_stats=predictions_df,
    player_input=target_player,
)
    