In [None]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib
import streamlit as st




In [None]:
# Load the CSV into `merged_logs`; later cells read its PLAYER_NAME, GAME_DATE,
# MATCHUP and PTS columns (among others).
merged_logs = pd.read_csv('./Cleaned_nba_data.csv')

In [None]:
# Show the column index to check which fields the loaded table provides.
merged_logs.columns

# Feature engineering 

### Creating IS_HOME column

In [None]:
# IS_HOME is 0 when the MATCHUP string contains '@' and 1 otherwise.
merged_logs['IS_HOME'] = [
    0 if '@' in matchup else 1
    for matchup in merged_logs['MATCHUP']
]


In [None]:
# Rolling sum of PTS over each player's previous games (up to 10).
# shift() drops the current game from its own window, so the feature only uses
# games that come earlier in the sorted order.
# The window is row-order dependent, so the frame is sorted by player and game
# date before the window is applied. transform() returns values keyed by the
# original index, so the assignment lands on the right rows of the unsorted
# `merged_logs` (assumes the index is unique, as with the default read_csv
# index).
# NOTE(review): GAME_DATE is sorted as stored; if it is a string column the
# order is only chronological for ISO-like formats. Confirm.
merged_logs['PTS_rolling_sum'] = (
    merged_logs.sort_values(['PLAYER_NAME', 'GAME_DATE'])
    .groupby('PLAYER_NAME')['PTS']
    .transform(lambda pts: pts.shift().rolling(10, min_periods=1).sum())
)


In [None]:
# Rolling standard deviation of PTS over each player's previous games (up to 10).
# shift() drops the current game from its own window, so the feature only uses
# games that come earlier in the sorted order.
# The window is row-order dependent, so the frame is sorted by player and game
# date before the window is applied. transform() returns values keyed by the
# original index, so the assignment lands on the right rows of the unsorted
# `merged_logs` (assumes the index is unique, as with the default read_csv
# index).
# NOTE(review): GAME_DATE is sorted as stored; if it is a string column the
# order is only chronological for ISO-like formats. Confirm.
merged_logs['PTS_rolling_std'] = (
    merged_logs.sort_values(['PLAYER_NAME', 'GAME_DATE'])
    .groupby('PLAYER_NAME')['PTS']
    .transform(lambda pts: pts.shift().rolling(10, min_periods=1).std())
)


### Selecting Features 

In [None]:
# Column whose value the models predict.
target = ['PTS']

# Raw per-game columns that get a rolling average computed for them.
features = [
    'MIN', 'PASS', 'TCHS', 'FG3M', 'FG3A', 'STL',
    'NBA_FANTASY_PTS_RANK', 'PTS', 'PTS_RANK', 'WNBA_FANTASY_PTS_RANK',
]
# Engineered columns created in the earlier cells are averaged as well.
features += ['IS_HOME', 'PTS_rolling_std', 'PTS_rolling_sum']

In [None]:
# Order rows by player and then game date so each player's rows are contiguous
# and the rolling windows below follow that order.
merged_logs = merged_logs.sort_values(by=['PLAYER_NAME', 'GAME_DATE'])


def _lagged_mean(series):
    """Mean of up to the 10 preceding values; shift() excludes the current row."""
    return series.shift().rolling(10, min_periods=1).mean()


# For every entry in `features`, add a '<name>_rolling' column holding the
# per-player lagged rolling mean of that column.
for col in features:
    per_player = merged_logs.groupby('PLAYER_NAME')[col]
    merged_logs[col + '_rolling'] = per_player.transform(_lagged_mean)

In [None]:
# Spot-check the rolling columns against their raw values for players whose
# name contains 'LeBron'.
merged_logs.loc[
    merged_logs['PLAYER_NAME'].str.contains('LeBron'),
    ['PLAYER_NAME', 'GAME_DATE', 'MIN', 'MIN_rolling', 'FG3M', 'FG3M_rolling'],
]

### Selecting rolling features

In [None]:

# Model inputs: the '<name>_rolling' column of every entry in `features`, plus
# the un-averaged IS_HOME flag and the PTS_rolling_std column.
rolling_features = [f'{col}_rolling' for col in features] + ['IS_HOME', 'PTS_rolling_std']

# Average PTS over rows whose player name contains 'Stephen'.
merged_logs.loc[merged_logs['PLAYER_NAME'].str.contains('Stephen'), 'PTS'].mean()

In [None]:
# Points and rolling standard deviation per game for players whose name
# contains 'LeBron'.
merged_logs.loc[
    merged_logs['PLAYER_NAME'].str.contains('LeBron'),
    ['GAME_DATE', 'PTS', 'PTS_rolling_std'],
]

In [None]:
# Histogram of PTS_rolling_std for players whose name contains 'LeBron'.
# .copy() makes `lebron_data` an independent frame, so the GAME_DATE conversion
# below does not assign into a slice of `merged_logs` (which would trigger
# pandas' SettingWithCopyWarning).
lebron_data = merged_logs.loc[
    merged_logs['PLAYER_NAME'].str.contains('LeBron'),
    ['GAME_DATE', 'PTS', 'PTS_rolling_std'],
].copy()
lebron_data['GAME_DATE'] = pd.to_datetime(lebron_data['GAME_DATE'])


plt.figure(figsize=(12, 6))

plt.hist(lebron_data['PTS_rolling_std'], bins=10, label='Volatility (STD)')
plt.xlabel('Volatility (Standard Deviation)')
plt.ylabel('Frequency')
plt.title('LeBron’s Scoring Volatility Distribution')
plt.legend()
plt.show()

### Dropping null values and splitting the data set

In [None]:
# Drop rows that lack any model input; a player's earliest rows have NaN in the
# lagged rolling columns because shift() leaves nothing before them.
merged_logs = merged_logs.dropna(subset=rolling_features)

# Design matrix and target, then a fixed-seed 80/20 random split.
X, y = merged_logs[rolling_features], merged_logs['PTS']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


# Linear Regression

### Baseline model: gives a first RMSE and lets us analyze the coefficients

In [None]:

# Fit ordinary least squares on the training split (fit() returns the
# estimator itself), then predict the held-out rows.
lin_reg = lr().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

In [None]:
# Root-mean-squared error of `y_pred` against the test target: the square root
# of the mean squared error.
lin_rmse = mean_squared_error(y_test, y_pred) **0.5

In [None]:
# Display the RMSE computed in the previous cell.
lin_rmse

In [None]:
# Pair each fitted coefficient with its feature name and show them sorted from
# the most negative to the most positive.
lin_coef = lin_reg.coef_
lin_coef_1 = dict(coef=lin_reg.coef_, columns=X_train.columns)
lin_df = pd.DataFrame(lin_coef_1)
lin_df.sort_values(by='coef')

# Decision Tree

### The chosen model performs slightly better than the linear baseline, and plotting the tree makes it easy to interpret

In [None]:

# Depth-3 regression tree with a fixed seed; its test-set RMSE is stored in
# `dt_rmse`.
dt = DecisionTreeRegressor(max_depth=3, random_state=5)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_rmse = mean_squared_error(y_test, dt_pred) ** 0.5

# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index for this parameter.
plt.figure(figsize=(20, 10))
tree.plot_tree(dt, feature_names=list(X_train.columns), filled=True, rounded=True)
plt.title("Decision Tree Structure")
plt.show()


In [None]:

# RMSE of the decision tree on the test split. It scores `dt_pred` rather than
# `y_pred`, which at this point still holds the linear-regression predictions.
# sample_weight and multioutput are left at their defaults.
meansq_er = mean_squared_error(y_test, dt_pred) ** 0.5
meansq_er

In [None]:
# Predict the test split with the fitted decision tree `dt` and preview the
# first ten predictions. The notebook defines no `model` variable, so the tree
# is referenced by its actual name.
y_pred = dt.predict(X_test)
print(y_pred[:10])

In [None]:
# Persist the fitted decision tree to 'model.pkl'. joblib comes from the
# imports cell at the top of the notebook, so it is not re-imported here.
joblib.dump(dt, 'model.pkl')

In [None]:
# Save the fitted decision tree `dt` to 'decision_tree_model.pkl'.

joblib.dump(dt, 'decision_tree_model.pkl')

In [None]:
# Per-game residuals of the decision tree. `dt_pred` is referenced directly so
# this table does not depend on whichever model last wrote to `y_pred`.
result_df = pd.DataFrame({'actual': y_test, 'predicted': dt_pred})
# y_test keeps the row index of merged_logs, so an index join attaches the
# matching game date and player name to each prediction.
result = result_df.join(merged_logs[['GAME_DATE', 'PLAYER_NAME']])
result['high_difference'] = result['actual'] - result['predicted']
# Largest under-predictions (actual well above predicted) come first.
result.sort_values('high_difference', ascending=False)

## Random Forest

### The random forest's results are less practical to use, so it is included only as an experiment.

In [None]:
# Random forest with 100 trees, trained on all CPU cores (n_jobs=-1) with a
# fixed seed.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# mean_squared_error is already imported in the top imports cell.
rf_rmse = mean_squared_error(y_test, rf_pred) ** 0.5
print("Random Forest RMSE:", rf_rmse)

In [None]:

# Feature importances of the fitted forest, labelled by feature name and drawn
# as a horizontal bar chart in ascending order.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
importances.sort_values().plot.barh(figsize=(8, 6), title='Feature Importances')

In [None]:
# Per-game residuals of the random forest. This section evaluates `rf`, so the
# predictions come from `rf_pred` rather than `y_pred`, which belongs to an
# earlier model.
result = pd.DataFrame({'actual': y_test, 'predicted': rf_pred})
result['difference'] = result['actual'] - result['predicted']
# y_test keeps the row index of merged_logs, so an index join attaches the
# matching player name and game date to each prediction.
result = result.join(merged_logs[['PLAYER_NAME', 'GAME_DATE']], how='left')
result.sort_values('difference', ascending=False)

In [None]:
# List the columns selected by `rolling_features`, i.e. the model inputs.
merged_logs[rolling_features].columns



In [None]:
# Mean of the training target (PTS), a reference scale for the RMSE values.
y_train.mean()