In [1]:

import pandas as pd
import numpy as np
import warnings
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE
import joblib # joblib.dump(name,' .pkl'), joblib.load(' .pkl')
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [2]:
totalStats = pd.read_csv('csv/totalStats.csv', index_col = 0)

In [33]:
def determine_winner(game, x):
    if x == 1:
        return game[-3:]
    else:
        return game[-9:-6]
    
def aggregate_team_stats(df):
    # Group by game and team, then aggregate stats
    team_stats = df.groupby(['GAME', 'Team']).agg({
        'OFFRTG': 'mean',
        'DEFRTG': 'mean',
        'NETRTG': 'mean',
        'AST%': 'mean',
        'EFG%': 'mean',
        'TS%': 'mean',
        'USG%': 'mean',
        'PACE': 'mean',
        'PTS': 'sum',
        'REB': 'sum',
        'AST': 'sum',
        'STL': 'sum',
        'BLK': 'sum',
        'TOV': 'sum',
        '+/-': 'sum',
        'HOME': 'first'  # Assuming HOME is 1 for home team, 0 for away
    }).reset_index()
    return team_stats

def prepare_matchup_data(team_stats):
    # Create matchups by pairing home and away team stats
    home_stats = team_stats[team_stats['HOME'] == 1].set_index('GAME')
    away_stats = team_stats[team_stats['HOME'] == 0].set_index('GAME')
    
    matchups = home_stats.join(away_stats, lsuffix='_home', rsuffix='_away')
    
    # Calculate stat differentials
    for stat in ['OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'EFG%', 'TS%', 'USG%', 'PACE', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '+/-']:
        matchups[f'{stat}_diff'] = matchups[f'{stat}_home'] - matchups[f'{stat}_away']
    
    return matchups

def select_features(X, y, n_features=20):
    rfe = RFE(estimator=LogisticRegression(), n_features_to_select=n_features)
    rfe = rfe.fit(X, y)
    return X.columns[rfe.support_].tolist()

def train_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Logistic Regression
    log_reg = LogisticRegression(random_state=42)
    log_reg.fit(X_train_scaled, y_train)
    
    # Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    
    return log_reg, rf, scaler, X_test_scaled, y_test

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

team_stats = aggregate_team_stats(totalStats)
matchups = prepare_matchup_data(team_stats)
feature_cols = [col for col in matchups.columns if col.endswith('_diff')]
X = matchups[feature_cols]
y = (matchups['PTS_home'] > matchups['PTS_away']).astype(int)  # 1 if home team wins, 0 otherwise
combined = X.assign(W = y).reset_index()
win = combined[['GAME', 'W']].apply(lambda row: determine_winner(row['GAME'], row['W']), axis=1)
combined = combined.assign(WINNER = win)
selected_features = select_features(X, y)
X = X[selected_features]

# Train and evaluate models
log_reg, rf, scaler, X_test_scaled, y_test = train_model(X, y)

log_reg_accuracy, log_reg_report = evaluate_model(log_reg, X_test_scaled, y_test)
rf_accuracy, rf_report = evaluate_model(rf, X_test_scaled, y_test)

print("Logistic Regression Accuracy:", log_reg_accuracy)
print("Logistic Regression Report:\n", log_reg_report)

print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Report:\n", rf_report)

# Function to predict win probability for a new matchup
def predict_win_probability(model, scaler, home_stats, away_stats):
    matchup = pd.DataFrame()
    for stat in selected_features:
        base_stat = stat.replace('_diff', '')
        matchup[stat] = [home_stats[base_stat] - away_stats[base_stat]]
    
    matchup_scaled = scaler.transform(matchup)
    probability = model.predict_proba(matchup_scaled)[0][1]  # Probability of home team winning
    return probability

def dictate_line(model, scaler, home_stats, away_stats):
    prob = predict_win_probability(model, scaler, home_stats, away_stats)
    
    if prob >= .5:
        return -((prob/(1-prob))*100.00)
    else:
        return ((1-prob/prob) *100.00)

# win_prob = predict_win_probability(log_reg, scaler, home_team_stats, away_team_stats)

Logistic Regression Accuracy: 0.9959349593495935
Logistic Regression Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       132
           1       0.99      1.00      1.00       114

    accuracy                           1.00       246
   macro avg       1.00      1.00      1.00       246
weighted avg       1.00      1.00      1.00       246

Random Forest Accuracy: 1.0
Random Forest Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       132
           1       1.00      1.00      1.00       114

    accuracy                           1.00       246
   macro avg       1.00      1.00      1.00       246
weighted avg       1.00      1.00      1.00       246



In [4]:
totalStats
total_team_stats = totalStats.groupby('Team').agg({
        'OFFRTG': 'mean',
        'DEFRTG': 'mean',
        'NETRTG': 'mean',
        'AST%': 'mean',
        'EFG%': 'mean',
        'TS%': 'mean',
        'USG%': 'mean',
        'PACE': 'mean',
        'PTS': 'mean',
        'REB': 'mean',
        'AST': 'mean',
        'STL': 'mean',
        'BLK': 'mean',
        'TOV': 'mean',
        '+/-': 'mean',
    }).reset_index()
total_team_stats = total_team_stats.set_index('Team')

In [25]:
def fixStats(row):
    if 'Team' not in row:
        print(f"'Team' column not found in row. Available columns: {row.index.tolist()}")
        return row
    
    team = row['Team']
    for i in row['GAME'][-9:].replace(' ', '').split('@'):
        if i != team:
            opp = i
    try:
        for col in ['OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'EFG%', 'TS%', 'USG%', 'PACE', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '+/-']:
            if row['HOME'] == 1:
                row[col] = (total_team_stats.loc[team, col]  - total_team_stats.loc[opp, col])
            else:
                row[col] = (total_team_stats.loc[team, col]  - total_team_stats.loc[opp, col]) 
    except KeyError:
        print(f"Team {team} not found in total_team_stats")
    return row

temp = totalStats[['GAME','Team', 'W/L', 'OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'EFG%', 'TS%', 'USG%', 'PACE', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '+/-', 'HOME']]
temp['W/L'] = temp['W/L'].apply(lambda x: 1 if x == 'W' else 0)
temp = temp.apply(lambda x: fixStats(x), axis=1)

record = np.abs(temp['W/L'] - log_reg.predict_proba(temp[['OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'EFG%', 'TS%', 'USG%', 'PACE', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '+/-']])[:,1])
recordWrong = record[record >= .8]
recordIndex = recordWrong.index
team_accuracy = (temp.iloc[recordIndex].groupby('Team').count().sort_values(by='GAME', ascending=False)/880)['GAME']
print(f'Average distance in predicted probability and realized result: {record.mean()}')
print(f'% of predictions that are wrong by .8 or greater: {team_accuracy}')

Average distance in predicted probability and realized result: 0.31141455409366764
% of predictions that are wrong by .8 or greater: Team
MIL    0.456818
IND    0.417045
LAL    0.415909
SAC    0.401136
PHX    0.365909
NOP    0.363636
ATL    0.357955
OKC    0.334091
BKN    0.327273
DEN    0.325000
DAL    0.315909
ORL    0.314773
MEM    0.314773
CHI    0.304545
LAC    0.288636
MIN    0.279545
TOR    0.262500
CLE    0.260227
HOU    0.254545
UTA    0.253409
SAS    0.251136
PHI    0.237500
POR    0.236364
NYK    0.235227
GSW    0.232955
CHA    0.223864
BOS    0.206818
MIA    0.159091
WAS    0.142045
DET    0.090909
Name: GAME, dtype: float64




In [39]:
win_prob = dictate_line(log_reg, scaler, total_updated_stats.loc['DET'], total_updated_stats.loc['DEN'])
print(predict_win_probability(log_reg, scaler, total_updated_stats.loc['DET'], total_updated_stats.loc['DEN']))
win_prob

0.5549879099959395


-124.71299599767627

In [38]:
prob = win_prob
((1-prob)/prob) *100.00

  ((1-prob)/prob) *100.00


inf

In [7]:
# 8. Feature Importance Analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(log_reg.coef_[0])
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

Top 10 Most Important Features:
        feature  importance
8      PTS_diff    4.904633
14     +/-_diff    4.904633
0   OFFRTG_diff    0.551382
10     AST_diff    0.482423
6     USG%_diff    0.443420
2   NETRTG_diff    0.436132
13     TOV_diff    0.343063
3     AST%_diff    0.333246
9      REB_diff    0.297205
1   DEFRTG_diff    0.266822


In [None]:
with_trades = pd.read_csv('csv/updated_with_trades.csv')

In [20]:
selected_features

['OFFRTG_diff',
 'DEFRTG_diff',
 'NETRTG_diff',
 'AST%_diff',
 'EFG%_diff',
 'TS%_diff',
 'USG%_diff',
 'PACE_diff',
 'PTS_diff',
 'REB_diff',
 'AST_diff',
 'STL_diff',
 'BLK_diff',
 'TOV_diff',
 '+/-_diff']

In [31]:
total_updated_stats = with_trades.groupby('Team').agg({
        'OFFRTG': 'mean',
        'DEFRTG': 'mean',
        'NETRTG': 'mean',
        'AST%': 'mean',
        'EFG%': 'mean',
        'TS%': 'mean',
        'USG%': 'mean',
        'PACE': 'mean',
        'PTS': 'mean',
        'REB': 'mean',
        'AST': 'mean',
        'STL': 'mean',
        'BLK': 'mean',
        'TOV': 'mean',
        '+/-': 'mean',
    }).reset_index()
total_updated_stats.to_csv('updated_stats.csv')
total_updated_stats = total_updated_stats.set_index('Team')


In [24]:


total_updated_stats.set_index('Team')


Unnamed: 0_level_0,OFFRTG,DEFRTG,NETRTG,AST%,EFG%,TS%,USG%,PACE,PTS,REB,AST,STL,BLK,TOV,+/-
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ATL,109.930148,114.060296,-4.130417,13.908345,50.204307,53.712248,16.558277,107.505155,10.176312,4.258412,2.427995,0.725437,0.462988,1.161507,-0.866756
BKN,108.530726,113.15743,-4.628603,14.227039,50.50514,52.749609,18.530503,100.861073,9.620112,4.022346,2.176536,0.620112,0.472626,1.151955,-2.059218
BOS,118.681595,108.148712,10.532883,13.95816,55.006994,58.178773,17.791779,100.139497,11.932515,4.598773,2.665031,0.669939,0.658896,1.119018,5.522699
CHA,105.981081,116.442421,-10.46181,14.452996,49.732902,52.524324,18.016099,99.410435,10.627497,3.969448,2.520564,0.706228,0.423032,1.310223,-5.029377
CHI,109.720503,112.672704,-2.952453,13.379245,48.038994,50.020629,17.651572,100.973094,9.607547,3.977358,2.303145,0.555975,0.393711,1.09434,-0.36478
CLE,107.30213,111.361659,-4.059529,14.818049,50.69417,53.276009,17.44361,99.189585,10.577354,4.143498,2.609865,0.690583,0.456278,1.188341,1.054933
DAL,110.721862,111.440789,-0.715789,14.691802,49.881781,53.075506,17.975101,112.126123,10.520243,3.676113,2.518219,0.61336,0.421053,1.111336,1.191296
DEN,108.538239,110.913269,-2.378287,14.75187,49.993848,52.386369,18.102895,105.015042,10.053076,4.328106,2.494572,0.571773,0.496984,1.04222,1.788902
DET,109.208865,114.347211,-5.137251,13.055378,50.38506,52.913446,17.988745,101.941076,10.87251,4.322709,2.23008,0.670319,0.447211,1.258964,-2.0249
GSW,112.946624,110.751768,2.196785,15.355627,51.784137,54.23269,17.758628,110.424845,9.912111,4.031083,2.5209,0.667738,0.466238,1.158628,1.500536


In [8]:
with open('model.pkl','wb') as f:
    pickle.dump(log_reg,f)
    
with open('scaler.pkl', 'wb') as g:
    pickle.dump(scaler, g)