In [18]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns; sns.set()
import statsmodels.formula.api as smf

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option("display.max_columns", None)

In [None]:
# Read the cleaned game logs and discard the 'Unnamed: 0' column
# (presumably an index that was saved into the CSV -- confirm against the export step).
df = pd.read_csv('Clean Datasets/All-Clean-Game-Logs.csv').drop(columns='Unnamed: 0')
df.head()

In [20]:
# Strip spaces and '%' from the column names so they are easy to reference later
# (the model formulas below use names such as OPP_FG_Percent):
#   ' ' -> '_'          e.g. 'Home Game'         -> 'Home_Game'
#   '%' -> '_Percent'   e.g. 'OPP FG%_rolling_5' -> 'OPP_FG_Percent_rolling_5'
# The names are derived by rule instead of a hand-written mapping: the old mapping
# listed many columns that map to themselves (no-ops) and would silently skip any
# newly added column. Every column the old mapping renamed gets the same new name.
def _clean_column_name(column_name):
    """Return ``column_name`` with '%' spelled out as '_Percent' and spaces replaced by '_'."""
    return column_name.replace('%', '_Percent').replace(' ', '_')


df = df.rename(columns=_clean_column_name)

In [None]:
# Encode the response as a binary indicator: 1 = win ('W'), 0 = anything else.
# The dtype guard makes the cell safe to re-run: without it, a second run would
# compare the already-encoded integers against 'W' and turn every result into 0.
if not pd.api.types.is_numeric_dtype(df['Result']):
    df['Result'] = (df['Result'] == 'W').astype('int64')
df.head()

In [None]:
# Season-based hold-out: fit on the four earlier seasons, evaluate on the latest one.
train_seasons = ['2019-2020', '2020-2021', '2021-2022', '2022-2023']
test_season = '2023-2024'

season = df['Season']
df_train = df[season.isin(train_seasons)]
df_test = df[season == test_season]

# Sanity check: each subset should list only the seasons intended for it.
print("Training Set:", df_train['Season'].unique())
print("Test Set:", df_test['Season'].unique())

In [None]:
# Baseline logistic regression of the win indicator on the five predictors
# identified as most correlated with Result.
# NOTE(review): in the rows displayed further down, Win_Streak > 0 exactly when
# Result == 1 and Lose_Streak > 0 exactly when Result == 0, so these columns appear
# to already contain the current game's outcome -- confirm before trusting this fit.
formula_1 = 'Result ~ Lose_Streak + Win_Streak + Points_Against + OPP_FG_Percent + FG_Percent'
model_1 = smf.logit(formula_1, data=df_train).fit()
print(model_1.summary())

In [None]:
# Columns model_1 needs in order to score a row.
features_1 = ['Lose_Streak', 'Win_Streak', 'Points_Against', 'OPP_FG_Percent', 'FG_Percent']

# In-sample: predicted win probabilities, rounded to a 0/1 label and compared with the truth.
train_predictions_1 = model_1.predict(df_train[features_1])
train_accuracy_1 = train_predictions_1.round().eq(df_train['Result']).mean()

# Out-of-sample: the same scoring on the held-out season.
test_predictions_1 = model_1.predict(df_test[features_1])
test_accuracy_1 = test_predictions_1.round().eq(df_test['Result']).mean()

print("Training Accuracy:", train_accuracy_1)
print("Test Accuracy:", test_accuracy_1)

In [None]:
# Second attempt: the first model's predictors looked like they made the outcome too
# easy to guess, so this one swaps in shooting, free-throw and match-context variables.
formula_2 = 'Result ~ FG_Percent + OPP_FG_Percent + OPP_FT + FT + Ranked_Match + Home_Game'
model_2 = smf.logit(formula_2, data=df_train).fit()
print(model_2.summary())

In [None]:
# Columns model_2 needs in order to score a row.
features_2 = ['OPP_FG_Percent', 'FG_Percent', 'OPP_FT', 'FT', 'Ranked_Match', 'Home_Game']

# In-sample: predicted win probabilities, rounded to a 0/1 label and compared with the truth.
train_predictions_2 = model_2.predict(df_train[features_2])
train_accuracy_2 = train_predictions_2.round().eq(df_train['Result']).mean()

# Out-of-sample: the same scoring on the held-out season.
test_predictions_2 = model_2.predict(df_test[features_2])
test_accuracy_2 = test_predictions_2.round().eq(df_test['Result']).mean()

print("Training Accuracy:", train_accuracy_2)
print("Test Accuracy:", test_accuracy_2)

In [None]:
# Diagnostic for model_2: deviance residuals against fitted values with a LOWESS trend line.
fig, ax = plt.subplots()
sns.regplot(
    x=model_2.fittedvalues,
    y=model_2.resid_dev,
    lowess=True,
    color='black',
    line_kws={'color': 'b'},
    ax=ax,
)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Deviance residuals')
ax.set_title('Residual Plot for Logistic Model')
plt.show()

In [None]:
# Third model: restrict the predictors to information meant to be available before
# tip-off (opponent strength, venue, and 5-game rolling shooting percentages).
# NOTE(review): Rank is blank (NaN) for unranked opponents in the frame shown below;
# the formula interface drops rows with missing values by default, so this fit
# presumably only uses games against ranked opponents -- confirm that is intended.
formula_3 = 'Result ~ SRS + Rank + Ranked_Match + Home_Game + FG_Percent_rolling_5 + OPP_FG_Percent_rolling_5'
model_3 = smf.logit(formula_3, data=df_train).fit()
print(model_3.summary())

In [None]:
# Columns model_3 needs in order to score a row.
features_3 = ['SRS', 'Rank', 'Ranked_Match', 'Home_Game', 'FG_Percent_rolling_5', 'OPP_FG_Percent_rolling_5']

# Predicted win probabilities for both splits.
train_predictions_3 = model_3.predict(df_train[features_3])
test_predictions_3 = model_3.predict(df_test[features_3])

# Rows with a missing predictor (Rank is NaN for unranked opponents) cannot be scored
# and come back as NaN. NaN.round() never equals Result, so comparing them directly
# would count every such game as a wrong prediction and deflate the accuracy.
# Score only the rows the model actually produced a probability for.
train_scored_3 = train_predictions_3.dropna()
test_scored_3 = test_predictions_3.dropna()

train_accuracy_3 = (train_scored_3.round() == df_train.loc[train_scored_3.index, 'Result']).mean()
test_accuracy_3 = (test_scored_3.round() == df_test.loc[test_scored_3.index, 'Result']).mean()

print("Training Accuracy:", train_accuracy_3)
print("Test Accuracy:", test_accuracy_3)
# Report coverage so the accuracies are read against the number of games they describe.
print("Rows scored (train):", len(train_scored_3), "of", len(df_train))
print("Rows scored (test):", len(test_scored_3), "of", len(df_test))

In [None]:
# Diagnostic for model_3: deviance residuals against fitted values with a LOWESS trend line.
fig, ax = plt.subplots()
sns.regplot(
    x=model_3.fittedvalues,
    y=model_3.resid_dev,
    lowess=True,
    color='black',
    line_kws={'color': 'b'},
    ax=ax,
)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Deviance residuals')
ax.set_title('Residual Plot for Logistic Model')
plt.show()

In [34]:
# Display the prepared frame (renamed columns, 0/1 Result); pandas elides the middle rows.
df

Unnamed: 0,Date,Time,Season,Type,Opponent,Rank,Ranked_Match,Conf,Rest_Days,SRS,Result,Points_For,Points_Against,W,L,Home_Game,Win_Streak,Lose_Streak,FG,FGA,FG_Percent,3P,3PA,3P_Percent,FT,FTA,FT_Percent,ORB,TRB,AST,STL,BLK,TOV,PF,OPP_FG,OPP_FGA,OPP_FG_Percent,OPP_3P,OPP_3PA,OPP_3P_Percent,OPP_FT,OPP_FTA,OPP_FT_Percent,OPP_ORB,OPP_TRB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,FG_rolling_5,FGA_rolling_5,FG_Percent_rolling_5,3P_rolling_5,3PA_rolling_5,3P_Percent_rolling_5,FT_rolling_5,FTA_rolling_5,FT_Percent_rolling_5,ORB_rolling_5,TRB_rolling_5,AST_rolling_5,STL_rolling_5,BLK_rolling_5,TOV_rolling_5,PF_rolling_5,OPP_FG_rolling_5,OPP_FGA_rolling_5,OPP_FG_Percent_rolling_5,OPP_3P_rolling_5,OPP_3PA_rolling_5,OPP_3P_Percent_rolling_5,OPP_FT_rolling_5,OPP_FTA_rolling_5,OPP_FT_Percent_rolling_5,OPP_ORB_rolling_5,OPP_TRB_rolling_5,OPP_AST_rolling_5,OPP_STL_rolling_5,OPP_BLK_rolling_5,OPP_TOV_rolling_5,OPP_PF_rolling_5
0,2019-11-05,8:00p,2019-2020,REG,Nicholls State,,False,Southland,0,-3.87,1,78,70,1,0,True,1,0,27,61,0.443,5,19,0.263,19,29,0.655,17,50,12,6,2,22,17,27,66,0.409,8,20,0.400,8,9,0.889,3,18,13,11,4,16,29,27.0,61.000000,0.4430,5.000000,19.0,0.263000,19.00,29.000000,0.65500,17.00,50.000000,12.000000,6.0,2.000000,22.000000,78.000000,27.000000,66.000000,0.409000,8.000000,20.000000,0.400000,8.000000,9.000000,0.889000,3.000000,18.000000,13.0,11.000000,4.000000,16.000000,29.00
1,2019-11-08,9:00p,2019-2020,REG,Grand Canyon,,False,WAC,3,-6.32,1,83,71,2,0,False,2,0,29,64,0.453,4,13,0.308,21,30,0.700,16,46,13,5,0,17,21,27,61,0.443,8,19,0.421,9,16,0.563,6,23,12,4,0,11,23,28.0,62.500000,0.4480,4.500000,16.0,0.285500,20.00,29.500000,0.67750,16.50,48.000000,12.500000,5.5,1.000000,19.500000,80.500000,27.000000,63.500000,0.426000,8.000000,19.500000,0.410500,8.500000,12.500000,0.726000,4.500000,20.500000,12.5,7.500000,2.000000,13.500000,26.00
2,2019-11-10,9:00p,2019-2020,REG,Arizona,21.0,True,Pac-12,2,19.49,0,69,90,2,1,False,0,1,25,53,0.472,5,16,0.313,14,20,0.700,6,25,10,4,2,22,22,34,61,0.557,7,16,0.438,15,22,0.682,10,32,20,14,3,16,23,27.0,59.333333,0.4560,4.666667,16.0,0.294667,18.00,26.333333,0.68500,13.00,40.333333,11.666667,5.0,1.333333,20.333333,76.666667,29.333333,62.666667,0.469667,7.666667,18.333333,0.419667,10.666667,15.666667,0.711333,6.333333,24.333333,15.0,9.666667,2.333333,14.333333,25.00
3,2019-11-18,8:00p,2019-2020,REG,Hawaii,,False,Big West,8,-1.27,1,66,53,3,1,True,1,0,23,54,0.426,5,12,0.417,15,23,0.652,10,43,14,3,3,15,8,24,64,0.375,3,22,0.136,2,3,0.667,4,25,9,7,2,9,18,26.0,58.000000,0.4485,4.750000,15.0,0.325250,17.25,25.500000,0.67675,12.25,41.000000,12.250000,4.5,1.750000,19.000000,74.000000,28.000000,63.000000,0.446000,6.500000,19.250000,0.348750,8.500000,12.500000,0.700250,5.750000,24.500000,13.5,9.000000,2.250000,13.000000,23.25
4,2019-11-20,9:00p,2019-2020,REG,The Citadel,,False,Southern,2,-13.82,1,85,57,4,1,True,2,0,33,68,0.485,3,18,0.167,16,21,0.762,21,53,17,4,4,13,13,21,74,0.284,10,38,0.263,5,7,0.714,15,27,10,8,2,11,19,27.4,60.000000,0.4558,4.400000,15.6,0.293600,17.00,24.600000,0.69380,14.00,43.400000,13.200000,4.4,2.200000,17.800000,76.200000,26.600000,65.200000,0.413600,7.200000,23.000000,0.331600,7.800000,11.400000,0.703000,7.600000,25.000000,12.8,8.800000,2.200000,12.600000,22.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,2024-01-18,8:30p,2023-2024,REG,Michigan,,False,Big Ten,0,5.28,1,88,73,13,4,False,1,0,30,62,0.484,6,18,0.333,22,30,0.733,13,38,12,9,4,6,13,30,63,0.476,3,13,0.231,10,12,0.833,7,25,10,1,0,9,18,25.0,61.400000,0.4084,6.200000,20.2,0.309000,20.00,26.000000,0.77360,10.60,35.400000,10.200000,6.2,4.000000,7.000000,80.200000,29.800000,65.000000,0.459200,3.800000,15.800000,0.239000,9.800000,13.000000,0.758400,9.200000,31.600000,11.6,2.800000,3.200000,7.800000,16.20
162,2024-01-21,6:09p,2023-2024,NCAA,Connecticut,1.0,True,Big East,3,26.70,0,52,77,29,9,False,0,1,32,63,0.508,6,27,0.222,16,27,0.593,11,43,17,7,3,12,12,26,69,0.377,3,14,0.214,8,11,0.727,9,30,8,7,3,14,19,26.8,63.000000,0.4264,6.000000,21.4,0.286800,19.60,27.000000,0.72860,11.40,37.800000,11.400000,7.0,3.400000,8.000000,73.600000,29.200000,65.000000,0.450600,3.400000,14.800000,0.229200,10.400000,13.800000,0.761000,8.600000,30.800000,11.2,3.400000,3.000000,9.200000,17.00
163,2024-01-21,1:00p,2023-2024,REG,Rutgers,,False,Big Ten,0,7.41,1,86,63,14,4,True,2,0,32,63,0.508,6,27,0.222,16,27,0.593,11,43,17,7,3,12,12,26,69,0.377,3,14,0.214,8,11,0.727,9,30,8,7,3,14,19,29.0,62.800000,0.4624,6.000000,22.4,0.276600,19.00,27.600000,0.68880,11.60,39.400000,13.200000,7.4,3.400000,8.800000,73.000000,28.400000,65.800000,0.433600,3.200000,14.200000,0.225000,9.600000,12.600000,0.765200,8.400000,29.400000,10.0,4.000000,2.400000,10.600000,17.80
164,2024-01-24,9:00p,2023-2024,REG,Northwestern,,False,Big Ten,3,12.77,0,91,96,14,5,False,0,1,31,76,0.408,11,24,0.458,18,24,0.750,20,44,14,5,3,10,15,38,69,0.551,11,18,0.611,9,13,0.692,6,31,16,6,5,5,19,31.0,65.200000,0.4784,7.000000,22.8,0.313600,18.80,27.600000,0.68040,13.60,41.200000,14.400000,7.4,3.400000,9.200000,77.800000,30.000000,66.600000,0.451400,4.600000,14.400000,0.300200,9.000000,11.800000,0.762400,7.600000,28.200000,10.4,4.400000,2.200000,10.200000,18.60


In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Features: the float64 columns only. Integer, boolean and string columns are left out,
# which also keeps the raw score columns (Points_For, Points_Against, ...) away from the
# model. The target is excluded by name instead of relying on it not being float64.
feature_cols = [col for col in df.select_dtypes(include='float64').columns if col != 'Result']
non_numeric = [col for col in df.columns if col not in feature_cols]  # every column NOT used as a feature

# Drop a row only when one of the values the model actually uses is missing; a blanket
# dropna() would also throw rows away for NaNs in columns that are never fed to the forest.
# NOTE(review): Rank is NaN for unranked opponents, so this still keeps only games against
# ranked teams (the recorded run had just 10 test rows) -- decide whether Rank should be
# imputed or dropped as a feature.
df_clean = df.dropna(subset=feature_cols + ['Result'])

X = df_clean[feature_cols]  # predictors
Y = df_clean['Result']      # target: 1 = win, 0 = loss

# NOTE(review): this is a random 80/20 split, unlike the season-based split used for the
# logistic models, and the features include same-game percentages (FG_Percent, ...) that
# are not known before tip-off -- confirm both choices are intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Random forest classifier; the seed is fixed so the fit is reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

In [50]:
# Predicted class labels (0 = loss, 1 = win) for the held-out rows.
y_prediction = random_forest.predict(X_test)
y_prediction

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [53]:
# Put each held-out row's true label and predicted label next to its features.
# assign() returns a new frame, so X_test keeps exactly the columns the forest was
# trained on. Wrapping X_test in pd.DataFrame(...) does not copy it, and on older pandas
# versions the added columns may show up on X_test too (or raise SettingWithCopyWarning),
# which would break a later random_forest.predict(X_test).
# 'actual' is aligned on the row index; 'prediction' is positional, in X_test row order.
test_data = X_test.assign(actual=Y_test, prediction=y_prediction)
test_data.head()

Unnamed: 0,Rank,SRS,FG_Percent,3P_Percent,FT_Percent,OPP_FG_Percent,OPP_3P_Percent,OPP_FT_Percent,FG_rolling_5,FGA_rolling_5,FG_Percent_rolling_5,3P_rolling_5,3PA_rolling_5,3P_Percent_rolling_5,FT_rolling_5,FTA_rolling_5,FT_Percent_rolling_5,ORB_rolling_5,TRB_rolling_5,AST_rolling_5,STL_rolling_5,BLK_rolling_5,TOV_rolling_5,PF_rolling_5,OPP_FG_rolling_5,OPP_FGA_rolling_5,OPP_FG_Percent_rolling_5,OPP_3P_rolling_5,OPP_3PA_rolling_5,OPP_3P_Percent_rolling_5,OPP_FT_rolling_5,OPP_FTA_rolling_5,OPP_FT_Percent_rolling_5,OPP_ORB_rolling_5,OPP_TRB_rolling_5,OPP_AST_rolling_5,OPP_STL_rolling_5,OPP_BLK_rolling_5,OPP_TOV_rolling_5,OPP_PF_rolling_5,actual,prediction
132,4.0,19.27,0.35,0.333,0.846,0.435,0.294,0.6,26.2,57.2,0.4592,7.8,25.4,0.2958,10.2,18.2,0.578,8.6,39.0,12.8,4.0,4.2,11.4,76.4,21.8,62.4,0.353,4.6,15.8,0.3044,8.0,14.0,0.5868,8.6,33.2,5.8,6.2,1.2,7.2,18.0,0,0
84,3.0,19.15,0.414,0.308,0.8,0.508,0.409,0.722,24.6,54.4,0.451,7.8,23.6,0.3298,10.4,16.2,0.6176,5.2,29.8,15.0,4.0,3.0,8.4,67.4,23.2,59.2,0.3898,4.8,18.2,0.2624,12.6,16.0,0.7914,8.8,32.0,10.2,4.6,3.2,8.6,18.0,0,0
87,19.0,13.18,0.45,0.296,0.739,0.468,0.381,0.667,24.2,56.4,0.4306,8.4,25.6,0.329,13.8,19.8,0.6956,7.2,31.4,14.6,6.0,2.8,10.6,70.6,26.4,60.0,0.439,6.4,19.0,0.3292,11.0,16.0,0.6842,8.2,31.8,13.8,7.0,3.2,11.2,18.8,1,0
160,4.0,22.3,0.484,0.333,0.733,0.476,0.231,0.833,23.6,60.0,0.3952,6.4,20.8,0.309,19.2,24.4,0.7906,9.4,34.0,10.0,5.0,4.4,7.2,76.8,29.6,66.2,0.448,4.2,17.0,0.2454,8.8,12.0,0.7346,10.2,33.4,11.6,3.4,4.0,7.4,15.6,1,0
119,14.0,14.94,0.475,0.259,0.8,0.579,0.429,0.789,26.0,56.6,0.4592,6.0,23.6,0.2598,14.4,19.6,0.7268,9.0,34.6,11.0,5.0,4.8,12.2,72.2,24.0,57.2,0.4226,8.0,22.0,0.356,12.2,15.6,0.7388,6.0,27.6,10.0,6.0,2.2,9.8,17.8,0,1


In [56]:
# Share of held-out rows whose predicted label matches the true Result.
accuracy = accuracy_score(Y_test, y_prediction)
print("Accuracy:", accuracy)

Accuracy: 0.5


In [57]:
# Per-class precision, recall, F1 and support on the held-out rows.
class_report = classification_report(Y_test, y_prediction)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.71      0.67         7
           1       0.00      0.00      0.00         3

    accuracy                           0.50        10
   macro avg       0.31      0.36      0.33        10
weighted avg       0.44      0.50      0.47        10

