In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Load the cleaned NBA game table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='NBA_cleaned.csv')

# Print the column index as plain text, then end the cell on the first five
# rows so the notebook renders them as a table.
print(df.columns)
df.head(5)


Index(['game_id', 'Game Date', 'Home Days Since Last Game',
       'Home Games Last 7 Days', 'Home Is Back-to-Back',
       'Away Days Since Last Game', 'Away Games Last 7 Days',
       'Away Is Back-to-Back', 'Away Team Recent 10 Win Rate',
       'Home Team Recent 10 Win Rate', 'Home Team Recent 3 Months Win Rate',
       'Away Team Recent 3 Months Win Rate', 'home_pts_avg_last10',
       'home_reb_avg_last10', 'home_ast_avg_last10',
       'home_plus_minus_avg_last10', 'away_pts_avg_last10',
       'away_reb_avg_last10', 'away_ast_avg_last10',
       'away_plus_minus_avg_last10', 'target'],
      dtype='object')


Unnamed: 0,game_id,Game Date,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,...,Away Team Recent 3 Months Win Rate,home_pts_avg_last10,home_reb_avg_last10,home_ast_avg_last10,home_plus_minus_avg_last10,away_pts_avg_last10,away_reb_avg_last10,away_ast_avg_last10,away_plus_minus_avg_last10,target
0,29900423,2000-01-02,5.0,1,False,3.0,1,False,0.4,0.7,...,0.4375,92.5,43.6,24.7,-0.6,102.2,45.4,20.0,2.4,1.0
1,29900427,2000-01-03,5.0,2,False,5.0,1,False,0.6,0.4,...,0.666667,101.7,47.7,19.1,-0.5,103.9,42.6,21.7,4.5,0.0
2,29900426,2000-01-03,4.0,1,False,8.0,0,False,0.8,0.4,...,0.785714,93.6,42.7,21.8,-0.5,97.1,43.0,24.4,-3.3,1.0
3,29900429,2000-01-03,5.0,2,False,11.0,0,False,0.7,0.8,...,0.75,94.8,39.3,24.3,6.7,100.5,46.1,22.1,2.7,1.0
4,29900428,2000-01-03,5.0,2,False,13.0,0,False,0.3,0.2,...,0.25,81.6,40.8,19.5,-14.4,96.6,44.8,24.4,6.1,0.0


In [2]:
# Column-wise cleaning: discard every column that holds at least one missing
# value. In the recorded run this removes the eight `*_avg_last10` columns and
# leaves only the schedule and win-rate features.
# NOTE: this rebinds `df`, so the raw frame is no longer reachable under that
# name after this cell has run.
df = df.dropna(axis="columns", how="any")

In [3]:
# Display the columns that remain in `df` at this point in the notebook.
df.columns

Index(['game_id', 'Game Date', 'Home Days Since Last Game',
       'Home Games Last 7 Days', 'Home Is Back-to-Back',
       'Away Days Since Last Game', 'Away Games Last 7 Days',
       'Away Is Back-to-Back', 'Away Team Recent 10 Win Rate',
       'Home Team Recent 10 Win Rate', 'Home Team Recent 3 Months Win Rate',
       'Away Team Recent 3 Months Win Rate', 'target'],
      dtype='object')

In [4]:
# Per-column missing-value audit of the current `df`.
# NOTE: in the recorded run this prints an empty frame, because `df` has
# already had every NaN-containing column removed by the time this cell runs.

# Absolute number of missing entries in each column.
null_counts = df.isna().sum()

# The same counts expressed as a percentage of the row count.
null_percent = null_counts.div(len(df)).mul(100)

# One table with both measures, restricted to columns that actually have
# missing values and ordered from most to least affected.
null_summary = (
    pd.DataFrame(
        {
            "Missing Count": null_counts,
            "Missing Percentage (%)": null_percent,
        }
    )
    .loc[lambda summary: summary["Missing Count"] > 0]
    .sort_values(by="Missing Count", ascending=False)
)

print(null_summary)

Empty DataFrame
Index: []


In [5]:
# Display the columns of `df` again; no cell between the previous column
# listing and this one reassigns `df`, so the result is the same.
df.columns

Index(['game_id', 'Game Date', 'Home Days Since Last Game',
       'Home Games Last 7 Days', 'Home Is Back-to-Back',
       'Away Days Since Last Game', 'Away Games Last 7 Days',
       'Away Is Back-to-Back', 'Away Team Recent 10 Win Rate',
       'Home Team Recent 10 Win Rate', 'Home Team Recent 3 Months Win Rate',
       'Away Team Recent 3 Months Win Rate', 'target'],
      dtype='object')

In [6]:
# Label vector: the `target` column of the column-dropped frame.
y = df['target']

# Feature matrix: everything except the game identifier, the game date and
# the label itself.
X = df.drop(['game_id', 'Game Date', 'target'], axis='columns')

# Preview the features that will be fed to the models.
X.head(5)

Unnamed: 0,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,Home Team Recent 3 Months Win Rate,Away Team Recent 3 Months Win Rate
0,5.0,1,False,3.0,1,False,0.4,0.7,0.769231,0.4375
1,5.0,2,False,5.0,1,False,0.6,0.4,0.461538,0.666667
2,4.0,1,False,8.0,0,False,0.8,0.4,0.357143,0.785714
3,5.0,2,False,11.0,0,False,0.7,0.8,0.785714,0.75
4,5.0,2,False,13.0,0,False,0.3,0.2,0.166667,0.25


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): this is a shuffled, unstratified split. The first rows of the
# data are ordered by `Game Date`, so if the features are rolling statistics
# a time-based split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline 1: random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_true=y_test, y_pred=rf_pred))


Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.55      0.48      0.51      2574
         1.0       0.67      0.73      0.70      3671

    accuracy                           0.63      6245
   macro avg       0.61      0.60      0.60      6245
weighted avg       0.62      0.63      0.62      6245



In [8]:
# Baseline 2: gradient boosting with default hyperparameters and a fixed seed.
# It uses whichever X_train / X_test / y_train / y_test are currently bound,
# i.e. the split produced by the most recently executed split cell.
gb = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
gb_pred = gb.predict(X_test)

print("Gradient Boosting Classification Report:")
print(classification_report(y_true=y_test, y_pred=gb_pred))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         0.0       0.60      0.45      0.51      2574
         1.0       0.67      0.79      0.72      3671

    accuracy                           0.65      6245
   macro avg       0.63      0.62      0.62      6245
weighted avg       0.64      0.65      0.64      6245



In [9]:
# Second experiment: reload the same file and clean it row-wise instead.
# Every row with at least one missing value is discarded, which keeps all
# columns (including the `*_avg_last10` statistics) available as features.
NBA = pd.read_csv('NBA_cleaned.csv').dropna(axis="index", how="any")

# Show the retained columns.
NBA.columns

Index(['game_id', 'Game Date', 'Home Days Since Last Game',
       'Home Games Last 7 Days', 'Home Is Back-to-Back',
       'Away Days Since Last Game', 'Away Games Last 7 Days',
       'Away Is Back-to-Back', 'Away Team Recent 10 Win Rate',
       'Home Team Recent 10 Win Rate', 'Home Team Recent 3 Months Win Rate',
       'Away Team Recent 3 Months Win Rate', 'home_pts_avg_last10',
       'home_reb_avg_last10', 'home_ast_avg_last10',
       'home_plus_minus_avg_last10', 'away_pts_avg_last10',
       'away_reb_avg_last10', 'away_ast_avg_last10',
       'away_plus_minus_avg_last10', 'target'],
      dtype='object')

In [10]:
# Label vector for the row-cleaned frame.
y1 = NBA['target']

# Feature matrix: all columns except the game identifier, the game date and
# the label.
X1 = NBA.drop(['game_id', 'Game Date', 'target'], axis='columns')

# Preview the cleaned frame (identifier, date and label columns included).
NBA.head(5)

Unnamed: 0,game_id,Game Date,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,...,Away Team Recent 3 Months Win Rate,home_pts_avg_last10,home_reb_avg_last10,home_ast_avg_last10,home_plus_minus_avg_last10,away_pts_avg_last10,away_reb_avg_last10,away_ast_avg_last10,away_plus_minus_avg_last10,target
0,29900423,2000-01-02,5.0,1,False,3.0,1,False,0.4,0.7,...,0.4375,92.5,43.6,24.7,-0.6,102.2,45.4,20.0,2.4,1.0
1,29900427,2000-01-03,5.0,2,False,5.0,1,False,0.6,0.4,...,0.666667,101.7,47.7,19.1,-0.5,103.9,42.6,21.7,4.5,0.0
2,29900426,2000-01-03,4.0,1,False,8.0,0,False,0.8,0.4,...,0.785714,93.6,42.7,21.8,-0.5,97.1,43.0,24.4,-3.3,1.0
3,29900429,2000-01-03,5.0,2,False,11.0,0,False,0.7,0.8,...,0.75,94.8,39.3,24.3,6.7,100.5,46.1,22.1,2.7,1.0
4,29900428,2000-01-03,5.0,2,False,13.0,0,False,0.3,0.2,...,0.25,81.6,40.8,19.5,-14.4,96.6,44.8,24.4,6.1,0.0


In [11]:
# Same 80/20 seeded split as before, now on the wider feature set X1 / y1.
# NOTE: this rebinds X_train, X_test, y_train and y_test, replacing the split
# created from X / y earlier in the notebook; any cell executed afterwards
# sees this split.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

# Random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_true=y_test, y_pred=rf_pred))

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.61      0.48      0.53      2594
         1.0       0.68      0.78      0.73      3650

    accuracy                           0.65      6244
   macro avg       0.64      0.63      0.63      6244
weighted avg       0.65      0.65      0.65      6244



In [12]:
# Gradient boosting with default hyperparameters and a fixed seed, trained
# and evaluated on whichever split is currently bound to X_train / X_test /
# y_train / y_test (the X1 / y1 split when the notebook is run top to bottom).
gb = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
gb_pred = gb.predict(X_test)

print("Gradient Boosting Classification Report:")
print(classification_report(y_true=y_test, y_pred=gb_pred))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

         0.0       0.63      0.46      0.53      2594
         1.0       0.68      0.80      0.74      3650

    accuracy                           0.66      6244
   macro avg       0.65      0.63      0.64      6244
weighted avg       0.66      0.66      0.65      6244

