In [97]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [118]:
game_df = pd.read_csv('./data/gameStatistics.csv') # change path to ./data/gameStatistics contains the game_ids when cleaned properly
game_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,6.0,38.0,3.0,9.0,13.0,11.0,92.0,30.0,86.0,34.9,...,34.0,43.0,19.0,21.0,10.0,11.0,11.0,16.0,240:00,Utah Jazz
1,,,,,,,101.0,42.0,101.0,41.6,...,43.0,49.0,29.0,20.0,4.0,15.0,8.0,25.0,240:00,Utah Jazz
2,,,,,,,130.0,51.0,91.0,56.0,...,23.0,29.0,31.0,17.0,11.0,12.0,0.0,-8.0,240:00,Utah Jazz
3,,,,,,,121.0,43.0,98.0,43.9,...,40.0,49.0,25.0,23.0,4.0,16.0,5.0,8.0,240:00,Utah Jazz
4,,,,,,,120.0,44.0,94.0,46.8,...,36.0,47.0,24.0,16.0,7.0,7.0,10.0,8.0,240:00,Utah Jazz


In [99]:
game_df.describe()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus
count,809.0,809.0,809.0,809.0,809.0,809.0,3899.0,3899.0,3899.0,3899.0,...,3899.0,3899.0,3899.0,3899.0,3899.0,3899.0,3899.0,3899.0,3899.0,3899.0
mean,12.229913,46.964153,12.116193,13.187886,16.055624,10.856613,114.057194,41.71762,88.223134,47.375763,...,35.520031,10.482944,32.912798,43.395742,25.416261,19.718646,7.501154,13.825083,4.849962,-1.481662
std,6.008578,10.027682,9.581921,5.713827,6.396404,2.769925,12.591627,5.33039,7.422455,5.560856,...,8.304942,3.899705,5.376019,6.681733,5.065183,4.273138,2.946189,3.997649,2.516504,14.962425
min,0.0,20.0,0.0,0.0,2.0,6.0,73.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-62.0
25%,8.0,40.0,5.0,9.0,11.0,9.0,106.0,38.0,83.0,43.6,...,30.0,8.0,29.0,39.0,22.0,17.0,5.0,11.0,3.0,-11.0
50%,12.0,46.0,10.0,13.0,16.0,10.0,114.0,42.0,88.0,47.2,...,35.3,10.0,33.0,43.0,25.0,20.0,7.0,14.0,5.0,-2.0
75%,16.0,54.0,17.0,17.0,20.0,12.0,122.0,45.0,93.0,51.2,...,41.0,13.0,36.0,48.0,29.0,22.0,9.0,16.0,6.0,8.0
max,33.0,82.0,51.0,32.0,40.0,23.0,175.0,65.0,121.0,66.3,...,63.4,29.0,60.0,73.0,47.0,35.0,22.0,33.0,19.0,56.0


In [100]:
game_df.dtypes

visitor_fastBreakPoints       float64
visitor_pointsInPaint         float64
visitor_biggestLead           float64
visitor_secondChancePoints    float64
visitor_pointsOffTurnovers    float64
visitor_longestRun            float64
visitor_points                float64
visitor_fgm                   float64
visitor_fga                   float64
visitor_fgp                   float64
visitor_ftm                   float64
visitor_fta                   float64
visitor_ftp                   float64
visitor_tpm                   float64
visitor_tpa                   float64
visitor_tpp                   float64
visitor_offReb                float64
visitor_defReb                float64
visitor_totReb                float64
visitor_assists               float64
visitor_pFouls                float64
visitor_steals                float64
visitor_turnovers             float64
visitor_blocks                float64
visitor_plusMinus             float64
visitor_min                    object
game_id     

In [101]:
# Convert values in percentage columns to decimal form
game_df[['visitor_ftp', 'visitor_fgp', 'visitor_tpp', 'home_ftp', 'home_fgp','home_tpp']] = game_df[
    ['visitor_ftp', 
     'visitor_fgp', 
     'visitor_tpp', 
     'home_ftp', 
     'home_fgp',
     'home_tpp']
    ].apply(lambda x: x / 100)

In [102]:
# Drop game_id column
game_df = game_df.drop(columns='game_id')

In [103]:
# Based on the column data and the fact that the home_team has the same missing data as the visitor_team
# A function should be created to fill all the Null values with the average of those columns
# Later will see if this affects how well the machine learning model works

def fillNullCols(df):
    # Find all data type 'int' or 'float' columns that have Null values
    df.select_dtypes(include=['int', 'float'])

    # Looks at the total percentage of null values in each columns
    null_percentage = (df.isnull().sum() / len(df)) * 100

    # Rule of thumb: Moderate Null Percentage (10-30%)
    # Filling the null values with an appropriate value (like the mean, median, or mode) is often better in this range, especially if the column is critical for analysis

    # Get columns from null_percentage with at least 20% with null values
    null_columns = null_percentage[null_percentage >= 0.2].index

    # Fill rows with the mode of the column
    for column in null_columns:
        mode_fill_value = df[column].mode()[0]
        df[column] = df[column].fillna(mode_fill_value)

    return df

# Create new data frame
stats_df = fillNullCols(game_df)
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,6.0,38.0,3.0,9.0,13.0,11.0,92.0,30.0,86.0,0.349,...,34.0,43.0,19.0,21.0,10.0,11.0,11.0,16.0,240:00,Utah Jazz
1,10.0,46.0,0.0,12.0,14.0,10.0,101.0,42.0,101.0,0.416,...,43.0,49.0,29.0,20.0,4.0,15.0,8.0,25.0,240:00,Utah Jazz
2,10.0,46.0,0.0,12.0,14.0,10.0,130.0,51.0,91.0,0.56,...,23.0,29.0,31.0,17.0,11.0,12.0,0.0,-8.0,240:00,Utah Jazz
3,10.0,46.0,0.0,12.0,14.0,10.0,121.0,43.0,98.0,0.439,...,40.0,49.0,25.0,23.0,4.0,16.0,5.0,8.0,240:00,Utah Jazz
4,10.0,46.0,0.0,12.0,14.0,10.0,120.0,44.0,94.0,0.468,...,36.0,47.0,24.0,16.0,7.0,7.0,10.0,8.0,240:00,Utah Jazz


In [104]:
# Create target columns 'total_points'
stats_df['total_points'] = stats_df['visitor_points'] + stats_df['home_points']
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team,total_points
0,6.0,38.0,3.0,9.0,13.0,11.0,92.0,30.0,86.0,0.349,...,43.0,19.0,21.0,10.0,11.0,11.0,16.0,240:00,Utah Jazz,200.0
1,10.0,46.0,0.0,12.0,14.0,10.0,101.0,42.0,101.0,0.416,...,49.0,29.0,20.0,4.0,15.0,8.0,25.0,240:00,Utah Jazz,227.0
2,10.0,46.0,0.0,12.0,14.0,10.0,130.0,51.0,91.0,0.56,...,29.0,31.0,17.0,11.0,12.0,0.0,-8.0,240:00,Utah Jazz,252.0
3,10.0,46.0,0.0,12.0,14.0,10.0,121.0,43.0,98.0,0.439,...,49.0,25.0,23.0,4.0,16.0,5.0,8.0,240:00,Utah Jazz,250.0
4,10.0,46.0,0.0,12.0,14.0,10.0,120.0,44.0,94.0,0.468,...,47.0,24.0,16.0,7.0,7.0,10.0,8.0,240:00,Utah Jazz,248.0


In [105]:
# Look for correlation between columns and 'total_points' column
correlations = stats_df.corr(numeric_only=True)[target_column].sort_values(ascending=False)
correlations

total_points                  1.000000
visitor_points                0.804404
home_points                   0.798394
visitor_fgm                   0.669358
home_fgm                      0.660487
visitor_fgp                   0.501122
visitor_assists               0.436473
home_assists                  0.430590
visitor_tpp                   0.396681
visitor_tpm                   0.377386
home_tpp                      0.357579
home_tpm                      0.333521
home_fga                      0.320666
visitor_fga                   0.317487
home_ftm                      0.272311
visitor_ftm                   0.260121
home_fta                      0.254031
visitor_fta                   0.236870
visitor_pFouls                0.186741
visitor_pointsInPaint         0.176630
home_pFouls                   0.161013
home_pointsInPaint            0.149133
home_fgp                      0.141170
visitor_tpa                   0.126088
home_tpa                      0.091308
home_fastBreakPoints     

In [106]:
# Set target column
target_column = 'total_points'

# Look for correlation between columns and 'total_points' column
correlations = stats_df.corr(numeric_only=True)[target_column]

# Set conditions to find feature columns
feature_columns = correlations[correlations > 0.4]

# Create new data frame with chosen feature columns
new_df = stats_df[feature_columns.index]

# Add 'visitor_team' and 'home_team' to the new DataFrame
new_df = pd.concat([stats_df[['home_team', 'visitor_team']], new_df], axis=1)
new_df

Unnamed: 0,home_team,visitor_team,visitor_points,visitor_fgm,visitor_fgp,visitor_assists,home_points,home_fgm,home_assists,total_points
0,Utah Jazz,Washington Wizards,92.0,30.0,0.349,15.0,108.0,41.0,19.0,200.0
1,Utah Jazz,Washington Wizards,101.0,42.0,0.416,21.0,126.0,48.0,29.0,227.0
2,Utah Jazz,Washington Wizards,130.0,51.0,0.560,24.0,122.0,45.0,31.0,252.0
3,Utah Jazz,Washington Wizards,121.0,43.0,0.439,25.0,129.0,46.0,25.0,250.0
4,Utah Jazz,Washington Wizards,120.0,44.0,0.468,28.0,128.0,47.0,24.0,248.0
...,...,...,...,...,...,...,...,...,...,...
3896,Utah Jazz,Washington Wizards,113.0,35.0,0.461,23.0,95.0,34.0,20.0,208.0
3897,Utah Jazz,Washington Wizards,121.0,44.0,0.500,27.0,112.0,44.0,33.0,233.0
3898,Utah Jazz,Washington Wizards,108.0,43.0,0.489,33.0,123.0,50.0,34.0,231.0
3899,Utah Jazz,Washington Wizards,120.0,42.0,0.488,16.0,112.0,46.0,20.0,232.0


In [107]:
new_df.isnull().sum()

home_team          0
visitor_team       0
visitor_points     2
visitor_fgm        2
visitor_fgp        2
visitor_assists    2
home_points        2
home_fgm           2
home_assists       2
total_points       2
dtype: int64

In [108]:
new_df = new_df.dropna()
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3899 entries, 0 to 3900
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   home_team        3899 non-null   object 
 1   visitor_team     3899 non-null   object 
 2   visitor_points   3899 non-null   float64
 3   visitor_fgm      3899 non-null   float64
 4   visitor_fgp      3899 non-null   float64
 5   visitor_assists  3899 non-null   float64
 6   home_points      3899 non-null   float64
 7   home_fgm         3899 non-null   float64
 8   home_assists     3899 non-null   float64
 9   total_points     3899 non-null   float64
dtypes: float64(8), object(2)
memory usage: 335.1+ KB


In [109]:
# Apply one-hot encoding and convert to booleans to integers
encoded_df = pd.get_dummies(new_df, columns=['visitor_team', 'home_team'])

# Convert only the binary columns to integers
encoded_df = encoded_df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# Display
encoded_df

Unnamed: 0,visitor_points,visitor_fgm,visitor_fgp,visitor_assists,home_points,home_fgm,home_assists,total_points,visitor_team_Washington Wizards,home_team_Utah Jazz
0,92.0,30.0,0.349,15.0,108.0,41.0,19.0,200.0,1,1
1,101.0,42.0,0.416,21.0,126.0,48.0,29.0,227.0,1,1
2,130.0,51.0,0.560,24.0,122.0,45.0,31.0,252.0,1,1
3,121.0,43.0,0.439,25.0,129.0,46.0,25.0,250.0,1,1
4,120.0,44.0,0.468,28.0,128.0,47.0,24.0,248.0,1,1
...,...,...,...,...,...,...,...,...,...,...
3896,113.0,35.0,0.461,23.0,95.0,34.0,20.0,208.0,1,1
3897,121.0,44.0,0.500,27.0,112.0,44.0,33.0,233.0,1,1
3898,108.0,43.0,0.489,33.0,123.0,50.0,34.0,231.0,1,1
3899,120.0,42.0,0.488,16.0,112.0,46.0,20.0,232.0,1,1


In [110]:

# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Apply one-hot encoding and convert to booleans to integers
# encoded_df = pd.get_dummies(new_df, columns=['visitor_team', 'home_team'])

# # Convert only the binary columns to integers
# encoded_df = encoded_df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# # Create instance of scaler
# scaler = MinMaxScaler()

# # X = scaler.fit_transform(encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points']))
# X = encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points'])
# y = encoded_df['total_points']

# X_scaled = scaler.fit_transform(X)

# X_df = pd.DataFrame(X_scaled, columns=X.columns)

# # Splitting into testing/train data
# X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# # Create decision tree regression instance
# model = RandomForestRegressor(n_estimators=100, random_state=42)

# # Fit model
# model = model.fit(X_train, y_train)


# # # Making predictions using scaled data
# predictions = model.predict(X_test)

# # # Print predicted price for tested features
# predictions

In [111]:
# # Evaluate the model
# mae = mean_absolute_error(y_test, predictions)
# mse = mean_squared_error(y_test, predictions)
# r2 = r2_score(y_test, predictions)

# print(f"Mean Absolute Error: {mae}")
# print(f"Mean Squared Error: {mse}")
# print(f"R^2 Score: {r2}")

In [112]:
# # Define the home and visitor teams
# home_team = input("Enter Away Team: ")  # Replace with desired home team
# visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# # Create a new data instance with zeros for all one-hot columns
# input_data = {col: 0 for col in X_df.columns}
# input_data[f'home_team_{home_team}'] = 1
# input_data[f'visitor_team_{visitor_team}'] = 1

# # Convert to DataFrame
# input_df = pd.DataFrame([input_data])

# # Apply the same MinMaxScaler used on X_df
# input_scaled = scaler.transform(input_df)

# # Predict total points
# predicted_total_points = model.predict(input_scaled)[0]
# print(f"Predicted Total Points: {predicted_total_points}")

# Winner Prediction

In [122]:
# Create a column to check if the home team won
encoded_df['home_win'] = encoded_df.apply(
    lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
    axis=1
)
print(encoded_df.head())

   visitor_points  visitor_fgm  visitor_fgp  visitor_assists  home_points  \
0            92.0         30.0        0.349             15.0        108.0   
1           101.0         42.0        0.416             21.0        126.0   
2           130.0         51.0        0.560             24.0        122.0   
3           121.0         43.0        0.439             25.0        129.0   
4           120.0         44.0        0.468             28.0        128.0   

   home_fgm  home_assists  total_points  visitor_team_Washington Wizards  \
0      41.0          19.0         200.0                                1   
1      48.0          29.0         227.0                                1   
2      45.0          31.0         252.0                                1   
3      46.0          25.0         250.0                                1   
4      47.0          24.0         248.0                                1   

   home_team_Utah Jazz  home_win  
0                    1         1  
1         

In [120]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scaler = MinMaxScaler()

X = scaler.fit_transform(encoded_df.drop(columns=['home_win', 'home_points', 'total_points', 'visitor_points']))

X_df = pd.DataFrame(X, columns=encoded_df.drop(columns=['home_win', 'home_points', 'total_points', 'visitor_points']).columns)
y = encoded_df['home_win']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = DecisionTreeClassifier()

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,

In [123]:
X_df.columns

Index(['visitor_fgm', 'visitor_fgp', 'visitor_assists', 'home_fgm',
       'home_assists', 'visitor_team_Washington Wizards',
       'home_team_Utah Jazz'],
      dtype='object')

In [128]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_team_{home_team}'] = 1
input_data[f'visitor_team_{visitor_team}'] = 1

# # Convert to DataFrame
input_df = pd.DataFrame([input_data])

for col in X_df.columns:
    if col not in input_df.columns:
        input_df[col] = 0

input_df = input_df[X_df.columns]

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_winner = model.predict(input_scaled)[0]

winning_team = home_team if  predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Predicted Winner: Utah Jazz




In [129]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.6692307692307692
Confusion Matrix:
 [[320 127]
 [131 202]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.72      0.71       447
           1       0.61      0.61      0.61       333

    accuracy                           0.67       780
   macro avg       0.66      0.66      0.66       780
weighted avg       0.67      0.67      0.67       780

