In [194]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [195]:
game_df = pd.read_csv('./data/gameStatistics.csv') # change path to ./data/gameStatistics contains the game_ids when cleaned properly
game_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,18.0,34.0,2.0,,20.0,,97.0,34.0,82.0,41.5,...,37.0,43.0,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks
1,25.0,46.0,12.0,,18.0,,107.0,40.0,90.0,44.4,...,39.0,44.0,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks
2,15.0,36.0,3.0,,10.0,,101.0,37.0,102.0,36.3,...,40.0,53.0,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks
3,8.0,36.0,0.0,,6.0,,72.0,28.0,88.0,31.8,...,46.0,49.0,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks
4,8.0,32.0,10.0,,9.0,,83.0,29.0,77.0,37.7,...,38.0,51.0,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks


In [196]:
game_df.describe()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus
count,2319.0,2319.0,2319.0,2009.0,2319.0,2009.0,3057.0,3057.0,3057.0,3057.0,...,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0
mean,12.34972,44.878827,11.448038,12.816327,16.35705,10.516177,108.734053,40.016683,87.152764,45.974583,...,35.553876,10.363428,33.831534,44.194962,24.219496,20.004907,7.586523,13.75139,4.94668,0.564933
std,6.277979,9.807574,8.957032,5.490177,6.282615,2.758519,13.566358,5.537892,7.39507,5.581095,...,8.777542,3.798734,5.491791,6.561973,5.220834,4.29061,2.974002,3.999961,2.554274,14.548342
min,0.0,14.0,0.0,0.0,0.0,4.0,64.0,21.0,54.0,27.1,...,0.0,1.0,16.0,26.0,7.0,8.0,0.0,3.0,0.0,-60.0
25%,8.0,38.0,5.0,9.0,12.0,9.0,100.0,36.0,82.0,42.2,...,29.7,8.0,30.0,40.0,21.0,17.0,5.0,11.0,3.0,-9.0
50%,12.0,44.0,10.0,12.0,16.0,10.0,109.0,40.0,87.0,45.9,...,35.5,10.0,34.0,44.0,24.0,20.0,7.0,14.0,5.0,1.0
75%,16.0,52.0,17.0,16.0,20.0,12.0,118.0,44.0,92.0,49.5,...,41.4,13.0,37.0,48.0,28.0,23.0,9.0,16.0,6.0,10.0
max,53.0,80.0,67.0,35.0,42.0,26.0,158.0,65.0,128.0,65.5,...,68.4,27.0,56.0,70.0,43.0,39.0,22.0,29.0,20.0,50.0


In [197]:
game_df.dtypes

visitor_fastBreakPoints       float64
visitor_pointsInPaint         float64
visitor_biggestLead           float64
visitor_secondChancePoints    float64
visitor_pointsOffTurnovers    float64
visitor_longestRun            float64
visitor_points                float64
visitor_fgm                   float64
visitor_fga                   float64
visitor_fgp                   float64
visitor_ftm                   float64
visitor_fta                   float64
visitor_ftp                   float64
visitor_tpm                   float64
visitor_tpa                   float64
visitor_tpp                   float64
visitor_offReb                float64
visitor_defReb                float64
visitor_totReb                float64
visitor_assists               float64
visitor_pFouls                float64
visitor_steals                float64
visitor_turnovers             float64
visitor_blocks                float64
visitor_plusMinus             float64
visitor_min                    object
game_id     

In [198]:
# Convert values in percentage columns to decimal form
game_df[['visitor_ftp', 'visitor_fgp', 'visitor_tpp', 'home_ftp', 'home_fgp','home_tpp']] = game_df[
    ['visitor_ftp', 
     'visitor_fgp', 
     'visitor_tpp', 
     'home_ftp', 
     'home_fgp',
     'home_tpp']
    ].apply(lambda x: x / 100)

In [199]:
# Drop game_id column
game_df = game_df.drop(columns='game_id')

In [200]:
# Based on the column data and the fact that the home_team has the same missing data as the visitor_team
# A function should be created to fill all the Null values with the average of those columns
# Later will see if this affects how well the machine learning model works

def fillNullCols(df):
    # Find all data type 'int' or 'float' columns that have Null values
    df.select_dtypes(include=['int', 'float'])

    # Looks at the total percentage of null values in each columns
    null_percentage = (df.isnull().sum() / len(df)) * 100

    # Rule of thumb: Moderate Null Percentage (10-30%)
    # Filling the null values with an appropriate value (like the mean, median, or mode) is often better in this range, especially if the column is critical for analysis

    # Get columns from null_percentage with at least 20% with null values
    null_columns = null_percentage[null_percentage >= 0.2].index

    # Fill rows with the mode of the column
    for column in null_columns:
        mode_fill_value = df[column].mode()[0]
        df[column] = df[column].fillna(mode_fill_value)

    return df

# Create new data frame
stats_df = fillNullCols(game_df)
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,18.0,34.0,2.0,11.0,20.0,9.0,97.0,34.0,82.0,0.415,...,37.0,43.0,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks
1,25.0,46.0,12.0,11.0,18.0,9.0,107.0,40.0,90.0,0.444,...,39.0,44.0,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks
2,15.0,36.0,3.0,11.0,10.0,9.0,101.0,37.0,102.0,0.363,...,40.0,53.0,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks
3,8.0,36.0,0.0,11.0,6.0,9.0,72.0,28.0,88.0,0.318,...,46.0,49.0,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks
4,8.0,32.0,10.0,11.0,9.0,9.0,83.0,29.0,77.0,0.377,...,38.0,51.0,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks


In [201]:
# Create target columns 'total_points'
stats_df['total_points'] = stats_df['visitor_points'] + stats_df['home_points']
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team,total_points
0,18.0,34.0,2.0,11.0,20.0,9.0,97.0,34.0,82.0,0.415,...,43.0,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks,218.0
1,25.0,46.0,12.0,11.0,18.0,9.0,107.0,40.0,90.0,0.444,...,44.0,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks,225.0
2,15.0,36.0,3.0,11.0,10.0,9.0,101.0,37.0,102.0,0.363,...,53.0,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks,203.0
3,8.0,36.0,0.0,11.0,6.0,9.0,72.0,28.0,88.0,0.318,...,49.0,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks,161.0
4,8.0,32.0,10.0,11.0,9.0,9.0,83.0,29.0,77.0,0.377,...,51.0,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks,193.0


In [202]:
# Look for correlation between columns and 'total_points' column
correlations = stats_df.corr(numeric_only=True)[target_column].sort_values(ascending=False)
correlations

total_points                  1.000000
visitor_points                0.844452
home_points                   0.829460
visitor_fgm                   0.725605
home_fgm                      0.684140
visitor_fgp                   0.525908
visitor_assists               0.506038
home_assists                  0.494517
visitor_tpm                   0.458288
visitor_fga                   0.440052
home_tpm                      0.427785
home_fga                      0.419075
visitor_pointsInPaint         0.379618
visitor_tpp                   0.372084
home_fgp                      0.324731
home_tpp                      0.322302
home_pointsInPaint            0.304784
visitor_tpa                   0.303786
home_tpa                      0.288918
home_ftm                      0.272157
visitor_ftm                   0.264036
home_fta                      0.236013
visitor_fta                   0.227624
visitor_pFouls                0.172668
home_secondChancePoints       0.157353
home_pFouls              

In [241]:
# Set target column
target_column = 'total_points'

# Look for correlation between columns and 'total_points' column
correlations = stats_df.corr(numeric_only=True)[target_column]

# Set conditions to find feature columns
feature_columns = correlations[correlations > 0.4]

# Create new data frame with chosen feature columns
new_df = stats_df[feature_columns.index]

# Add 'visitor_team' and 'home_team' to the new DataFrame
new_df = pd.concat([stats_df[['home_team', 'visitor_team']], new_df], axis=1)
new_df

Unnamed: 0,home_team,visitor_team,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_tpm,visitor_assists,home_points,home_fgm,home_fga,home_tpm,home_assists,total_points
0,Atlanta Hawks,Boston Celtics,97.0,34.0,82.0,0.415,9.0,23.0,121.0,45.0,80.0,12.0,33.0,218.0
1,Atlanta Hawks,Boston Celtics,107.0,40.0,90.0,0.444,11.0,26.0,118.0,46.0,88.0,17.0,31.0,225.0
2,Atlanta Hawks,Boston Celtics,101.0,37.0,102.0,0.363,11.0,27.0,102.0,35.0,86.0,5.0,23.0,203.0
3,Atlanta Hawks,Boston Celtics,72.0,28.0,88.0,0.318,5.0,16.0,89.0,32.0,82.0,11.0,20.0,161.0
4,Atlanta Hawks,Boston Celtics,83.0,29.0,77.0,0.377,7.0,19.0,110.0,42.0,96.0,14.0,30.0,193.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,Toronto Raptors,Washington Wizards,113.0,35.0,76.0,0.461,17.0,23.0,95.0,34.0,95.0,7.0,20.0,208.0
3054,Washington Wizards,Utah Jazz,127.0,48.0,90.0,0.533,10.0,24.0,116.0,47.0,93.0,7.0,28.0,243.0
3055,Utah Jazz,Washington Wizards,121.0,44.0,88.0,0.500,16.0,27.0,112.0,44.0,94.0,14.0,33.0,233.0
3056,Utah Jazz,Washington Wizards,124.0,48.0,98.0,0.490,15.0,32.0,128.0,48.0,87.0,14.0,26.0,252.0


In [236]:
new_df.isnull().sum()

home_team         0
visitor_team      0
visitor_points    1
visitor_fgm       1
home_points       1
home_fgm          1
total_points      1
dtype: int64

In [253]:
new_df = new_df.dropna()
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3057 entries, 0 to 3057
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   home_team        3057 non-null   object 
 1   visitor_team     3057 non-null   object 
 2   visitor_points   3057 non-null   float64
 3   visitor_fgm      3057 non-null   float64
 4   visitor_fga      3057 non-null   float64
 5   visitor_fgp      3057 non-null   float64
 6   visitor_tpm      3057 non-null   float64
 7   visitor_assists  3057 non-null   float64
 8   home_points      3057 non-null   float64
 9   home_fgm         3057 non-null   float64
 10  home_fga         3057 non-null   float64
 11  home_tpm         3057 non-null   float64
 12  home_assists     3057 non-null   float64
 13  total_points     3057 non-null   float64
dtypes: float64(12), object(2)
memory usage: 358.2+ KB


In [234]:
# # Apply one-hot encoding and convert to booleans to integers
# encoded_df = pd.get_dummies(new_df, columns=['visitor_team', 'home_team'])

# # Convert only the binary columns to integers
# encoded_df = encoded_df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# # Display
# encoded_df

In [255]:

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply one-hot encoding and convert to booleans to integers
encoded_df = pd.get_dummies(new_df, columns=['visitor_team', 'home_team'])

# Convert only the binary columns to integers
encoded_df = encoded_df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

# Create instance of scaler
scaler = MinMaxScaler()

# X = scaler.fit_transform(encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points']))
X = encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points'])
y = encoded_df['total_points']

X_scaled = scaler.fit_transform(X)

X_df = pd.DataFrame(X_scaled, columns=X.columns)

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit model
model = model.fit(X_train, y_train)


# # Making predictions using scaled data
predictions = model.predict(X_test)

# # Print predicted price for tested features
predictions

array([225.58, 226.78, 257.58, 236.79, 197.51, 202.44, 207.62, 227.81,
       224.23, 238.67, 234.06, 215.88, 205.79, 207.67, 228.42, 198.83,
       236.15, 207.75, 201.51, 200.63, 226.05, 183.83, 247.41, 202.02,
       232.65, 255.8 , 225.49, 226.42, 228.8 , 197.4 , 185.75, 236.28,
       221.97, 203.26, 218.08, 218.53, 200.65, 200.36, 191.52, 224.39,
       204.2 , 227.5 , 222.88, 254.21, 219.91, 210.22, 255.86, 203.76,
       211.81, 203.83, 219.05, 231.67, 239.54, 216.47, 239.68, 219.82,
       224.53, 215.35, 239.26, 209.9 , 211.45, 207.02, 188.85, 226.25,
       248.32, 246.93, 255.27, 210.14, 242.53, 240.14, 245.65, 240.46,
       231.17, 215.8 , 212.38, 223.3 , 225.22, 196.03, 220.32, 222.81,
       203.79, 216.02, 201.55, 229.15, 246.76, 241.32, 273.98, 204.21,
       203.35, 179.11, 203.21, 195.27, 239.38, 226.44, 208.82, 213.13,
       230.81, 253.16, 211.37, 240.6 , 200.36, 232.48, 256.15, 215.91,
       200.29, 197.78, 269.84, 218.29, 235.98, 215.86, 209.99, 226.91,
      

In [256]:
# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Absolute Error: 8.038137254901962
Mean Squared Error: 102.12137875816995
R^2 Score: 0.7908077110602924


In [240]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_team_{home_team}'] = 1
input_data[f'visitor_team_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_total_points = model.predict(input_scaled)[0]
print(f"Predicted Total Points: {predicted_total_points}")

Predicted Total Points: 159.63




# Winner Prediction

In [260]:
# Create a column to check if the home team won
encoded_df['home_win'] = encoded_df.apply(
    lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
    axis=1
)
encoded_df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_tpm,visitor_assists,home_points,home_fgm,home_fga,home_tpm,...,home_team_Orlando Magic,home_team_Philadelphia 76ers,home_team_Phoenix Suns,home_team_Portland Trail Blazers,home_team_Sacramento Kings,home_team_San Antonio Spurs,home_team_Toronto Raptors,home_team_Utah Jazz,home_team_Washington Wizards,home_win
0,97.0,34.0,82.0,0.415,9.0,23.0,121.0,45.0,80.0,12.0,...,0,0,0,0,0,0,0,0,0,1
1,107.0,40.0,90.0,0.444,11.0,26.0,118.0,46.0,88.0,17.0,...,0,0,0,0,0,0,0,0,0,1
2,101.0,37.0,102.0,0.363,11.0,27.0,102.0,35.0,86.0,5.0,...,0,0,0,0,0,0,0,0,0,1
3,72.0,28.0,88.0,0.318,5.0,16.0,89.0,32.0,82.0,11.0,...,0,0,0,0,0,0,0,0,0,1
4,83.0,29.0,77.0,0.377,7.0,19.0,110.0,42.0,96.0,14.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scaler = MinMaxScaler()

X = scaler.fit_transform(encoded_df.drop(columns=['home_win', 'home_points', 'total_points', 'visitor_points']))

X_df = pd.DataFrame(X, columns=encoded_df.drop(columns=['home_win', 'home_points', 'total_points', 'visitor_points']).columns)
y = encoded_df['home_win']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = DecisionTreeClassifier()

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,

In [267]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_team_{home_team}'] = 1
input_data[f'visitor_team_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_winner = model.predict(input_scaled)[0]

winning_team = home_team if  predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Predicted Winner: Miami Heat




In [266]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.7467320261437909
Confusion Matrix:
 [[217  73]
 [ 82 240]]
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       290
           1       0.77      0.75      0.76       322

    accuracy                           0.75       612
   macro avg       0.75      0.75      0.75       612
weighted avg       0.75      0.75      0.75       612

