In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [3]:
game_df = pd.read_csv('./data/gameStatistics.csv') # change path to ./data/gameStatistics contains the game_ids when cleaned properly
game_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,18.0,34.0,2.0,,20.0,,97.0,34.0,82.0,41.5,...,37.0,43.0,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks
1,25.0,46.0,12.0,,18.0,,107.0,40.0,90.0,44.4,...,39.0,44.0,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks
2,15.0,36.0,3.0,,10.0,,101.0,37.0,102.0,36.3,...,40.0,53.0,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks
3,8.0,36.0,0.0,,6.0,,72.0,28.0,88.0,31.8,...,46.0,49.0,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks
4,8.0,32.0,10.0,,9.0,,83.0,29.0,77.0,37.7,...,38.0,51.0,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks


In [4]:
game_df.describe()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus
count,2319.0,2319.0,2319.0,2009.0,2319.0,2009.0,3057.0,3057.0,3057.0,3057.0,...,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0,3057.0
mean,12.34972,44.878827,11.448038,12.816327,16.35705,10.516177,108.734053,40.016683,87.152764,45.974583,...,35.553876,10.363428,33.831534,44.194962,24.219496,20.004907,7.586523,13.75139,4.94668,0.564933
std,6.277979,9.807574,8.957032,5.490177,6.282615,2.758519,13.566358,5.537892,7.39507,5.581095,...,8.777542,3.798734,5.491791,6.561973,5.220834,4.29061,2.974002,3.999961,2.554274,14.548342
min,0.0,14.0,0.0,0.0,0.0,4.0,64.0,21.0,54.0,27.1,...,0.0,1.0,16.0,26.0,7.0,8.0,0.0,3.0,0.0,-60.0
25%,8.0,38.0,5.0,9.0,12.0,9.0,100.0,36.0,82.0,42.2,...,29.7,8.0,30.0,40.0,21.0,17.0,5.0,11.0,3.0,-9.0
50%,12.0,44.0,10.0,12.0,16.0,10.0,109.0,40.0,87.0,45.9,...,35.5,10.0,34.0,44.0,24.0,20.0,7.0,14.0,5.0,1.0
75%,16.0,52.0,17.0,16.0,20.0,12.0,118.0,44.0,92.0,49.5,...,41.4,13.0,37.0,48.0,28.0,23.0,9.0,16.0,6.0,10.0
max,53.0,80.0,67.0,35.0,42.0,26.0,158.0,65.0,128.0,65.5,...,68.4,27.0,56.0,70.0,43.0,39.0,22.0,29.0,20.0,50.0


In [5]:
game_df.dtypes

visitor_fastBreakPoints       float64
visitor_pointsInPaint         float64
visitor_biggestLead           float64
visitor_secondChancePoints    float64
visitor_pointsOffTurnovers    float64
visitor_longestRun            float64
visitor_points                float64
visitor_fgm                   float64
visitor_fga                   float64
visitor_fgp                   float64
visitor_ftm                   float64
visitor_fta                   float64
visitor_ftp                   float64
visitor_tpm                   float64
visitor_tpa                   float64
visitor_tpp                   float64
visitor_offReb                float64
visitor_defReb                float64
visitor_totReb                float64
visitor_assists               float64
visitor_pFouls                float64
visitor_steals                float64
visitor_turnovers             float64
visitor_blocks                float64
visitor_plusMinus             float64
visitor_min                    object
game_id     

In [6]:
# Based on the column data and the fact that the home_team has the same missing data as the visitor_team
# A function should be created to fill all the Null values with the average of those columns
# Later will see if this affects how well the machine learning model works

def fillNullCols(df):
    # Find all data type 'int' or 'float' columns that have Null values
    df.select_dtypes(include=['int', 'float'])

    # Looks at the total percentage of null values in each columns
    null_percentage = (df.isnull().sum() / len(df)) * 100

    # Rule of thumb: Moderate Null Percentage (10-30%)
    # Filling the null values with an appropriate value (like the mean, median, or mode) is often better in this range, especially if the column is critical for analysis

    # Get columns from null_percentage with at least 20% with null values
    null_columns = null_percentage[null_percentage >= 20].index

    # Fill rows with the mode of the column
    for column in null_columns:
        mode_fill_value = df[column].mode()[0]
        df[column] = df[column].fillna(mode_fill_value)

    return df

# Create new data frame
stats_df = fillNullCols(game_df)
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,18.0,34.0,2.0,11.0,20.0,9.0,97.0,34.0,82.0,41.5,...,37.0,43.0,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks
1,25.0,46.0,12.0,11.0,18.0,9.0,107.0,40.0,90.0,44.4,...,39.0,44.0,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks
2,15.0,36.0,3.0,11.0,10.0,9.0,101.0,37.0,102.0,36.3,...,40.0,53.0,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks
3,8.0,36.0,0.0,11.0,6.0,9.0,72.0,28.0,88.0,31.8,...,46.0,49.0,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks
4,8.0,32.0,10.0,11.0,9.0,9.0,83.0,29.0,77.0,37.7,...,38.0,51.0,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks


In [None]:
# Create target columns 'total_points'
stats_df['total_points'] = stats_df['visitor_points'] + stats_df['home_points']
stats_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team,home_win,total_points
0,18.0,34.0,2.0,11.0,20.0,9.0,97.0,34.0,82.0,41.5,...,33.0,25.0,8.0,20.0,7.0,24.0,240:00,Atlanta Hawks,1,218.0
1,25.0,46.0,12.0,11.0,18.0,9.0,107.0,40.0,90.0,44.4,...,31.0,22.0,10.0,17.0,10.0,11.0,240:00,Atlanta Hawks,1,225.0
2,15.0,36.0,3.0,11.0,10.0,9.0,101.0,37.0,102.0,36.3,...,23.0,20.0,4.0,11.0,9.0,1.0,240:00,Atlanta Hawks,1,203.0
3,8.0,36.0,0.0,11.0,6.0,9.0,72.0,28.0,88.0,31.8,...,20.0,16.0,7.0,15.0,15.0,17.0,240:00,Atlanta Hawks,1,161.0
4,8.0,32.0,10.0,11.0,9.0,9.0,83.0,29.0,77.0,37.7,...,30.0,17.0,13.0,12.0,3.0,27.0,240:00,Atlanta Hawks,1,193.0


In [46]:
# Set target column
target_column = 'total_points'

# Look for correlation between columns and 'total_points' column
correlations = stats_df.corr(numeric_only=True)[target_column].sort_values(ascending=True)

# Set conditions to find feature columns
feature_columns = correlations[correlations > 0.3]

# Create new data frame with chosen feature columns
new_df = stats_df[feature_columns.index]

# Add 'visitor_team' and 'home_team' to the new DataFrame
new_df = pd.concat([stats_df[['home_team', 'visitor_team']], new_df], axis=1)
new_df

Unnamed: 0,home_team,visitor_team,visitor_tpa,home_pointsInPaint,game_id,home_tpp,home_fgp,visitor_tpp,visitor_pointsInPaint,home_fga,...,visitor_fga,visitor_tpm,home_assists,visitor_assists,visitor_fgp,home_fgm,visitor_fgm,home_points,visitor_points,total_points
0,Atlanta Hawks,Boston Celtics,21.0,50.0,319,50.0,56.2,42.9,34.0,80.0,...,82.0,9.0,33.0,23.0,41.5,45.0,34.0,121.0,97.0,218.0
1,Atlanta Hawks,Boston Celtics,28.0,44.0,1300,51.5,52.3,39.3,46.0,88.0,...,90.0,11.0,31.0,26.0,44.4,46.0,40.0,118.0,107.0,225.0
2,Atlanta Hawks,Boston Celtics,35.0,52.0,1344,19.2,40.7,31.4,36.0,86.0,...,102.0,11.0,23.0,27.0,36.3,35.0,37.0,102.0,101.0,203.0
3,Atlanta Hawks,Boston Celtics,28.0,32.0,1353,37.9,39.0,17.9,36.0,82.0,...,88.0,5.0,20.0,16.0,31.8,32.0,28.0,89.0,72.0,161.0
4,Atlanta Hawks,Boston Celtics,29.0,44.0,1376,40.0,43.8,24.1,32.0,96.0,...,77.0,7.0,30.0,19.0,37.7,42.0,29.0,110.0,83.0,193.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,Toronto Raptors,Washington Wizards,43.0,42.0,14072,21.2,68.4,39.5,46.0,95.0,...,76.0,17.0,20.0,23.0,46.1,34.0,35.0,95.0,113.0,208.0
3054,Washington Wizards,Utah Jazz,30.0,56.0,6974,31.8,50.5,33.3,68.0,93.0,...,90.0,10.0,28.0,24.0,53.3,47.0,48.0,116.0,127.0,243.0
3055,Utah Jazz,Washington Wizards,36.0,42.0,11234,35.0,85.0,44.4,46.0,94.0,...,88.0,16.0,33.0,27.0,50.0,44.0,44.0,112.0,121.0,233.0
3056,Utah Jazz,Washington Wizards,37.0,58.0,5520,43.8,55.2,40.5,52.0,87.0,...,98.0,15.0,26.0,32.0,49.0,48.0,48.0,128.0,124.0,252.0


In [47]:
new_df.isnull().sum()

home_team                0
visitor_team             0
visitor_tpa              1
home_pointsInPaint       0
game_id                  0
home_tpp                 1
home_fgp                 1
visitor_tpp              1
visitor_pointsInPaint    0
home_fga                 1
home_tpm                 1
visitor_fga              1
visitor_tpm              1
home_assists             1
visitor_assists          1
visitor_fgp              1
home_fgm                 1
visitor_fgm              1
home_points              1
visitor_points           1
total_points             1
dtype: int64

In [48]:
new_df = new_df.dropna()

In [49]:
# Apply one-hot encoding and convert to booleans to integers
encoded_df = pd.get_dummies(new_df, columns=['visitor_team', 'home_team']).astype(int)

# Display
encoded_df

Unnamed: 0,visitor_tpa,home_pointsInPaint,game_id,home_tpp,home_fgp,visitor_tpp,visitor_pointsInPaint,home_fga,home_tpm,visitor_fga,...,home_team_Oklahoma City Thunder,home_team_Orlando Magic,home_team_Philadelphia 76ers,home_team_Phoenix Suns,home_team_Portland Trail Blazers,home_team_Sacramento Kings,home_team_San Antonio Spurs,home_team_Toronto Raptors,home_team_Utah Jazz,home_team_Washington Wizards
0,21,50,319,50,56,42,34,80,12,82,...,0,0,0,0,0,0,0,0,0,0
1,28,44,1300,51,52,39,46,88,17,90,...,0,0,0,0,0,0,0,0,0,0
2,35,52,1344,19,40,31,36,86,5,102,...,0,0,0,0,0,0,0,0,0,0
3,28,32,1353,37,39,17,36,82,11,88,...,0,0,0,0,0,0,0,0,0,0
4,29,44,1376,40,43,24,32,96,14,77,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,43,42,14072,21,68,39,46,95,7,76,...,0,0,0,0,0,0,0,1,0,0
3054,30,56,6974,31,50,33,68,93,7,90,...,0,0,0,0,0,0,0,0,0,1
3055,36,42,11234,35,85,44,46,94,14,88,...,0,0,0,0,0,0,0,0,1,0
3056,37,58,5520,43,55,40,52,87,14,98,...,0,0,0,0,0,0,0,0,1,0


In [53]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scaler = MinMaxScaler()

X = scaler.fit_transform(encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points']))

X_df = pd.DataFrame(X, columns=encoded_df.drop(columns=['total_points', 'home_points', 'visitor_points']).columns)

y = encoded_df['total_points']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([224.81, 229.08, 257.13, 238.57, 195.23, 198.63, 205.08, 223.9 ,
       225.48, 237.03, 235.15, 214.49, 206.71, 207.12, 226.32, 196.73,
       233.3 , 209.11, 201.75, 202.18, 226.7 , 180.79, 249.96, 202.16,
       233.25, 253.68, 222.56, 226.24, 229.96, 199.28, 186.63, 239.73,
       223.59, 202.12, 214.44, 219.38, 198.11, 197.61, 190.24, 223.16,
       204.94, 226.21, 222.2 , 254.37, 219.13, 211.14, 254.55, 202.24,
       209.98, 201.7 , 219.23, 234.27, 239.68, 218.57, 240.18, 219.7 ,
       222.31, 216.22, 240.7 , 213.07, 205.39, 202.63, 188.21, 225.33,
       251.2 , 247.25, 254.49, 211.65, 242.89, 239.3 , 243.27, 243.51,
       229.67, 214.41, 211.65, 222.52, 222.74, 199.68, 220.06, 222.31,
       201.16, 214.02, 199.77, 228.97, 246.05, 243.07, 271.67, 204.75,
       203.51, 177.59, 204.61, 197.01, 239.67, 224.02, 208.09, 216.  ,
       232.22, 255.84, 209.2 , 244.33, 200.93, 233.57, 257.64, 215.56,
       202.17, 199.77, 269.39, 217.59, 238.19, 215.74, 211.94, 228.23,
      

In [57]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_team_{home_team}'] = 1
input_data[f'visitor_team_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_total_points = model.predict(input_scaled)[0]
print(f"Predicted Total Points: {predicted_total_points}")

Predicted Total Points: 142.8


