In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

# Clean Data Frames

In [2]:
# Read in game scores data
game_score_df = pd.read_excel('./data/apiGameScores.xlsx') 
game_score_df.head()

Unnamed: 0,game_ids,date,visitor,home,visitor_points,home_points
0,319,2015-11-25T01:00:00.000Z,Boston Celtics,Atlanta Hawks,97.0,121.0
1,1300,2016-04-09T23:30:00.000Z,Boston Celtics,Atlanta Hawks,107.0,118.0
2,1344,2016-04-16T23:00:00.000Z,Boston Celtics,Atlanta Hawks,101.0,102.0
3,1353,2016-04-19T23:00:00.000Z,Boston Celtics,Atlanta Hawks,72.0,89.0
4,1376,2016-04-27T00:30:00.000Z,Boston Celtics,Atlanta Hawks,83.0,110.0


In [3]:
game_score_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14242 entries, 0 to 14241
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_ids        14242 non-null  int64  
 1   date            14242 non-null  object 
 2   visitor         14242 non-null  object 
 3   home            14242 non-null  object 
 4   visitor_points  12952 non-null  float64
 5   home_points     12952 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 667.7+ KB


In [4]:
game_score_df = game_score_df.dropna()
game_score_df = game_score_df.reset_index(drop=True)
game_score_df.shape

(12952, 6)

# Machine Learning Model

In [128]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Option 1: Create Data Frame to find stats one-way
visitor_home_df = game_score_df[(game_score_df['visitor'] == f'{visitor_team}') & (game_score_df['home'] == f'{home_team}')]
visitor_home_df
# Option 2: Create a Data Frame to find stats both-ways
# game_score_df.loc[
#         ((game_score_df['visitor'] == visitor_team) & (game_score_df['home'] == home_team)) | 
#         ((game_score_df['visitor'] == home_team) & (game_score_df['home'] == visitor_team))
#         ]

Unnamed: 0,game_ids,date,visitor,home,visitor_points,home_points
197,831,2016-02-02T01:00:00.000Z,Dallas Mavericks,Atlanta Hawks,97.0,112.0
198,2429,2017-03-02T00:30:00.000Z,Dallas Mavericks,Atlanta Hawks,95.0,100.0
199,2906,2017-10-12T23:30:00.000Z,Dallas Mavericks,Atlanta Hawks,108.0,94.0
200,3403,2017-12-24T00:30:00.000Z,Dallas Mavericks,Atlanta Hawks,107.0,112.0
201,4438,2018-10-24T23:00:00.000Z,Dallas Mavericks,Atlanta Hawks,104.0,111.0
202,7226,2020-02-23T00:30:00.000Z,Dallas Mavericks,Atlanta Hawks,107.0,111.0
203,8511,2021-02-04T00:30:00.000Z,Dallas Mavericks,Atlanta Hawks,122.0,116.0
204,9579,2021-10-21T23:30:00.000Z,Dallas Mavericks,Atlanta Hawks,87.0,113.0
205,12216,2023-04-02T22:00:00.000Z,Dallas Mavericks,Atlanta Hawks,130.0,132.0
206,13182,2024-01-27T00:00:00.000Z,Dallas Mavericks,Atlanta Hawks,148.0,143.0


In [126]:
# Drop 'date' columns first
new_df = visitor_home_df.drop(columns=['date', 'game_ids'], axis=1)

# Apply one-hot encoding and convert to booleans to integers
encoded_df = pd.get_dummies(new_df, columns=['visitor', 'home']).astype(int)

# Display
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Boston Celtics,home_Atlanta Hawks
0,97,121,1,1
1,107,118,1,1
2,101,102,1,1
3,72,89,1,1
4,83,110,1,1
5,103,101,1,1
6,116,123,1,1
7,110,107,1,1
8,110,99,1,1
9,114,96,1,1


In [127]:
# Create a column to check if the home team won
encoded_df['home_win'] = encoded_df.apply(
    lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
    axis=1
)
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Boston Celtics,home_Atlanta Hawks,home_win
0,97,121,1,1,1
1,107,118,1,1,1
2,101,102,1,1,1
3,72,89,1,1,1
4,83,110,1,1,1
5,103,101,1,1,0
6,116,123,1,1,1
7,110,107,1,1,0
8,110,99,1,1,0
9,114,96,1,1,0


In [None]:
# Create Data Frame of only 'home_team' and 'visitor_team'
# Relevant Columns 
relevant_columns = [ f'visitor_{visitor_team}', f'home_{home_team}', 'visitor_points', 'home_points', 'home_win']


Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards,home_win
24,93,106,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25,109,101,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,103,111,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27,95,104,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
28,104,92,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,136,108,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
855,130,126,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
856,122,120,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
857,114,107,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scaler = MinMaxScaler()

X = scaler.fit_transform(encoded_df.drop(columns=['home_win', 'home_points', 'visitor_points']))

X_df = pd.DataFrame(X, columns=encoded_df.drop(columns=['home_win', 'home_points', 'visitor_points']).columns)

y = encoded_df['home_win']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = DecisionTreeClassifier()

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [90]:
# Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_{home_team}'] = 1
input_data[f'visitor_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_winner = model.predict(input_scaled)[0]

winning_team = home_team if  predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Predicted Winner: Atlanta Hawks




In [None]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.5492087996912389
Confusion Matrix:
 [[475 662]
 [506 948]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.42      0.45      1137
           1       0.59      0.65      0.62      1454

    accuracy                           0.55      2591
   macro avg       0.54      0.53      0.53      2591
weighted avg       0.54      0.55      0.54      2591

