In [96]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

# Clean Data Frames

In [97]:
# Load the per-game score table from the Excel export and preview the first rows
game_score_df = pd.read_excel(
    './data/apiGameScores.xlsx',
)
game_score_df.head()

Unnamed: 0,game_ids,date,visitor,home,visitor_points,home_points
0,319,2015-11-25T01:00:00.000Z,Boston Celtics,Atlanta Hawks,97.0,121.0
1,1300,2016-04-09T23:30:00.000Z,Boston Celtics,Atlanta Hawks,107.0,118.0
2,1344,2016-04-16T23:00:00.000Z,Boston Celtics,Atlanta Hawks,101.0,102.0
3,1353,2016-04-19T23:00:00.000Z,Boston Celtics,Atlanta Hawks,72.0,89.0
4,1376,2016-04-27T00:30:00.000Z,Boston Celtics,Atlanta Hawks,83.0,110.0


In [98]:
# Inspect column dtypes and non-null counts; the two points columns have fewer
# non-null entries than the other columns, i.e. some games have no score recorded.
game_score_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14242 entries, 0 to 14241
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_ids        14242 non-null  int64  
 1   date            14242 non-null  object 
 2   visitor         14242 non-null  object 
 3   home            14242 non-null  object 
 4   visitor_points  13044 non-null  float64
 5   home_points     13044 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 667.7+ KB


In [99]:
# Discard every row that contains a missing value, then renumber the rows from 0
# so the index is contiguous again.
game_score_df = game_score_df.dropna().reset_index(drop=True)
game_score_df.shape

(13044, 6)

# Machine Learning Model 1

In [100]:
# Remove the timestamp and game identifier columns before encoding
new_df = game_score_df.drop(columns=['date', 'game_ids'])

# One-hot encode both team columns, then cast every column (the boolean
# dummies as well as the float point totals) to int
encoded_df = pd.get_dummies(new_df, columns=['visitor', 'home'])
encoded_df = encoded_df.astype(int)

# Display
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,97,121,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,107,118,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,101,102,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,72,89,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,83,110,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13039,119,129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13040,125,121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13041,109,103,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13042,112,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [101]:
# Label each game: 1 when the home team outscored the visitors, otherwise 0.
# A vectorized column comparison replaces the row-wise apply/lambda; ties
# (home_points == visitor_points) still map to 0, exactly as before.
encoded_df['home_win'] = (
    encoded_df['home_points'] > encoded_df['visitor_points']
).astype('int64')
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards,home_win
0,97,121,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,107,118,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,101,102,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,72,89,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,83,110,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13039,119,129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
13040,125,121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13041,109,103,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13042,112,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [102]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Feature matrix: only the one-hot team indicators. The point totals are
# excluded because home_win is derived directly from them.
feature_df = encoded_df.drop(columns=['home_win', 'home_points', 'visitor_points'])

# Min-max scale the features.
# NOTE(review): the dummy columns are already 0/1, so this should leave the
# values unchanged; the fitted scaler is kept because the prediction cell
# further down reuses it.
scaler = MinMaxScaler()
X = scaler.fit_transform(feature_df)
X_df = pd.DataFrame(X, columns=feature_df.columns)

# Target: 1 when the home team won, 0 otherwise
y = encoded_df['home_win']

# Hold out 20% of the games for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Decision tree classifier. It is seeded so that tie-breaking between equally
# good splits, and therefore the reported metrics, are repeatable across runs.
model = DecisionTreeClassifier(random_state=42)

# Fit model
model = model.fit(X_train, y_train)

# Predicted home_win labels (0/1) for the held-out games
predictions = model.predict(X_test)

# Show the predicted labels
predictions

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [103]:
# Score the held-out predictions: overall accuracy, the confusion matrix,
# and the per-class precision/recall/f1 report
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Accuracy: 0.5277884246837868
Confusion Matrix:
 [[478 651]
 [581 899]]
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.42      0.44      1129
           1       0.58      0.61      0.59      1480

    accuracy                           0.53      2609
   macro avg       0.52      0.52      0.52      2609
weighted avg       0.52      0.53      0.53      2609



In [105]:
# Ask for the matchup. The away (visiting) team is requested first, then the
# home team; each answer is stored in the variable that matches its prompt.
visitor_team = input("Enter Away Team: ").strip()
home_team = input("Enter Home Team: ").strip()

# One-hot column names that correspond to the two teams
home_col = f'home_{home_team}'
visitor_col = f'visitor_{visitor_team}'

# Fail early with a readable message instead of letting an unknown name add
# an extra column and break scaler.transform with a feature-name mismatch.
unknown_cols = [col for col in (home_col, visitor_col) if col not in X_df.columns]
if unknown_cols:
    raise ValueError(
        f"No training column for {unknown_cols}; team names must match the "
        "dataset exactly, e.g. 'Boston Celtics'."
    )

# Create a new data instance with zeros for all one-hot columns,
# then switch on the two team indicators
input_data = {col: 0 for col in X_df.columns}
input_data[home_col] = 1
input_data[visitor_col] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df. The result is wrapped back into a
# DataFrame so the model receives the same column names it was fitted with.
input_scaled = pd.DataFrame(scaler.transform(input_df), columns=X_df.columns)

# Predict the home_win label (1 = home team wins, 0 = visitor wins)
predicted_winner = model.predict(input_scaled)[0]

winning_team = home_team if predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Predicted Winner: Boston Celtics


