In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

# Clean Data Frames

In [3]:
# Load the game-score workbook into a DataFrame
game_score_df = pd.read_excel(io='./data/apiGameScores.xlsx')
# Preview the first five rows
game_score_df.head(5)

Unnamed: 0,game_ids,date,visitor,home,visitor_points,home_points
0,319,2015-11-25T01:00:00.000Z,Boston Celtics,Atlanta Hawks,97.0,121.0
1,1300,2016-04-09T23:30:00.000Z,Boston Celtics,Atlanta Hawks,107.0,118.0
2,1344,2016-04-16T23:00:00.000Z,Boston Celtics,Atlanta Hawks,101.0,102.0
3,1353,2016-04-19T23:00:00.000Z,Boston Celtics,Atlanta Hawks,72.0,89.0
4,1376,2016-04-27T00:30:00.000Z,Boston Celtics,Atlanta Hawks,83.0,110.0


In [4]:
# Summarize column dtypes and non-null counts to spot columns with missing values
game_score_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14273 entries, 0 to 14272
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_ids        14273 non-null  int64  
 1   date            14273 non-null  object 
 2   visitor         14273 non-null  object 
 3   home            14273 non-null  object 
 4   visitor_points  13181 non-null  float64
 5   home_points     13181 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 669.2+ KB


In [5]:
# Discard every row that has a missing value, then renumber the index from 0
game_score_df = game_score_df.dropna().reset_index(drop=True)
# Report (rows, columns) remaining after the cleanup
game_score_df.shape

(13181, 6)

# Machine Learning Model 1

In [6]:
# Remove the 'date' and 'game_ids' columns before encoding
new_df = game_score_df.drop(columns=['date', 'game_ids'])

# One-hot encode both team columns, then cast every column to int so the
# indicator columns hold 0/1 and the point columns become integers
encoded_df = (
    pd.get_dummies(new_df, columns=['visitor', 'home'])
    .astype(int)
)

# Display
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,97,121,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,107,118,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,101,102,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,72,89,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,83,110,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13176,119,129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13177,125,121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13178,109,103,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13179,112,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Work on a copy so the target columns are not added to encoded_df itself
model_df = encoded_df.copy()

# First target 'winner': 1 when the home team scored more than the visitor,
# otherwise 0. A vectorized comparison replaces the row-by-row apply/lambda;
# it yields the same 0/1 values without a Python-level loop over every row.
model_df['winner'] = (
    model_df['home_points'] > model_df['visitor_points']
).astype('int64')

# Second target 'total_points': combined score of both teams
model_df['total_points'] = model_df['home_points'] + model_df['visitor_points']

# Display
model_df

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards,winner,total_points
0,97,121,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,218
1,107,118,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,225
2,101,102,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,203
3,72,89,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,161
4,83,110,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13176,119,129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,248
13177,125,121,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,246
13178,109,103,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,212
13179,112,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,232


In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Feature matrix: everything except the two score columns and the two targets
# derived from them, so neither model is given the outcome it has to predict.
X = model_df.drop(columns=['winner', 'home_points', 'visitor_points', 'total_points'])
# X is already a DataFrame with the right columns; X_df is kept as an alias
# because later cells read the feature names from X_df.columns.
X_df = X

# Create both target variables
y_winner = model_df['winner']
y_total_points = model_df['total_points']

# Split the features and both targets in ONE call so all three are guaranteed
# to be partitioned into exactly the same train/test rows.
(X_train, X_test,
 y_train_winner, y_test_winner,
 y_train_points, y_test_points) = train_test_split(
    X_df, y_winner, y_total_points, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only (no test-set leakage), then apply
# the same transformation to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the classification model (winner prediction).
# It is fitted on the SCALED features because every later predict() call
# (test-set evaluation and the interactive matchup prediction) passes scaled
# input; training and inference must use the same feature representation.
# random_state makes the fitted tree reproducible across runs.
clf_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train_winner)

# Train regression model (total_points) on the same scaled features
linear_model = LinearRegression().fit(X_train_scaled, y_train_points)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the winner label for every row of the scaled test features
winner_predictions = clf_model.predict(X_test_scaled)

# Compute each metric once, then report them together
winner_accuracy = accuracy_score(y_test_winner, winner_predictions)
winner_confusion = confusion_matrix(y_test_winner, winner_predictions)
winner_report = classification_report(y_test_winner, winner_predictions)

print("Winner Prediction Evaluation:")
print("Accuracy:", winner_accuracy)
print("Confusion Matrix:\n", winner_confusion)
print("Classification Report:\n", winner_report)


Winner Prediction Evaluation:
Accuracy: 0.5358361774744027
Confusion Matrix:
 [[491 647]
 [577 922]]
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.43      0.45      1138
           1       0.59      0.62      0.60      1499

    accuracy                           0.54      2637
   macro avg       0.52      0.52      0.52      2637
weighted avg       0.53      0.54      0.53      2637





In [18]:
# Read both team names; strip stray whitespace so " Miami Heat " still matches
visitor_team = input("Enter Away Team: ").strip()
home_team = input("Enter Home Team: ").strip()

# Names of the one-hot feature columns these two teams map onto
visitor_col = f'visitor_{visitor_team}'
home_col = f'home_{home_team}'

# Fail loudly on a name that has no feature column. Silently skipping it would
# send an all-zero row to the models and print a prediction for a matchup that
# was never actually encoded.
unknown_teams = [
    team
    for team, col in ((visitor_team, visitor_col), (home_team, home_col))
    if col not in X_df.columns
]
if unknown_teams:
    raise ValueError(
        f"Unknown team name(s): {unknown_teams}. "
        "Names must match the training data exactly, e.g. 'Boston Celtics'."
    )
if visitor_team == home_team:
    raise ValueError("Away team and home team must be different.")

# Create input data with correct features: all indicators 0 except the two
# columns for the selected visitor and home teams
input_data = {col: 0 for col in X_df.columns}
input_data[home_col] = 1
input_data[visitor_col] = 1

# Convert to a one-row DataFrame (column order pinned to the training
# features) and apply the scaler that was fitted on the training data
input_df = pd.DataFrame([input_data], columns=X_df.columns)
input_scaled = scaler.transform(input_df)

# Predict the winner and total points
predicted_winner = clf_model.predict(input_scaled)[0]
predicted_total_points = linear_model.predict(input_scaled)[0]

# Map winner prediction to the team name (label 1 means the home team won)
winning_team = home_team if predicted_winner == 1 else visitor_team

# Print results
print("Predicted Winner:", winning_team)
print("Predicted Total Points:", round(predicted_total_points, 0))

Predicted Winner: Miami Heat
Predicted Total Points: 212.0


