In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [82]:
game_df = pd.read_csv('./data/finalMergedSet.csv')
game_df.head()

Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2015-10-02,Denver Nuggets,LA Clippers,96.0,103.0
1,2015-10-03,New Orleans Pelicans,Indiana Pacers,110.0,105.0
2,2015-10-03,Charlotte Hornets,Orlando Magic,106.0,100.0
3,2015-10-04,Charlotte Hornets,Miami Heat,90.0,77.0
4,2015-10-04,LA Clippers,Toronto Raptors,73.0,93.0


In [83]:
game_df.describe()

Unnamed: 0,visitor_points,home_points
count,12797.0,12797.0
mean,107.568024,109.817223
std,13.634783,13.597757
min,0.0,0.0
25%,99.0,101.0
50%,108.0,110.0
75%,116.0,119.0
max,176.0,175.0


In [84]:
game_df.dtypes

date               object
visitor            object
home               object
visitor_points    float64
home_points       float64
dtype: object

In [85]:
league_df = pd.read_excel('./data/team_data.xlsx')
league_df.head()

Unnamed: 0,team_id,team_code,team_name
0,1,ATL,Atlanta Hawks
1,2,BOS,Boston Celtics
2,4,BKN,Brooklyn Nets
3,5,CHA,Charlotte Hornets
4,6,CHI,Chicago Bulls


In [86]:
# Drop 'date' columns first
new_df = game_df.drop(columns=['date'], axis=1)

# Apply one-hot encoding and convert to booleans to integers
encoded_df = pd.get_dummies(new_df, columns=['visitor', 'home']).astype(int)

# Display
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,96,103,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,110,105,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,106,100,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,90,77,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,73,93,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12792,116,118,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
12793,134,127,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12794,111,110,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12795,103,115,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# Create 'total_point' colum
# encoded_df['total_points'] = encoded_df['visitor_points'] + encoded_df['home_points']

# Create a column to check if the home team won
encoded_df['home_win'] = encoded_df.apply(
    lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
    axis=1
)

encoded_df.head()

Unnamed: 0,visitor_points,home_points,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,...,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards,home_win
0,96,103,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,110,105,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,106,100,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,90,77,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,73,93,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [134]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

scaler = MinMaxScaler()

X = scaler.fit_transform(encoded_df.drop(columns=['home_win']))

X_df = pd.DataFrame(X, columns=encoded_df.drop(columns=['home_win']).columns)

y = encoded_df['home_win']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = DecisionTreeClassifier()

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [140]:
# Define the home and visitor teams
home_team = input("Enter Home Team: ")  # Replace with desired home team
visitor_team = input("Enter Away Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X_df.columns}
input_data[f'home_{home_team}'] = 1
input_data[f'visitor_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Apply the same MinMaxScaler used on X_df
input_scaled = scaler.transform(input_df)

# Predict total points
predicted_winner = model.predict(input_scaled)[0]

winning_team = home_team if  predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Enter Home Team:  Boston Celtics
Enter Away Team:  Milwaukee Bucks


Predicted Winner: Milwaukee Bucks




In [148]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.998046875
Confusion Matrix:
 [[1085    3]
 [   2 1470]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1088
           1       1.00      1.00      1.00      1472

    accuracy                           1.00      2560
   macro avg       1.00      1.00      1.00      2560
weighted avg       1.00      1.00      1.00      2560

