In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [88]:
# Read the game statistics CSV into a DataFrame and preview the first rows
GAME_STATISTICS_CSV = './data/gameStatistics.csv'
game_statistics_df = pd.read_csv(filepath_or_buffer=GAME_STATISTICS_CSV)
game_statistics_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,15.0,48.0,25.0,5.0,23.0,15.0,120.0,40.0,86.0,46.5,...,31.0,46.0,21.0,27.0,9.0,18.0,4.0,-10.0,240:00,Houston Rockets
1,10.0,40.0,8.0,18.0,12.0,11.0,121.0,43.0,97.0,44.3,...,38.0,46.0,27.0,18.0,1.0,9.0,8.0,-3.0,240:00,Cleveland Cavaliers
2,14.0,36.0,7.0,3.0,6.0,16.0,108.0,38.0,78.0,48.7,...,36.0,54.0,19.0,18.0,8.0,7.0,4.0,8.0,240:00,Toronto Raptors
3,4.0,36.0,9.0,3.0,8.0,10.0,80.0,30.0,81.0,37.0,...,42.0,49.0,21.0,17.0,6.0,9.0,3.0,15.0,240:00,Oklahoma City Thunder
4,16.0,48.0,5.0,12.0,6.0,10.0,105.0,39.0,90.0,43.3,...,42.0,58.0,28.0,18.0,7.0,9.0,11.0,13.0,240:00,Memphis Grizzlies


In [89]:
# Print a per-column summary (non-null count and dtype) to see where values are missing
game_statistics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4072 entries, 0 to 4071
Data columns (total 55 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   visitor_fastBreakPoints     809 non-null    float64
 1   visitor_pointsInPaint       809 non-null    float64
 2   visitor_biggestLead         809 non-null    float64
 3   visitor_secondChancePoints  809 non-null    float64
 4   visitor_pointsOffTurnovers  809 non-null    float64
 5   visitor_longestRun          809 non-null    float64
 6   visitor_points              4070 non-null   float64
 7   visitor_fgm                 4070 non-null   float64
 8   visitor_fga                 4070 non-null   float64
 9   visitor_fgp                 4070 non-null   float64
 10  visitor_ftm                 4070 non-null   float64
 11  visitor_fta                 4070 non-null   float64
 12  visitor_ftp                 4070 non-null   float64
 13  visitor_tpm                 4070 

In [90]:
# Rescale the percentage columns from a 0-100 scale to a 0-1 fraction.
# The assignment overwrites the columns in place, so running this cell a
# second time would divide the values by 100 again.
percentage_columns = [
    'visitor_ftp', 'visitor_fgp', 'visitor_tpp',
    'home_ftp', 'home_fgp', 'home_tpp',
]
game_statistics_df[percentage_columns] = game_statistics_df[percentage_columns].div(100)

In [91]:
# Columns that are not used as model features, grouped by side
visitor_columns_to_drop = [
    'visitor_fastBreakPoints', 'visitor_pointsInPaint', 'visitor_biggestLead',
    'visitor_secondChancePoints', 'visitor_pointsOffTurnovers', 'visitor_longestRun',
    'visitor_plusMinus', 'visitor_min',
]
home_columns_to_drop = [
    'home_fastBreakPoints', 'home_pointsInPaint', 'home_biggestLead',
    'home_secondChancePoints', 'home_pointsOffTurnovers', 'home_longestRun',
    'home_plusMinus', 'home_min',
]
irrelevant_columns = visitor_columns_to_drop + home_columns_to_drop + ['game_id']

# Remove them all in one pass
game_statistics_df = game_statistics_df.drop(columns=irrelevant_columns)

# Preview the slimmed-down frame
game_statistics_df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,0.333,15.0,31.0,46.0,21.0,27.0,9.0,18.0,4.0,Houston Rockets
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,0.32,8.0,38.0,46.0,27.0,18.0,1.0,9.0,8.0,Cleveland Cavaliers
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,0.314,18.0,36.0,54.0,19.0,18.0,8.0,7.0,4.0,Toronto Raptors
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,0.419,7.0,42.0,49.0,21.0,17.0,6.0,9.0,3.0,Oklahoma City Thunder
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,0.385,16.0,42.0,58.0,28.0,18.0,7.0,9.0,11.0,Memphis Grizzlies


In [92]:
# Discard every row that still contains at least one missing value
game_statistics_df = game_statistics_df.dropna(axis=0, how='any')

In [93]:
# Work on a copy of the cleaned statistics so the source frame is left untouched
target_dataframe = game_statistics_df.copy()

# First target column 'winner': 1 when the home team outscored the visitors,
# otherwise 0 (equal scores, if any, also map to 0).
# A vectorised comparison replaces the slower row-by-row apply.
target_dataframe['winner'] = (
    target_dataframe['home_points'] > target_dataframe['visitor_points']
).astype('int64')

# Second target column 'total_points': combined score of both teams
target_dataframe['total_points'] = target_dataframe['home_points'] + target_dataframe['visitor_points']

# Display
target_dataframe.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team,winner,total_points
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,31.0,46.0,21.0,27.0,9.0,18.0,4.0,Houston Rockets,0,230.0
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,38.0,46.0,27.0,18.0,1.0,9.0,8.0,Cleveland Cavaliers,0,239.0
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,36.0,54.0,19.0,18.0,8.0,7.0,4.0,Toronto Raptors,1,224.0
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,42.0,49.0,21.0,17.0,6.0,9.0,3.0,Oklahoma City Thunder,1,175.0
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,42.0,58.0,28.0,18.0,7.0,9.0,11.0,Memphis Grizzlies,1,223.0


In [94]:
# One-hot encode the two team-name columns
encoded_df = pd.get_dummies(target_dataframe, columns=['visitor_team', 'home_team'])

# Convert the boolean indicator columns to integers (True -> 1, False -> 0).
# An explicit astype on the bool columns only replaces the frame-wide
# replace({True: 1, False: 0}), which relied on deprecated silent downcasting
# and emitted a FutureWarning.
bool_columns = encoded_df.select_dtypes(include='bool').columns
new_df = encoded_df.astype({column: 'int64' for column in bool_columns})

# Display
new_df.head()

  new_df = encoded_df.replace({True: 1, False: 0})


Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_team_Oklahoma City Thunder,home_team_Orlando Magic,home_team_Philadelphia 76ers,home_team_Phoenix Suns,home_team_Portland Trail Blazers,home_team_Sacramento Kings,home_team_San Antonio Spurs,home_team_Toronto Raptors,home_team_Utah Jazz,home_team_Washington Wizards
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,0,0,0,0,0,0,0,0,0,0
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,0,0,0,0,0,0,0,0,0,0
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,0,0,0,0,0,0,0,1,0,0
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,1,0,0,0,0,0,0,0,0,0
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,0,0,0,0,0,0,0,0,0,0


# Prepare Machine Learning Model

In [95]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Feature matrix: every encoded column except the two targets and the raw
# scores the targets are computed from.
# NOTE(review): the remaining features are statistics from the same game
# (e.g. visitor_fgm, visitor_ftm, visitor_tpm), which presumably determine the
# score almost directly - confirm that this is intended and not target leakage.
X_df = encoded_df.drop(columns=['winner', 'home_points', 'visitor_points', 'total_points'])
X = X_df  # alias kept for any cell that still refers to `X`

scaler = StandardScaler()

# Create both target variables
y_winner = target_dataframe['winner']
y_total_points = target_dataframe['total_points']

# Split the features and both targets in a single call so that all three are
# guaranteed to share the same train/test row partition.
(
    X_train, X_test,
    y_train_winner, y_test_winner,
    y_train_points, y_test_points,
) = train_test_split(X_df, y_winner, y_total_points, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the classification model (winner prediction) on the UNSCALED features;
# anything passed to clf_model.predict must therefore be unscaled as well.
# A fixed random_state makes the fitted tree reproducible across runs.
clf_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_winner)

# Train the regression model (total_points) on the SCALED features
linear_model = LinearRegression().fit(X_train_scaled, y_train_points)

In [96]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # type: ignore

# Score the winner classifier on the held-out test rows
winner_predictions = clf_model.predict(X_test)

# Compute each metric once, then report
winner_accuracy = accuracy_score(y_test_winner, winner_predictions)
winner_confusion = confusion_matrix(y_test_winner, winner_predictions)
winner_report = classification_report(y_test_winner, winner_predictions)

print("Winner Prediction Evaluation:")
print("Accuracy:", winner_accuracy)
print("Confusion Matrix:\n", winner_confusion)
print("Classification Report:\n", winner_report)

Winner Prediction Evaluation:
Accuracy: 0.7886977886977887
Confusion Matrix:
 [[333  83]
 [ 89 309]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.80      0.79       416
           1       0.79      0.78      0.78       398

    accuracy                           0.79       814
   macro avg       0.79      0.79      0.79       814
weighted avg       0.79      0.79      0.79       814



In [99]:
# Ask for the two teams; surrounding whitespace is stripped so that a stray
# space does not prevent the name from matching its one-hot column.
visitor_team = input("Enter Away Team: ").strip()
home_team = input("Enter Home Team: ").strip()

# Resolve the one-hot feature columns for both teams and fail loudly when a
# name is unknown, instead of silently predicting from a row with no team set.
home_column = f'home_team_{home_team}'
visitor_column = f'visitor_team_{visitor_team}'
unknown_columns = [
    column for column in (visitor_column, home_column) if column not in X_df.columns
]
if unknown_columns:
    raise ValueError(f"Unknown team name; no matching feature column for: {unknown_columns}")

# Build a single feature row with the same columns (and order) as the training data.
# NOTE(review): every non-team feature is left at 0, so the models only see the
# team indicators here - confirm that this is the intended way to query them.
input_data = {col: 0 for col in X_df.columns}
input_data[home_column] = 1
input_data[visitor_column] = 1
input_df = pd.DataFrame([input_data], columns=X_df.columns)

# Only the regression model was trained on scaled features
input_scaled = scaler.transform(input_df)

# Predict the winner and total points.
# The classifier was fitted on unscaled features, so it gets the raw row;
# passing the scaled row would compare standardised values against raw thresholds.
predicted_winner = clf_model.predict(input_df)[0]
predicted_total_points = linear_model.predict(input_scaled)[0]

# Map winner prediction to the team name (1 = home team, 0 = visiting team)
winning_team = home_team if predicted_winner == 1 else visitor_team

# Print results
print("Predicted Winner:", winning_team)
print("Predicted Total Points:", round(predicted_total_points, 0))

Predicted Winner: Atlanta Hawks
Predicted Total Points: 190.0


