In [345]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [346]:
# Read in game scores data (same as verison1)
game_score_df = pd.read_excel('./data/apiGameScores.xlsx')
game_score_df.head()

Unnamed: 0,game_ids,date,visitor,home,visitor_points,home_points
0,319,2015-11-25T01:00:00.000Z,Boston Celtics,Atlanta Hawks,97.0,121.0
1,1300,2016-04-09T23:30:00.000Z,Boston Celtics,Atlanta Hawks,107.0,118.0
2,1344,2016-04-16T23:00:00.000Z,Boston Celtics,Atlanta Hawks,101.0,102.0
3,1353,2016-04-19T23:00:00.000Z,Boston Celtics,Atlanta Hawks,72.0,89.0
4,1376,2016-04-27T00:30:00.000Z,Boston Celtics,Atlanta Hawks,83.0,110.0


In [347]:
# Read in game scores data (same as verison2)
game_statistics_df = pd.read_csv('./version3_mlFiles/gameStatisticsFinalDataFrame.csv')
game_statistics_df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team
0,88,29,77,0.377,16,23,0.696,14,46,0.304,...,0.294,16,30,46,20,20,10,10,5,New York Knicks
1,125,47,85,0.553,23,32,0.719,8,19,0.421,...,0.412,6,20,26,21,24,5,5,3,Dallas Mavericks
2,106,38,96,0.396,19,21,0.905,11,41,0.268,...,0.382,2,40,42,26,16,7,14,6,Brooklyn Nets
3,102,39,81,0.481,17,24,0.708,7,21,0.333,...,0.342,15,35,50,20,18,9,21,5,Cleveland Cavaliers
4,104,36,83,0.434,22,26,0.846,10,34,0.294,...,0.37,15,37,52,21,22,6,14,6,Detroit Pistons


In [348]:
print(f"Shape of game score dataframe: {game_score_df.shape}")
print(f"Shape of game statistics dataframe: {game_statistics_df.shape}")

Shape of game score dataframe: (14242, 6)
Shape of game statistics dataframe: (2495, 39)


In [349]:
print(f"Look at game score columns: \n{game_score_df.columns}")
print("\n")
print(f"Look at game statistics columns: \n{game_statistics_df.columns}")

Look at game score columns: 
Index(['game_ids', 'date', 'visitor', 'home', 'visitor_points', 'home_points'], dtype='object')


Look at game statistics columns: 
Index(['visitor_points', 'visitor_fgm', 'visitor_fga', 'visitor_fgp',
       'visitor_ftm', 'visitor_fta', 'visitor_ftp', 'visitor_tpm',
       'visitor_tpa', 'visitor_tpp', 'visitor_offReb', 'visitor_defReb',
       'visitor_totReb', 'visitor_assists', 'visitor_pFouls', 'visitor_steals',
       'visitor_turnovers', 'visitor_blocks', 'game_id', 'visitor_team',
       'home_points', 'home_fgm', 'home_fga', 'home_fgp', 'home_ftm',
       'home_fta', 'home_ftp', 'home_tpm', 'home_tpa', 'home_tpp',
       'home_offReb', 'home_defReb', 'home_totReb', 'home_assists',
       'home_pFouls', 'home_steals', 'home_turnovers', 'home_blocks',
       'home_team'],
      dtype='object')


In [350]:
# Both Data Frames have a common id column.
# These will be used to merge both frames
merged_df = pd.merge(game_score_df, game_statistics_df, 
                     left_on=['game_ids'], 
                     right_on=['game_id'])
merged_df.head()

Unnamed: 0,game_ids,date,visitor,home,visitor_points_x,home_points_x,visitor_points_y,visitor_fgm,visitor_fga,visitor_fgp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team
0,12388,2023-04-21T23:00:00.000Z,Boston Celtics,Atlanta Hawks,122.0,130.0,130,51,91,0.56,...,0.438,6,23,29,31,17,11,12,0,Boston Celtics
1,12395,2023-04-23T23:00:00.000Z,Boston Celtics,Atlanta Hawks,129.0,121.0,121,43,98,0.439,...,0.4,9,40,49,25,23,4,16,5,Boston Celtics
2,12404,2023-04-28T00:30:00.000Z,Boston Celtics,Atlanta Hawks,128.0,120.0,120,44,94,0.468,...,0.429,11,36,47,24,16,7,7,10,Boston Celtics
3,13582,2024-03-25T23:30:00.000Z,Boston Celtics,Atlanta Hawks,118.0,120.0,120,46,87,0.529,...,0.289,12,26,38,21,16,7,9,8,Boston Celtics
4,13608,2024-03-28T23:30:00.000Z,Boston Celtics,Atlanta Hawks,122.0,123.0,123,51,104,0.49,...,0.389,9,34,43,28,15,7,12,6,Boston Celtics


In [351]:
print(f"Shape of game merged dataframe: {merged_df.shape}")

Shape of game merged dataframe: (2495, 45)


In [352]:
# Look at data types
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2495 entries, 0 to 2494
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   game_ids           2495 non-null   int64  
 1   date               2495 non-null   object 
 2   visitor            2495 non-null   object 
 3   home               2495 non-null   object 
 4   visitor_points_x   2495 non-null   float64
 5   home_points_x      2495 non-null   float64
 6   visitor_points_y   2495 non-null   int64  
 7   visitor_fgm        2495 non-null   int64  
 8   visitor_fga        2495 non-null   int64  
 9   visitor_fgp        2495 non-null   float64
 10  visitor_ftm        2495 non-null   int64  
 11  visitor_fta        2495 non-null   int64  
 12  visitor_ftp        2495 non-null   float64
 13  visitor_tpm        2495 non-null   int64  
 14  visitor_tpa        2495 non-null   int64  
 15  visitor_tpp        2495 non-null   float64
 16  visitor_offReb     2495 

In [353]:
# Look at null values
merged_df.isnull().sum()

game_ids             0
date                 0
visitor              0
home                 0
visitor_points_x     0
home_points_x        0
visitor_points_y     0
visitor_fgm          0
visitor_fga          0
visitor_fgp          0
visitor_ftm          0
visitor_fta          0
visitor_ftp          0
visitor_tpm          0
visitor_tpa          0
visitor_tpp          0
visitor_offReb       0
visitor_defReb       0
visitor_totReb       0
visitor_assists      0
visitor_pFouls       0
visitor_steals       0
visitor_turnovers    0
visitor_blocks       0
game_id              0
visitor_team         0
home_points_y        0
home_fgm             0
home_fga             0
home_fgp             0
home_ftm             0
home_fta             0
home_ftp             0
home_tpm             0
home_tpa             0
home_tpp             0
home_offReb          0
home_defReb          0
home_totReb          0
home_assists         0
home_pFouls          0
home_steals          0
home_turnovers       0
home_blocks

In [None]:
merged_df.columns

Index(['game_ids', 'date', 'visitor', 'home', 'visitor_points_x',
       'home_points_x', 'visitor_points_y', 'visitor_fgm', 'visitor_fga',
       'visitor_fgp', 'visitor_ftm', 'visitor_fta', 'visitor_ftp',
       'visitor_tpm', 'visitor_tpa', 'visitor_tpp', 'visitor_offReb',
       'visitor_defReb', 'visitor_totReb', 'visitor_assists', 'visitor_pFouls',
       'visitor_steals', 'visitor_turnovers', 'visitor_blocks', 'game_id',
       'visitor_team', 'home_points_y', 'home_fgm', 'home_fga', 'home_fgp',
       'home_ftm', 'home_fta', 'home_ftp', 'home_tpm', 'home_tpa', 'home_tpp',
       'home_offReb', 'home_defReb', 'home_totReb', 'home_assists',
       'home_pFouls', 'home_steals', 'home_turnovers', 'home_blocks',
       'home_team'],
      dtype='object')

In [355]:
# Drop irrelevant columns 
merged_df = merged_df.drop(columns=['game_id', 'date', 'home_team', 'visitor_team', 'home_points_y', 'visitor_points_y'])

# Rename columns 'home_points_x' and 'visitor_points_x'
merged_df = merged_df.rename(columns={
    'home_points_x': 'home_points',
    'visitor_points_x': 'visitor_points'
})

# Display
merged_df

Unnamed: 0,game_ids,visitor,home,visitor_points,home_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,...,home_tpa,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks
0,12388,Boston Celtics,Atlanta Hawks,122.0,130.0,51,91,0.560,13,16,...,48,0.438,6,23,29,31,17,11,12,0
1,12395,Boston Celtics,Atlanta Hawks,129.0,121.0,43,98,0.439,21,24,...,40,0.400,9,40,49,25,23,4,16,5
2,12404,Boston Celtics,Atlanta Hawks,128.0,120.0,44,94,0.468,17,22,...,42,0.429,11,36,47,24,16,7,7,10
3,13582,Boston Celtics,Atlanta Hawks,118.0,120.0,46,87,0.529,10,17,...,38,0.289,12,26,38,21,16,7,9,8
4,13608,Boston Celtics,Atlanta Hawks,122.0,123.0,51,104,0.490,6,10,...,36,0.389,9,34,43,28,15,7,12,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,12964,Toronto Raptors,Washington Wizards,132.0,102.0,42,90,0.467,10,15,...,38,0.395,14,39,53,43,18,12,20,5
2491,13570,Toronto Raptors,Washington Wizards,109.0,112.0,44,92,0.478,13,19,...,33,0.242,13,29,42,27,20,9,12,7
2492,14072,Toronto Raptors,Washington Wizards,95.0,113.0,35,76,0.461,26,38,...,33,0.212,19,26,45,20,32,12,23,3
2493,13176,Utah Jazz,Washington Wizards,123.0,108.0,43,88,0.489,15,20,...,31,0.419,13,39,52,34,19,6,16,2


# Machine Learning Model 3 Version 1.1

In [None]:
# This will be Version 1.1 in which the whole Data Frame will be used
# Version 1.2 will ask the user for the away and home team and create a dataframe with just those teams. Out of a dataframe of 2495 Boston Celtics vs Atlanta Hawks might only play around 100. This will lead to less rows of data (games played by two teams)

# Drop 'game_ids' column
new_df = merged_df.drop(columns='game_ids', axis=1)
new_df.head()

Unnamed: 0,visitor,home,visitor_points,home_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,...,home_tpa,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks
0,Boston Celtics,Atlanta Hawks,122.0,130.0,51,91,0.56,13,16,0.813,...,48,0.438,6,23,29,31,17,11,12,0
1,Boston Celtics,Atlanta Hawks,129.0,121.0,43,98,0.439,21,24,0.875,...,40,0.4,9,40,49,25,23,4,16,5
2,Boston Celtics,Atlanta Hawks,128.0,120.0,44,94,0.468,17,22,0.773,...,42,0.429,11,36,47,24,16,7,7,10
3,Boston Celtics,Atlanta Hawks,118.0,120.0,46,87,0.529,10,17,0.588,...,38,0.289,12,26,38,21,16,7,9,8
4,Boston Celtics,Atlanta Hawks,122.0,123.0,51,104,0.49,6,10,0.6,...,36,0.389,9,34,43,28,15,7,12,6


In [357]:
# Apply one-hot encoding to categorical columns
encoded_df = pd.get_dummies(new_df, columns=['visitor', 'home']).copy()

# Apply transformation only to columns with dtype 'bool'
encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)

# Display
encoded_df

  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_df.loc[:, encoded_df.dtypes == 'bool'] = encoded_df.loc[:, encoded_df.dtypes == 'bool'].astype(int)
  encoded_

Unnamed: 0,visitor_points,home_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,122.0,130.0,51,91,0.560,13,16,0.813,15,34,...,0,0,0,0,0,0,0,0,0,0
1,129.0,121.0,43,98,0.439,21,24,0.875,14,37,...,0,0,0,0,0,0,0,0,0,0
2,128.0,120.0,44,94,0.468,17,22,0.773,15,35,...,0,0,0,0,0,0,0,0,0,0
3,118.0,120.0,46,87,0.529,10,17,0.588,18,36,...,0,0,0,0,0,0,0,0,0,0
4,122.0,123.0,51,104,0.490,6,10,0.600,15,44,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,132.0,102.0,42,90,0.467,10,15,0.667,8,33,...,0,0,0,0,0,0,0,0,0,1
2491,109.0,112.0,44,92,0.478,13,19,0.684,11,35,...,0,0,0,0,0,0,0,0,0,1
2492,95.0,113.0,35,76,0.461,26,38,0.684,17,43,...,0,0,0,0,0,0,0,0,0,1
2493,123.0,108.0,43,88,0.489,15,20,0.750,7,30,...,0,0,0,0,0,0,0,0,0,1


In [383]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2495 entries, 0 to 2494
Data columns (total 97 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   visitor_points                  2495 non-null   float64
 1   home_points                     2495 non-null   float64
 2   visitor_fgm                     2495 non-null   int64  
 3   visitor_fga                     2495 non-null   int64  
 4   visitor_fgp                     2495 non-null   float64
 5   visitor_ftm                     2495 non-null   int64  
 6   visitor_fta                     2495 non-null   int64  
 7   visitor_ftp                     2495 non-null   float64
 8   visitor_tpm                     2495 non-null   int64  
 9   visitor_tpa                     2495 non-null   int64  
 10  visitor_tpp                     2495 non-null   float64
 11  visitor_offReb                  2495 non-null   int64  
 12  visitor_defReb                  24

In [358]:
# Create a column to check if the home team won
encoded_df['winner'] = encoded_df.apply(
    lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
    axis=1
)
encoded_df

Unnamed: 0,visitor_points,home_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,...,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards,winner
0,122.0,130.0,51,91,0.560,13,16,0.813,15,34,...,0,0,0,0,0,0,0,0,0,1
1,129.0,121.0,43,98,0.439,21,24,0.875,14,37,...,0,0,0,0,0,0,0,0,0,0
2,128.0,120.0,44,94,0.468,17,22,0.773,15,35,...,0,0,0,0,0,0,0,0,0,0
3,118.0,120.0,46,87,0.529,10,17,0.588,18,36,...,0,0,0,0,0,0,0,0,0,1
4,122.0,123.0,51,104,0.490,6,10,0.600,15,44,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,132.0,102.0,42,90,0.467,10,15,0.667,8,33,...,0,0,0,0,0,0,0,0,1,0
2491,109.0,112.0,44,92,0.478,13,19,0.684,11,35,...,0,0,0,0,0,0,0,0,1,1
2492,95.0,113.0,35,76,0.461,26,38,0.684,17,43,...,0,0,0,0,0,0,0,0,1,1
2493,123.0,108.0,43,88,0.489,15,20,0.750,7,30,...,0,0,0,0,0,0,0,0,1,0


# Feature Engineering

In [359]:
# Set target column
target_column = 'winner'
target_df = encoded_df.drop(columns=['home_points', 'visitor_points'])

# Look for correlation between columns and 'home_win' column
correlations = target_df.corr(numeric_only=True)[target_column].sort_values(ascending=False)
correlations

winner            1.000000
visitor_fgp       0.422685
visitor_fgm       0.373733
visitor_defReb    0.362472
visitor_tpp       0.316422
                    ...   
home_tpm         -0.271960
home_assists     -0.301383
home_defReb      -0.346700
home_tpp         -0.364760
home_fgm         -0.390861
Name: winner, Length: 95, dtype: float64

In [360]:
# Look for correlation between columns and 'total_points' column
correlations = target_df.corr(numeric_only=True)[target_column]

# Set conditions to find feature columns
feature_columns = correlations[correlations > 0]

# Create new data frame with chosen feature columns
new_df = target_df[feature_columns.index]
new_df.columns

Index(['visitor_fgm', 'visitor_fgp', 'visitor_ftm', 'visitor_fta',
       'visitor_ftp', 'visitor_tpm', 'visitor_tpa', 'visitor_tpp',
       'visitor_defReb', 'visitor_totReb', 'visitor_assists', 'visitor_steals',
       'visitor_blocks', 'home_fga', 'home_fgp', 'home_tpa', 'home_offReb',
       'home_pFouls', 'home_turnovers', 'visitor_Atlanta Hawks',
       'visitor_Brooklyn Nets', 'visitor_Charlotte Hornets',
       'visitor_Detroit Pistons', 'visitor_Houston Rockets',
       'visitor_Indiana Pacers', 'visitor_Memphis Grizzlies',
       'visitor_Orlando Magic', 'visitor_Portland Trail Blazers',
       'visitor_San Antonio Spurs', 'visitor_Toronto Raptors',
       'visitor_Utah Jazz', 'visitor_Washington Wizards',
       'home_Boston Celtics', 'home_Cleveland Cavaliers',
       'home_Denver Nuggets', 'home_Golden State Warriors',
       'home_Houston Rockets', 'home_Indiana Pacers', 'home_LA Clippers',
       'home_Los Angeles Lakers', 'home_Miami Heat', 'home_Milwaukee Bucks',
     

# Model

In [None]:
# # Exclude boolean columns before scaling
# numerical_cols = encoded_df.select_dtypes(include=['int64', 'float64']).columns
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(encoded_df[numerical_cols])
# scaled_data


array([[ 0.73250479,  1.20557286,  1.66793056, ..., -0.46637087,
        -1.95801394,  0.89088362],
       [ 1.28222961,  0.48440661,  0.17038686, ...,  0.53220911,
         0.03612631, -1.12248108],
       [ 1.2036975 ,  0.40427703,  0.35757982, ..., -1.71459583,
         2.03026656, -1.12248108],
       ...,
       [-1.38786239, -0.15663005, -1.32715683, ...,  2.27972406,
        -0.76152979,  0.89088362],
       [ 0.81103691, -0.55727797,  0.17038686, ...,  0.53220911,
        -1.16035784, -1.12248108],
       [ 0.18277996,  0.96518411,  1.29354463, ...,  0.28256411,
        -0.76152979,  0.89088362]])

In [None]:
# numerical_cols

Index(['visitor_points', 'home_points', 'visitor_fgm', 'visitor_fga',
       'visitor_fgp', 'visitor_ftm', 'visitor_fta', 'visitor_ftp',
       'visitor_tpm', 'visitor_tpa', 'visitor_tpp', 'visitor_offReb',
       'visitor_defReb', 'visitor_totReb', 'visitor_assists', 'visitor_pFouls',
       'visitor_steals', 'visitor_turnovers', 'visitor_blocks', 'home_fgm',
       'home_fga', 'home_fgp', 'home_ftm', 'home_fta', 'home_ftp', 'home_tpm',
       'home_tpa', 'home_tpp', 'home_offReb', 'home_defReb', 'home_totReb',
       'home_assists', 'home_pFouls', 'home_steals', 'home_turnovers',
       'home_blocks', 'winner'],
      dtype='object')

In [413]:
remaining_cols = [col for col in encoded_df.columns if col not in numerical_cols]
encoded_df[remaining_cols]

Unnamed: 0,visitor_Atlanta Hawks,visitor_Boston Celtics,visitor_Brooklyn Nets,visitor_Charlotte Hornets,visitor_Chicago Bulls,visitor_Cleveland Cavaliers,visitor_Dallas Mavericks,visitor_Denver Nuggets,visitor_Detroit Pistons,visitor_Golden State Warriors,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [422]:
encoded_df.drop(columns=['winner', 'home_points', 'visitor_points'])

Unnamed: 0,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,visitor_offReb,...,home_Oklahoma City Thunder,home_Orlando Magic,home_Philadelphia 76ers,home_Phoenix Suns,home_Portland Trail Blazers,home_Sacramento Kings,home_San Antonio Spurs,home_Toronto Raptors,home_Utah Jazz,home_Washington Wizards
0,51,91,0.560,13,16,0.813,15,34,0.441,11,...,0,0,0,0,0,0,0,0,0,0
1,43,98,0.439,21,24,0.875,14,37,0.378,11,...,0,0,0,0,0,0,0,0,0,0
2,44,94,0.468,17,22,0.773,15,35,0.429,12,...,0,0,0,0,0,0,0,0,0,0
3,46,87,0.529,10,17,0.588,18,36,0.500,15,...,0,0,0,0,0,0,0,0,0,0
4,51,104,0.490,6,10,0.600,15,44,0.341,17,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,42,90,0.467,10,15,0.667,8,33,0.242,10,...,0,0,0,0,0,0,0,0,0,1
2491,44,92,0.478,13,19,0.684,11,35,0.314,16,...,0,0,0,0,0,0,0,0,0,1
2492,35,76,0.461,26,38,0.684,17,43,0.395,7,...,0,0,0,0,0,0,0,0,0,1
2493,43,88,0.489,15,20,0.750,7,30,0.233,5,...,0,0,0,0,0,0,0,0,0,1


In [423]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Create X and y parameters
X = encoded_df.drop(columns=['winner', 'home_points', 'visitor_points'])
y = encoded_df['winner']

# Splitting into testing/train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create decision tree regression instance
model = DecisionTreeClassifier()

# Fit model
model = model.fit(X_train, y_train)

# Making predictions using scaled data
predictions = model.predict(X_test)

# Print predicted price for tested features
predictions

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,

In [424]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.7655310621242485
Confusion Matrix:
 [[157  62]
 [ 55 225]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73       219
           1       0.78      0.80      0.79       280

    accuracy                           0.77       499
   macro avg       0.76      0.76      0.76       499
weighted avg       0.76      0.77      0.77       499



In [426]:
# # Define the home and visitor teams
home_team = input("Enter Away Team: ")  # Replace with desired home team
visitor_team = input("Enter Home Team: ")  # Replace with desired visitor team

# Create a new data instance with zeros for all one-hot columns
input_data = {col: 0 for col in X.columns}

if f'home_{home_team}' in input_data:
    input_data[f'home_{home_team}'] = 1

if f'visitor_{visitor_team}' in input_data:
    input_data[f'visitor_{visitor_team}'] = 1

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Predict total points
predicted_winner = model.predict(input_df)[0]

winning_team = home_team if  predicted_winner == 1 else visitor_team
print("Predicted Winner:", winning_team)

Predicted Winner: Atlanta Hawks


# Machine Learning Model 3 Version 1.2