In [145]:
# NOTES:
# 1.Dont scale the linear regression model to total points
# 2.Feature Selection: Research other features that can be added to the dataframe
# 3.Remove columns that have total counts instead of percentages (i.e FGP vs FGM or FGA)
# 4.Look at correlation of all columns (especially after creating new ones from research)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings


In [146]:
game_statistics_df = pd.read_csv('./data/gameStatistics.csv')
game_statistics_df.head()

Unnamed: 0,visitor_fastBreakPoints,visitor_pointsInPaint,visitor_biggestLead,visitor_secondChancePoints,visitor_pointsOffTurnovers,visitor_longestRun,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,...,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_plusMinus,home_min,home_team
0,15.0,48.0,25.0,5.0,23.0,15.0,120.0,40.0,86.0,46.5,...,31.0,46.0,21.0,27.0,9.0,18.0,4.0,-10.0,240:00,Houston Rockets
1,10.0,40.0,8.0,18.0,12.0,11.0,121.0,43.0,97.0,44.3,...,38.0,46.0,27.0,18.0,1.0,9.0,8.0,-3.0,240:00,Cleveland Cavaliers
2,14.0,36.0,7.0,3.0,6.0,16.0,108.0,38.0,78.0,48.7,...,36.0,54.0,19.0,18.0,8.0,7.0,4.0,8.0,240:00,Toronto Raptors
3,4.0,36.0,9.0,3.0,8.0,10.0,80.0,30.0,81.0,37.0,...,42.0,49.0,21.0,17.0,6.0,9.0,3.0,15.0,240:00,Oklahoma City Thunder
4,16.0,48.0,5.0,12.0,6.0,10.0,105.0,39.0,90.0,43.3,...,42.0,58.0,28.0,18.0,7.0,9.0,11.0,13.0,240:00,Memphis Grizzlies


In [147]:
game_statistics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4072 entries, 0 to 4071
Data columns (total 55 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   visitor_fastBreakPoints     809 non-null    float64
 1   visitor_pointsInPaint       809 non-null    float64
 2   visitor_biggestLead         809 non-null    float64
 3   visitor_secondChancePoints  809 non-null    float64
 4   visitor_pointsOffTurnovers  809 non-null    float64
 5   visitor_longestRun          809 non-null    float64
 6   visitor_points              4070 non-null   float64
 7   visitor_fgm                 4070 non-null   float64
 8   visitor_fga                 4070 non-null   float64
 9   visitor_fgp                 4070 non-null   float64
 10  visitor_ftm                 4070 non-null   float64
 11  visitor_fta                 4070 non-null   float64
 12  visitor_ftp                 4070 non-null   float64
 13  visitor_tpm                 4070 

In [148]:
# Convert values in percentage columns to decimal form
game_statistics_df[['visitor_ftp', 'visitor_fgp', 'visitor_tpp', 'home_ftp', 'home_fgp','home_tpp']] = game_statistics_df[
    ['visitor_ftp', 
     'visitor_fgp', 
     'visitor_tpp', 
     'home_ftp', 
     'home_fgp',
     'home_tpp']
    ].apply(lambda x: x / 100)

In [149]:
# Drop irrelevant columns
game_statistics_df = game_statistics_df.drop(columns=['visitor_fastBreakPoints', 'visitor_pointsInPaint',
       'visitor_biggestLead', 'visitor_secondChancePoints',
       'visitor_pointsOffTurnovers', 'visitor_longestRun','visitor_plusMinus',
       'visitor_min','home_fastBreakPoints',
       'home_pointsInPaint', 'home_biggestLead', 'home_secondChancePoints',
       'home_pointsOffTurnovers', 'home_longestRun','home_plusMinus', 'home_min','game_id'])

# Display
game_statistics_df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,0.333,15.0,31.0,46.0,21.0,27.0,9.0,18.0,4.0,Houston Rockets
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,0.32,8.0,38.0,46.0,27.0,18.0,1.0,9.0,8.0,Cleveland Cavaliers
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,0.314,18.0,36.0,54.0,19.0,18.0,8.0,7.0,4.0,Toronto Raptors
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,0.419,7.0,42.0,49.0,21.0,17.0,6.0,9.0,3.0,Oklahoma City Thunder
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,0.385,16.0,42.0,58.0,28.0,18.0,7.0,9.0,11.0,Memphis Grizzlies


In [150]:
game_statistics_df.isnull().sum()

visitor_points       2
visitor_fgm          2
visitor_fga          2
visitor_fgp          2
visitor_ftm          2
visitor_fta          2
visitor_ftp          2
visitor_tpm          2
visitor_tpa          2
visitor_tpp          2
visitor_offReb       2
visitor_defReb       2
visitor_totReb       2
visitor_assists      2
visitor_pFouls       2
visitor_steals       2
visitor_turnovers    2
visitor_blocks       2
visitor_team         0
home_points          2
home_fgm             2
home_fga             2
home_fgp             2
home_ftm             2
home_fta             2
home_ftp             2
home_tpm             2
home_tpa             2
home_tpp             2
home_offReb          2
home_defReb          2
home_totReb          2
home_assists         2
home_pFouls          2
home_steals          2
home_turnovers       2
home_blocks          2
home_team            0
dtype: int64

In [151]:
df = game_statistics_df.dropna()

# Analysis

In [152]:
df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_tpp,home_offReb,home_defReb,home_totReb,home_assists,home_pFouls,home_steals,home_turnovers,home_blocks,home_team
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,0.333,15.0,31.0,46.0,21.0,27.0,9.0,18.0,4.0,Houston Rockets
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,0.32,8.0,38.0,46.0,27.0,18.0,1.0,9.0,8.0,Cleveland Cavaliers
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,0.314,18.0,36.0,54.0,19.0,18.0,8.0,7.0,4.0,Toronto Raptors
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,0.419,7.0,42.0,49.0,21.0,17.0,6.0,9.0,3.0,Oklahoma City Thunder
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,0.385,16.0,42.0,58.0,28.0,18.0,7.0,9.0,11.0,Memphis Grizzlies


# Feature Engineering

In [153]:
# Look at columns of data frame
df.columns

Index(['visitor_points', 'visitor_fgm', 'visitor_fga', 'visitor_fgp',
       'visitor_ftm', 'visitor_fta', 'visitor_ftp', 'visitor_tpm',
       'visitor_tpa', 'visitor_tpp', 'visitor_offReb', 'visitor_defReb',
       'visitor_totReb', 'visitor_assists', 'visitor_pFouls', 'visitor_steals',
       'visitor_turnovers', 'visitor_blocks', 'visitor_team', 'home_points',
       'home_fgm', 'home_fga', 'home_fgp', 'home_ftm', 'home_fta', 'home_ftp',
       'home_tpm', 'home_tpa', 'home_tpp', 'home_offReb', 'home_defReb',
       'home_totReb', 'home_assists', 'home_pFouls', 'home_steals',
       'home_turnovers', 'home_blocks', 'home_team'],
      dtype='object')

In [164]:
# Calculate Free Throw Attempt Rate
df['visitor_fta_rate'] = df['visitor_fta'] / df['visitor_fga']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_fta_rate'] = df['visitor_fta'] / df['visitor_fga']


In [None]:
# Benchmarks for TO% in the NBA:
# Excellent: Less than 12%
# Teams that excel at ball control and avoid turnovers.
# Good: 12% to 14%
# Solid ball-handling and decision-making.
# Average: 14% to 16%
# Typical for most teams; manageable turnovers without being costly.
# Poor: Greater than 16%
# Indicates sloppy play or excessive risk-taking.

# Calculate number of possessions
df['visitor_possessions'] = (df['visitor_fga'] + 0.44  * df['visitor_fta'] - df['visitor_offReb'] + df['visitor_turnovers'])
df['visitor_to%'] = (df['visitor_turnovers'] / df['visitor_possessions'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_possessions'] = (df['visitor_fga'] + 0.44  * df['visitor_fta'] - df['visitor_offReb'] + df['visitor_turnovers'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_to%'] = (df['visitor_turnovers'] / df['visitor_possessions'])


In [156]:
# Calculate Turn Over Ratio (TO)
# (TO * 100) / (FGA + (FTA * 0.44) + AST + TO
df['visitor_turn_over_ratio'] = (df['visitor_turnovers'] * 100) / (df['visitor_fga'] + ((df['visitor_fta']) * 0.44) + df['visitor_assists'] + df['visitor_turnovers'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_turn_over_ratio'] = (df['visitor_turnovers'] * 100) / (df['visitor_fga'] + ((df['visitor_fta']) * 0.44) + df['visitor_assists'] + df['visitor_turnovers'])


In [157]:
# Create a column for both the home team and visitor team to calculate effective field goal percentage
# EFG - A statistics that adjust field goal percentage to show that three point shots are worth more than two point shots
df['visitor_efg%'] = (df['visitor_fgm'] + (0.5 * df['visitor_tpm'])) / df['visitor_fga']
df['home_efg%'] = (df['home_fgm'] + (0.5 * df['home_tpm'])) / df['home_fga']


# Measures the teams efficiency at shooting the ball
# Calculate the teams True Shooting Percentage
df['visitor_tsp'] = (df['visitor_points'] / (2 * (df['visitor_fga'] + (0.44 * df['visitor_fta']))))
df['home_tsp'] = (df['home_points'] / (2 * (df['home_fga'] + (0.44 * df['home_fta']))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_efg%'] = (df['visitor_fgm'] + (0.5 * df['visitor_tpm'])) / df['visitor_fga']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['home_efg%'] = (df['home_fgm'] + (0.5 * df['home_tpm'])) / df['home_fga']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['visitor_tsp'] = (df['visitor_points

In [165]:
# Display
df.head()

Unnamed: 0,visitor_points,visitor_fgm,visitor_fga,visitor_fgp,visitor_ftm,visitor_fta,visitor_ftp,visitor_tpm,visitor_tpa,visitor_tpp,...,home_blocks,home_team,visitor_efg%,home_efg%,visitor_tsp,home_tsp,visitor_turn_over_ratio,visitor_possessions,visitor_to%,visitor_fta_rate
0,120.0,40.0,86.0,0.465,26.0,31.0,0.839,14.0,33.0,0.424,...,4.0,Houston Rockets,0.546512,0.516129,0.602168,0.547373,10.398099,102.64,0.136399,0.360465
1,121.0,43.0,97.0,0.443,19.0,21.0,0.905,16.0,38.0,0.421,...,8.0,Cleveland Cavaliers,0.525773,0.619048,0.569465,0.623942,1.523926,94.24,0.021222,0.216495
2,108.0,38.0,78.0,0.487,17.0,19.0,0.895,15.0,34.0,0.441,...,4.0,Toronto Raptors,0.583333,0.473958,0.625289,0.524792,9.572431,95.36,0.125839,0.24359
3,80.0,30.0,81.0,0.37,12.0,22.0,0.545,8.0,41.0,0.195,...,3.0,Oklahoma City Thunder,0.419753,0.517241,0.441112,0.527309,8.874689,95.68,0.104515,0.271605
4,105.0,39.0,90.0,0.433,15.0,24.0,0.625,12.0,35.0,0.343,...,11.0,Memphis Grizzlies,0.5,0.529412,0.522076,0.541086,8.114488,100.56,0.109387,0.266667


In [155]:
# # Create first target column 'winner'
# df['home_win'] = df.apply(
#     lambda row: 1 if row['home_points'] > row['visitor_points'] else 0,
#     axis=1
# )
# # Create second target column 'total_points'
# df['total_points'] = df['home_points'] + df['visitor_points']

# # Display
# df.head()