# NBA Season

### Data Initialization

We are pulling the NBA season stats throughout the years from the URL of https://www.kaggle.com/datasets/justinas/nba-players-data/data

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import requests
%matplotlib inline

**Get the kaggle dataset**

In [77]:
# read a csv file into a df
playerData = pd.read_csv('nba.csv')

teamNames = pd.read_csv('unique_teams.csv')

playerData.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.5,86.18248,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.03,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.2,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.2,102.0582,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.5,0.064,1996-97


**Put abbreviations to Cities**

Some cities have multiple as franchises have changed

Cities had to be added as ESPN win rates used cities

In [78]:
# merge the two dataframes on team_abbreviation with df and dfTeams on abbreviations
playerData = pd.merge(playerData, teamNames, left_on='team_abbreviation', right_on='abbreviations')

# drop the team_abbreviation column and abbreviations column
playerData = playerData.drop(columns=['team_abbreviation', 'abbreviations'])

In [79]:
playerData.head()

Unnamed: 0.1,Unnamed: 0,player_name,age,player_height,player_weight,college,country,draft_year,draft_round,draft_number,...,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season,team
0,0,Randy Livingston,22.0,193.04,94.800728,Louisiana State,USA,1996,2,42,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Houston
1,18,Hakeem Olajuwon,34.0,213.36,115.66596,Houston,Nigeria,1984,1,1,...,9.2,3.0,6.5,0.075,0.206,0.308,0.558,0.158,1996-97,Houston
2,29,Emanual Davis,28.0,195.58,87.996848,Delaware State,USA,Undrafted,Undrafted,Undrafted,...,1.7,2.0,6.6,0.011,0.098,0.144,0.565,0.191,1996-97,Houston
3,61,Joe Stephens,24.0,200.66,95.25432,Arkansas-Little Rock,USA,Undrafted,Undrafted,Undrafted,...,1.5,0.0,-17.4,0.25,0.111,0.279,0.3,0.0,1996-97,Houston
4,97,Eddie Johnson,38.0,200.66,97.52228,Illinois,USA,1981,2,29,...,2.7,1.0,4.1,0.034,0.126,0.22,0.541,0.102,1996-97,Houston


In [80]:
# drop the unnamed column
playerData.drop('Unnamed: 0', axis=1, inplace=True)

playerData.dtypes

player_name       object
age              float64
player_height    float64
player_weight    float64
college           object
country           object
draft_year        object
draft_round       object
draft_number      object
gp                 int64
pts              float64
reb              float64
ast              float64
net_rating       float64
oreb_pct         float64
dreb_pct         float64
usg_pct          float64
ts_pct           float64
ast_pct          float64
season            object
team              object
dtype: object

In [81]:
# look for null values
playerData.isnull().sum()

player_name         0
age                 0
player_height       0
player_weight       0
college          1852
country             0
draft_year          0
draft_round         0
draft_number        0
gp                  0
pts                 0
reb                 0
ast                 0
net_rating          0
oreb_pct            0
dreb_pct            0
usg_pct             0
ts_pct              0
ast_pct             0
season              0
team                0
dtype: int64

### Data Preprocessing

We need to deal with the columns we want to keep and also all the categorial data cols of:

player_name           object

team_abbreviation     object

college               object

country               object

draft_year            object

draft_round           object

draft_number          object

season                object

#### **Drop Some of Them**

In [82]:
playerData.drop(['player_name', 'college', 'draft_year', 'draft_round', 'draft_number', 'country'], axis=1, inplace=True)

In [83]:
playerData.head()

Unnamed: 0,age,player_height,player_weight,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season,team
0,22.0,193.04,94.800728,64,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Houston
1,34.0,213.36,115.66596,78,23.2,9.2,3.0,6.5,0.075,0.206,0.308,0.558,0.158,1996-97,Houston
2,28.0,195.58,87.996848,13,5.0,1.7,2.0,6.6,0.011,0.098,0.144,0.565,0.191,1996-97,Houston
3,24.0,200.66,95.25432,2,1.5,1.5,0.0,-17.4,0.25,0.111,0.279,0.3,0.0,1996-97,Houston
4,38.0,200.66,97.52228,52,8.2,2.7,1.0,4.1,0.034,0.126,0.22,0.541,0.102,1996-97,Houston


#### **Add in Win Rates**

In [84]:
import pandas as pd
import requests
import time
from io import StringIO

def winRateFromYear(year, max_retries=3):
    print(f'Getting data from {year}.')
    url = f'https://www.espn.com/nba/stats/rpi/_/year/{year}'
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an error for bad responses
            
            # Read the data into a DataFrame using StringIO
            winRateDf = pd.read_html(StringIO(response.text))[0]

            # Make the first row the header
            winRateDf.columns = winRateDf.iloc[1]

            # Drop the first two rows
            winRateDf.drop([0, 1], inplace=True)

            # Calculate point differential
            winRateDf['PT DIFF'] = winRateDf['PF'].astype(float) - winRateDf['PA'].astype(float)

            # Select relevant columns
            winRateDf = winRateDf[['TEAM', 'RPI', 'PCT', 'PT DIFF']]
            
            return winRateDf
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 404:
                print(f'404 error occurred: {http_err}. Retrying...')
                time.sleep(2)  # Optional: wait before retrying
            else:
                print(f'HTTP error occurred: {http_err}.')
                break  # Exit the loop for other errors
        except Exception as err:
            print(f'Other error occurred: {err}')
            break  # Exit the loop for unexpected errors

# Example usage
# df = winRateFromYear(2023)
# print(df)


In [85]:
# merge the two dataframes on team with df and teams on team where seasonEncoded is 7
winRateDf = pd.DataFrame()

for year in range(2004, 2024):
    winRateDf = pd.concat([winRateDf, winRateFromYear(year)], ignore_index=True)
    time.sleep(1)

winRateDf.head()

Getting data from 2004.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2005.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2006.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2007.
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2007. Retrying...


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2008.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2009.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2010.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2011.
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2011. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2011. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2011. Retrying...
Getting data from 2012.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2013.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2014.


  winRateDf = pd.read_html(response.text)[0]


Getting data from 2015.
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2015. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2015. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2015. Retrying...
Getting data from 2016.
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2016. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2016. Retrying...
404 error occurred: 404 Client Error: Not Found for url: https://www.espn.com/nba/stats/rpi/_/year/2016. Retrying...


**Merge DF2 with df on Team Names**

In [75]:
# merge the two dataframes on team with df and teams on team where seasonEncoded is 7
winRateDf = pd.DataFrame()
seasonEncoded = 7

for year in range(2004, 2024):
    winRateDf = winRateDf.append(winRateFromYear(year, seasonEncoded))
    seasonEncoded += 1

winRateDf.tail()

AttributeError: 'DataFrame' object has no attribute 'append'

**TODO: MERGE THE DFS**

In [12]:
merged_df = pd.merge(df, winRateDf, left_on=['team', 'seasonEncoded'], right_on=['TEAM', 'seasonEncoded'])

merged_df.head()

Unnamed: 0,age,player_height,player_weight,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,team,seasonEncoded,TEAM,RPI,PCT,PT DIFF
0,40.0,205.74,111.13004,7,1.3,0.7,0.3,-7.9,0.0,0.217,0.165,0.521,0.1,Houston,7,Houston,0.521,0.549,142.0
1,33.0,200.66,122.46984,52,5.0,3.9,0.6,-5.0,0.104,0.175,0.152,0.538,0.063,Houston,7,Houston,0.521,0.549,142.0
2,28.0,193.04,97.52228,80,15.8,4.5,3.2,1.8,0.015,0.112,0.2,0.535,0.144,Houston,7,Houston,0.521,0.549,142.0
3,23.0,205.74,100.243832,45,3.1,1.6,0.7,2.0,0.021,0.136,0.159,0.477,0.103,Houston,7,Houston,0.521,0.549,142.0
4,29.0,195.58,104.32616,19,0.6,1.0,0.5,-8.4,0.01,0.167,0.093,0.278,0.132,Houston,7,Houston,0.521,0.549,142.0


**Encode Season**

In [None]:
# categorical code season col but i want to keep the original
df['seasonEncoded'] = pd.Categorical(df['season']).codes

# print the unique values of season and seasonEncoded
unique_pairings = df[['season', 'seasonEncoded']].drop_duplicates()
print("Season unique pairings:\n", unique_pairings)

# drop the rows where seasonEncoded is less than 7. This keeps the season of 03-04 and later
df = df[df['seasonEncoded'] >= 7]

df.drop('season', axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

KeyError: 'season'