# NBA Season

### Data Initialization

The NBA player season statistics used here come from the Kaggle dataset at https://www.kaggle.com/datasets/justinas/nba-players-data/data

In [41]:
import pandas as pd

pd.__version__

'2.2.3'

**Get the kaggle dataset**

In [42]:
# Load the team lookup table (it carries the `abbreviations` key used for the
# join further down) and the player-season stats
dfTeams = pd.read_csv("unique_teams.csv")
df = pd.read_csv("nba.csv")

# Preview the first rows of the player stats
df.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.5,86.18248,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.03,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.2,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.2,102.0582,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.5,0.064,1996-97


**Put abbreviations to Cities**

Some cities map to multiple abbreviations because franchises have changed over the years

City names had to be added because the win-rate tables (ESPN, and the teamrankings.com pages read further down) identify teams by city

In [43]:
# Attach the dfTeams columns to every player row by matching the player's
# team abbreviation (default inner join, so unmatched rows are left out),
# then discard both abbreviation key columns
df = df.merge(
    dfTeams,
    left_on='team_abbreviation',
    right_on='abbreviations',
).drop(columns=['team_abbreviation', 'abbreviations'])

In [44]:
# Drop the leftover 'Unnamed: 0' column. Reassigning (instead of inplace) with
# errors='ignore' lets this cell be re-run without raising a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Column types of the remaining frame
df.dtypes

player_name       object
age              float64
player_height    float64
player_weight    float64
college           object
country           object
draft_year        object
draft_round       object
draft_number      object
gp                 int64
pts              float64
reb              float64
ast              float64
net_rating       float64
oreb_pct         float64
dreb_pct         float64
usg_pct          float64
ts_pct           float64
ast_pct          float64
season            object
team              object
dtype: object

In [45]:
# Count the missing values in each column
df.isna().sum()

player_name         0
age                 0
player_height       0
player_weight       0
college          1852
country             0
draft_year          0
draft_round         0
draft_number        0
gp                  0
pts                 0
reb                 0
ast                 0
net_rating          0
oreb_pct            0
dreb_pct            0
usg_pct             0
ts_pct              0
ast_pct             0
season              0
team                0
dtype: int64

### Data Preprocessing

We need to decide which columns to keep and how to handle the categorical columns:

player_name           object

team                  object

college               object

country               object

draft_year            object

draft_round           object

draft_number          object

season                object

#### **Drop Some of Them**

In [46]:
# Discard the player name, college and draft columns
df = df.drop(columns=['player_name', 'college', 'draft_year', 'draft_round', 'draft_number'])

#### **Country**

In [47]:
# Encode each country as its integer category code
country_codes = pd.Categorical(df['country']).codes
df['country'] = country_codes

# Report how many rows fall under each country code
country_counts = df['country'].value_counts()
print("Country value counts:\n\n%s" % country_counts)

# The encoded column is not kept for now
df = df.drop(columns='country')

Country value counts:

country
75    10569
12      203
25      189
2       100
63       93
      ...  
29        1
70        1
20        1
14        1
66        1
Name: count, Length: 82, dtype: int64


**Encode Season**

In [48]:
# Encode each season label as an integer category code; the label column is
# kept for now so the code/label pairs can be reported
df['seasonEncoded'] = pd.Categorical(df['season']).codes

# Show which season label each code stands for
unique_pairings = df.loc[:, ['season', 'seasonEncoded']].drop_duplicates()
print("Season unique pairings:\n", unique_pairings)

# Keep codes 7 and above (2003-04 and later), then drop the label column and
# renumber the rows from zero
df = (
    df.loc[df['seasonEncoded'] >= 7]
    .drop(columns='season')
    .reset_index(drop=True)
)

Season unique pairings:
         season  seasonEncoded
0      1996-97              0
413    1997-98              1
824    1998-99              2
1233   1999-00              3
1639   2000-01              4
2051   2001-02              5
2477   2002-03              6
2905   2003-04              7
3347   2004-05              8
3811   2005-06              9
4269   2006-07             10
4727   2007-08             11
5178   2008-09             12
5623   2009-10             13
6065   2010-11             14
6517   2011-12             15
6995   2012-13             16
7464   2013-14             17
7946   2014-15             18
8438   2015-16             19
8914   2016-17             20
9400   2017-18             21
9940   2018-19             22
10470  2019-20             23
10999  2020-21             24
11539  2021-22             25
12144  2022-23             26


#### **Add in Win Rates**

In [49]:
import requests

def winRateFromYear(year, seasonEncoded):
    """Fetch one table of team win percentages from teamrankings.com.

    Args:
        year: Year used to build the ``{year}-06-16`` date in the request URL;
            the table column labelled ``year - 1`` is taken as the win
            percentage.
        seasonEncoded: Value written to every row of the ``seasonEncoded``
            column.

    Returns:
        DataFrame with the columns ``Team``, ``Win PCT`` and ``seasonEncoded``.
    """
    url = f'https://www.teamrankings.com/nba/stat/win-pct-all-games?date={year}-06-16'

    # read_html returns a list of tables; only the first one is used
    standings = pd.read_html(url)[0]

    season_column = str(year - 1)
    labelled = standings.assign(**{
        'Win PCT': standings[season_column],
        'seasonEncoded': seasonEncoded,
    })

    return labelled[['Team', 'Win PCT', 'seasonEncoded']]

**Collect the win rates for every season (scrape them, or load the saved CSV)**

In [50]:
import time

def getWinRates(startYear=2004, endYear=2024, firstSeasonEncoded=7):
    """Collect team win percentages for a run of consecutive years.

    Calls ``winRateFromYear`` once per year in ``range(startYear, endYear)``,
    pairing the first year with ``firstSeasonEncoded`` and raising the code by
    one for each following year.

    Args:
        startYear: First year passed to ``winRateFromYear`` (inclusive).
        endYear: Year at which to stop (exclusive).
        firstSeasonEncoded: Season code paired with ``startYear``.

    Returns:
        One DataFrame holding every per-year table stacked in year order with
        a fresh 0..n-1 index, or an empty DataFrame if the year range is empty.
    """
    # Build all per-year tables first and concatenate once: growing a frame
    # with concat inside the loop re-copies the accumulated rows on every pass
    yearlyTables = [
        winRateFromYear(year, seasonEncoded)
        for seasonEncoded, year in enumerate(range(startYear, endYear), start=firstSeasonEncoded)
    ]

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not yearlyTables:
        return pd.DataFrame()

    return pd.concat(yearlyTables, ignore_index=True)

# Set to True to rebuild the win rates with getWinRates(); False loads the
# saved winRate.csv instead
run = False

winRateDf = getWinRates() if run else pd.read_csv('winRate.csv')


**Merge the player stats with the team win rates**

In [52]:
# Match each player row to its team's win rate for the same season
# (default inner join: rows without a match on both keys are left out)
merged_df = pd.merge(df, winRateDf, left_on=['team', 'seasonEncoded'], right_on=['Team', 'seasonEncoded'])

# 'Unnamed: 0' only exists when winRateDf was loaded from winRate.csv; a
# freshly scraped frame has just Team / Win PCT / seasonEncoded, so tolerate
# the column being absent instead of raising a KeyError
merged_df = merged_df.drop(columns='Unnamed: 0', errors='ignore')

merged_df.head()

Unnamed: 0,age,player_height,player_weight,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,team,seasonEncoded,Team,Win PCT
0,28.0,177.8,77.11064,56,4.7,2.0,3.6,-4.4,0.016,0.115,0.156,0.475,0.343,Milwaukee,7,Milwaukee,0.483
1,25.0,203.2,98.883056,42,2.2,1.6,0.3,-5.0,0.071,0.133,0.15,0.438,0.065,Boston,7,Boston,0.419
2,29.0,220.98,117.93392,81,15.3,8.1,1.3,-3.7,0.122,0.163,0.229,0.541,0.074,Cleveland,7,Cleveland,0.427
3,32.0,182.88,79.3786,16,2.9,0.9,0.9,-16.0,0.016,0.077,0.128,0.498,0.146,Washington,7,Washington,0.305
4,31.0,208.28,111.13004,23,18.7,8.7,4.6,-0.7,0.065,0.198,0.289,0.456,0.227,Sacramento,7,Sacramento,0.66
