# Scraping Nba Site for Players Stat

In [22]:
import pandas as pd
import requests
import numpy as np
import time

def scrape_nba_stats(season_types, years, file_name):
    df = pd.DataFrame()
    time_taken = time.time()

    for y in years:
        for s in season_types:
            url = f'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season={y}&SeasonType={s}&StatCategory=PTS'
            r = requests.get(url=url).json()

            # Dynamically get table headers from the API response
            table_headers = r['resultSet']['headers']
            temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)
            temp_df2 = pd.DataFrame({'Years': [y] * len(temp_df1),
                                     'Season_Type': [s] * len(temp_df1)})
            temp_df3 = pd.concat([temp_df2, temp_df1], axis=1)
            df = pd.concat([df, temp_df3], axis=0)
            print(f'Finished scraping for the {y} {s}')
            lag = np.random.uniform(low=5, high=40)
            time.sleep(lag)

    print(f'Finished in {round((time.time() - time_taken) / 60, 2)} minutes')
    df.to_excel(file_name, index=False)

# Define parameters
season_types_regular = ['Regular%20Season']
season_types_playoff = ['Playoffs']
years = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# Scrape regular season stats
scrape_nba_stats(season_types_regular, years, 'players_stats_regular.xlsx')

# Scrape playoff stats
scrape_nba_stats(season_types_playoff, years, 'players_stats_playoff.xlsx')


Finished scraping for the 2013-14 Regular%20Season
Finished scraping for the 2014-15 Regular%20Season
Finished scraping for the 2015-16 Regular%20Season
Finished scraping for the 2016-17 Regular%20Season
Finished scraping for the 2017-18 Regular%20Season
Finished scraping for the 2018-19 Regular%20Season
Finished scraping for the 2019-20 Regular%20Season
Finished scraping for the 2020-21 Regular%20Season
Finished scraping for the 2021-22 Regular%20Season
Finished scraping for the 2022-23 Regular%20Season
Finished scraping for the 2023-24 Regular%20Season
Finished in 3.51 minutes
Finished scraping for the 2013-14 Playoffs
Finished scraping for the 2014-15 Playoffs
Finished scraping for the 2015-16 Playoffs
Finished scraping for the 2016-17 Playoffs
Finished scraping for the 2017-18 Playoffs
Finished scraping for the 2018-19 Playoffs
Finished scraping for the 2019-20 Playoffs
Finished scraping for the 2020-21 Playoffs
Finished scraping for the 2021-22 Playoffs
Finished scraping for the 2

# Data Processing

In [32]:
all_datas = []

datas_regular = pd.read_excel('players_stats_regular.xlsx')
datas_playoff = pd.read_excel('players_stats_playoff.xlsx')

all_datas.append(datas_regular)
all_datas.append(datas_playoff)

for datas in all_datas:
    datas.drop(columns=["RANK","EFF"], inplace =True)
    datas['season_start_year'] = datas['Years'].str[:4].astype(int)
    datas['TEAM'].replace(to_replace=['NOP','NOH'], value ='NO', inplace=True)
    
    if datas['Season_Type'][0] == "Regular%20Season" :
        datas['Season_Type'].replace("Regular%20Season", "RS", inplace=True)
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datas['TEAM'].replace(to_replace=['NOP','NOH'], value ='NO', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datas['Season_Type'].replace("Regular%20Season", "RS", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 

In [33]:
all_datas[0].sample(10)

Unnamed: 0,Years,Season_Type,PLAYER_ID,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,season_start_year
1286,2017-18,RS,202687,Bismack Biyombo,1610612753,ORL,82,18.2,2.2,4.3,...,0.65,1.8,4.0,5.7,0.8,0.3,1.2,1.0,5.7,2017
1035,2016-17,RS,203086,Meyers Leonard,1610612757,POR,74,16.5,2.0,5.1,...,0.875,0.4,2.8,3.2,1.0,0.2,0.4,0.5,5.4,2016
658,2015-16,RS,201977,Marcus Thornton,1610612764,WAS,61,18.2,3.5,8.9,...,0.848,0.6,1.8,2.5,1.4,0.8,0.1,0.8,9.7,2015
1684,2019-20,RS,202684,Tristan Thompson,1610612739,CLE,57,30.2,5.1,9.9,...,0.615,4.0,6.2,10.1,2.1,0.6,0.9,1.8,12.0,2019
1770,2019-20,RS,202083,Wesley Matthews,1610612749,MIL,67,24.4,2.5,6.3,...,0.765,0.3,2.1,2.5,1.4,0.6,0.1,0.6,7.4,2019
1882,2020-21,RS,1629628,RJ Barrett,1610612752,NYK,72,34.9,6.5,14.7,...,0.746,0.9,4.8,5.8,3.0,0.7,0.3,1.9,17.6,2020
2722,2023-24,RS,1630241,Sam Merrill,1610612739,CLE,61,17.5,2.6,6.4,...,0.929,0.4,1.6,2.0,1.8,0.3,0.1,0.3,8.0,2023
2234,2021-22,RS,203493,Reggie Bullock Jr.,1610612742,DAL,68,28.0,3.0,7.4,...,0.833,0.5,3.1,3.5,1.2,0.6,0.2,0.6,8.6,2021
2138,2021-22,RS,203992,Bogdan Bogdanovic,1610612737,ATL,63,29.3,5.4,12.6,...,0.843,0.5,3.5,4.0,3.1,1.1,0.2,1.1,15.1,2021
1746,2019-20,RS,203145,Kent Bazemore,1610612758,SAC,68,24.8,2.9,7.9,...,0.769,0.4,3.9,4.3,1.4,1.1,0.6,1.3,8.8,2019
