In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Source Code for the functions


In [18]:
###Pulling Per game stats###

def player_pg(years):
    """Scrape per-game player stats from Basketball Reference, one CSV per season.

    For each season in ``years`` the ``NBA_<season>_per_game.html`` page is
    downloaded, its table rows are turned into a DataFrame, and the result is
    written to ``<season>_players_pgs.csv`` (relative to the current working
    directory).

    Parameters
    ----------
    years : iterable of int, or a single year
        Season values substituted into the page URL (e.g. ``[2020]``).
        A bare scalar such as ``2020`` is accepted and treated as ``[2020]``.

    Returns
    -------
    None
        The function is called for its side effect of writing CSV files.

    Raises
    ------
    requests.HTTPError
        If the site answers with a 4xx/5xx status instead of the stats page.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Accept a single season as well as a list; without this an int raises
    # TypeError and a string would be iterated character by character.
    if np.isscalar(years):
        years = [years]

    # Columns that are scraped as text but hold numeric values.
    float_col = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
                 '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
                 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    for year in years:
        # Get the corresponding url and parse it
        url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'.format(year)
        response = requests.get(url, timeout=30)
        # Fail loudly on an error page rather than parsing it as a stats table.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Get the header and rows into two separate lists. The first <tr> holds
        # the column names; its leading <th> is dropped because the data rows
        # are built from <td> cells only.
        table_rows = soup.find_all('tr')
        headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]
        player_stats = [[td.get_text() for td in row.find_all('td')]
                        for row in table_rows[1:]]

        # Create the stats dataframe, drop the rows that have no player name
        # (rows without <td> cells come through empty) and replace the
        # remaining NaN values with 0.
        stats = pd.DataFrame(player_stats, columns=headers)
        stats = stats.replace('', np.nan)
        stats = stats.dropna(subset=['Player'])
        stats = stats.fillna(0)

        stats.insert(1, 'Year', year)

        # Change the numerical columns from objects to floats and create a csv
        stats[float_col] = stats[float_col].astype(float)
        csv = '{}_players_pgs.csv'.format(year)
        stats.to_csv(csv, index=False)

    return
        

In [None]:
###Pulling totals stats###

def player_tot(years):
    """Scrape season-total player stats from Basketball Reference, one CSV per season.

    For each season in ``years`` the ``NBA_<season>_totals.html`` page is
    downloaded, its table rows are turned into a DataFrame, and the result is
    written to ``<season>_players_tot.csv`` (relative to the current working
    directory).

    Parameters
    ----------
    years : iterable of int, or a single year
        Season values substituted into the page URL (e.g. ``[2020]``).
        A bare scalar such as ``2020`` is accepted and treated as ``[2020]``.

    Returns
    -------
    None
        The function is called for its side effect of writing CSV files.

    Raises
    ------
    requests.HTTPError
        If the site answers with a 4xx/5xx status instead of the stats page.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Accept a single season as well as a list; without this an int raises
    # TypeError and a string would be iterated character by character.
    if np.isscalar(years):
        years = [years]

    # Columns that are scraped as text but hold numeric values.
    float_col = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
                 '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
                 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    for year in years:
        # Get the corresponding url and parse it
        url = 'https://www.basketball-reference.com/leagues/NBA_{}_totals.html'.format(year)
        response = requests.get(url, timeout=30)
        # Fail loudly on an error page rather than parsing it as a stats table.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Get the header and rows into two separate lists. The first <tr> holds
        # the column names; its leading <th> is dropped because the data rows
        # are built from <td> cells only.
        table_rows = soup.find_all('tr')
        headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]
        player_stats = [[td.get_text() for td in row.find_all('td')]
                        for row in table_rows[1:]]

        # Create the stats dataframe, drop the rows that have no player name
        # (rows without <td> cells come through empty) and replace the
        # remaining NaN values with 0.
        stats = pd.DataFrame(player_stats, columns=headers)
        stats = stats.replace('', np.nan)
        stats = stats.dropna(subset=['Player'])
        stats = stats.fillna(0)

        stats.insert(1, 'Year', year)

        # Change the numerical columns from objects to floats and create a csv
        stats[float_col] = stats[float_col].astype(float)
        csv = '{}_players_tot.csv'.format(year)
        stats.to_csv(csv, index=False)

    return
        

# How to Pull Data

In [1]:
### Either copy and paste the functions above into your notebook, or download Player_Scrapper.py and import them
from Player_Scrapper import *
import numpy as np

In [8]:
# A single season still has to be passed as a list.
year = [2020]

# For multiple seasons, build a list covering 2010 through 2020 inclusive.
years = list(np.arange(2010, 2021, 1))

In [None]:
# Both scrapers take one argument, `years`, which must be a list of years.
# Totals are pulled first, then per-game stats.
for scrape in (player_tot, player_pg):
    scrape(years=year)

# How to Add All Data to One Frame


In [23]:
import pandas as pd
import numpy as np
import glob

In [3]:
###Used to combine all scraped files###

# Directory holding the scraped CSVs; replace the placeholder with a real path.
path = 'use given path' 
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the combined frame reproducible between runs.
# NOTE(review): the pattern picks up every CSV in `path` - per-game files,
# totals files, and a previously written data.csv if it lives there would all
# be stacked together. Confirm that is intended.
all_files = sorted(glob.glob(path + "/*.csv"))

if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise ValueError("No CSV files found in '{}'".format(path))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Stack every file's rows into one frame with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [32]:
# Persist the combined frame to data.csv, leaving the row index out of the file.
frame.to_csv(path_or_buf='data.csv', index=False)