In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
def pull_stats(year):   

    # create url based on selected year
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    soup = BeautifulSoup(urlopen(url))

    headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]

    # remove the 'rank' column
    headers = headers[1:]

    # find rows except for the header row
    rows = soup.findAll('tr')[1:]
    
    # format data
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
    
    # create pandas dataframe
    stats = pd.DataFrame(player_stats, columns = headers)
    
    # create a year column
    stats['Year'] = year
    
    return stats

In [4]:
def pull_salary(year, pages=50):
    full = []
    
    for i in range(1,pages+1):
        url = "http://www.espn.com/nba/salaries/_/year/{}/page/{}".format(year, i)

        soup = BeautifulSoup(urlopen(url))

        headers = [td.getText() for td in soup.findAll('tr')[0].findAll('td')]

        rows = soup.findAll('tr')[1:]

        player_salary = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]

        salary = pd.DataFrame(player_salary, columns = headers)
        salary['Year'] = year

        for i in range(len(salary)):
            salary['NAME'][i] = salary['NAME'][i].split(',')[0]
        
        full.append(salary)
    
    #concat pages
    full = pd.concat(full)
    
    #remove header rows
    full.drop(full[full['RK']=='RK'].index, inplace=True)
    
    #reset index
    full = full.reset_index().drop(columns='index')
    
    return full

In [5]:
def create_df(years):
    
    final = pd.DataFrame()
    
    for year in years:
        temp_sta = pull_stats(year)
        temp_sal = pull_salary(year)
        
        #create new dataframe with the correct columns
        cols = list(temp_sta.columns)
        cols.append('Salary')
        new = pd.DataFrame(columns=cols)
        
        #loop through stats df to find index for player salary df
        for i in range(len(temp_sta)):
            player = temp_sta['Player'][i]
            index = temp_sal[temp_sal['NAME']==player].index.values
            
            #only add players that we know salaries for
            if index.size == 0:
                continue
            else:
                new = new.append(temp_sta.iloc[i])
                new['Salary'][i] = temp_sal['SALARY'][index].values[0]
        
        #add the years dataframe to the final dataframe
        final = final.append(new)
            
    return final

In [6]:
test = create_df([2018, 2019])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
test.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Salary
2,Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,0.629,...,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9,2018,"$22,471,910"
3,Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,0.512,...,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9,2018,"$2,955,840"
5,Cole Aldrich,C,29,MIN,21,0,2.3,0.2,0.7,0.333,...,0.6,0.7,0.1,0.1,0.0,0.0,0.5,0.6,2018,"$7,300,000"
6,LaMarcus Aldridge,C,32,SAS,75,75,33.5,9.2,18.0,0.51,...,5.2,8.5,2.0,0.6,1.2,1.5,2.1,23.1,2018,"$21,461,010"
7,Jarrett Allen,C,19,BRK,72,31,20.0,3.3,5.5,0.589,...,3.4,5.4,0.7,0.4,1.2,1.1,2.0,8.2,2018,"$2,034,120"


In [8]:
test.to_csv('Final.csv')