In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [2]:
# Create url templates for each kind of stats
per_g_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_per_game.html"
adv_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_advanced.html"
tot_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_totals.html"
per_36m_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_minute.html"
per_100p_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_poss.html"

# Put all the URL templates into a list
url_template_list = [per_g_url_template, adv_url_template, tot_url_template, 
                     per_36m_url_template,]

In [3]:
# Create empty lists to store data before appending to Dataframe
column_headers = []
player_data = []
# Create empty DataFrame for following functions to fill
df = pd.DataFrame()

In [4]:
# Empty DataFrames for each set of pages
df_adv = pd.DataFrame()
df_per_g = pd.DataFrame()
df_tot = pd.DataFrame()
df_per_36m = pd.DataFrame()
#df_per_100p = pd.DataFrame

# Create df_list of DataFrames for looping
df_list = [df_per_g, df_adv, df_tot, df_per_36m]

In [5]:
# Get column headers from each page
# Assigns a new list of column headers each time this is called
def get_column_headers(soup):
    headers = []
    for th in soup.find('tr').findAll('th'):
        #print th.getText()
        headers.append(th.getText())
    #print headers # this line was for a bug check
    # Assign global variable to headers gathered by function
    return headers    
    #column_headers = [th.getText() for th in soup.find('tr').findAll('th')]

In [6]:
# old function that's a mess
def get_player_data(soup):
    temp_player_data = []
    for i in range(len(soup.findAll('tr')[1:])):
        # temp list to store player data
        player_row = []
        
        # Loop through 'td' tags to extract player data
        for td in soup.findAll('tr')[1:][i].findAll('td'):
            player_row.append(td.getText())
        
        # Append data to a list    
        temp_player_data.append(player_row)
        
        # Replace global variable with gathered player data
    print(temp_player_data)
    player_data = temp_player_data

In [7]:
# Function to get player data from each page
def get_player_data(soup):
    # Temporary list within function to store data
    temp_player_data = []
    
    data_rows = soup.findAll('tr')[1:] # skip first row
    for i in range(len(data_rows)): # loop through each table row
        player_row = [] # empty list for each player row
        for td in data_rows[i].findAll('td'):
            player_row.append(td.getText()) # append separate data points
        temp_player_data.append(player_row) # append player row data
    return temp_player_data

In [8]:
def scrape_page(url):
    r = requests.get(url) # get the url
    soup = BeautifulSoup(r.text, 'html.parser') # Create BS object
    
    # call function to get column headers
    column_headers = get_column_headers(soup)
    
    # call function to get player data
    player_data = get_player_data(soup)
    
    # input data to DataFrame
    # Skip first value of column headers, 'Rk'
    df = pd.DataFrame(player_data, columns = column_headers[1:])
    
    return df

In [9]:
def get_season(input_year):
    first_yr = input_year - 1
    season = str(first_yr) + "-" + str(input_year)[2:]
    return season

In [10]:
# This function drops empty rows an columns, drops duplicates, and changes
# % character in columns
def gen_cleaning(df):
    # Convert values to numeric values first
    df = df.apply(pd.to_numeric, errors = 'ignore')
    
    # Drop columns with no data
    df.dropna(axis = 1, how = "all", inplace = True)
    
    # Drop rows with no data
    df.dropna(axis = 0, how = "all", inplace = True)
    
    # Remove duplicates player inputs; ie. players who were traded
    # I only kept the TOT per game season values
    #df.drop_duplicates(["Player"], keep = "first", inplace = True)
    
    # Change % symbol to _perc
    df.columns = df.columns.str.replace('%', '_perc')
    
    return df

In [11]:
# This function scrapes player data from multiple pages by start and end years
def scrape_pages(url_template, start_year, end_year, output_df):
    count = 0 
    for year in range(start_year, (end_year+1)):
        url = url_template.format(year = year) # grab URL per year
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html5lib') # Create soup item
        
        # Check to grab column headers
        if count == 0: # only append column headers once
            columns = get_column_headers(soup)
            count += 1
            
        # grab player data for each year
        player_data = get_player_data(soup)
        
        # Create temporary DataFrame first for each year
        # Duplicates are removed before putting into bigger DataFrame
        # These duplicates come from players playing on multiple teams in one season
        # This script only keeps the TOT output as Tm
        year_df = pd.DataFrame(player_data, columns = columns[1:])
        year_df.drop_duplicates(['Player'], keep = 'first', inplace = True)
        year_df.insert(0, 'Season', get_season(year)) # insert season year column
        
        # Append to big DataFrame for detailed cleaning
        output_df = output_df.append(year_df, ignore_index = True)
        
    # Do common, general cleaning practices
    output_df = gen_cleaning(output_df)
        
    return output_df

In [12]:
# This bunch of code is just for me to check things as I go

#url = "https://www.basketball-reference.com/leagues/NBA_2006_per_game.html"
#r = requests.get(url)
#soup = BeautifulSoup(r.text, 'html.parser')
#column_headers = get_column_headers(soup)
#player_data = get_player_data(soup)
#df_test = pd.DataFrame(player_data, columns = column_headers[1:])
#df_test = gen_cleaning(df_test)

In [13]:
#df_test.sort_values('PS/G', ascending = False)

In [14]:
#df_test[df_test['Player'] == 'Kobe Bryant']

In [15]:
# Fill each DataFrame with data scraped from their respective pages
# Each print statement is a check for if any pages or functions give issues
# Added timer to check how long this was taking

start = time.time()

df_per_g = scrape_pages(per_g_url_template, 1981, 2017, df_per_g)
print("Finished per g")
df_adv = scrape_pages(adv_url_template, 1981, 2017, df_adv)
print("Finished adv")
df_tot = scrape_pages(tot_url_template, 1981, 2017, df_tot)
print("Finished tots")
df_per_36m = scrape_pages(per_36m_url_template, 1981, 2017, df_per_36m)
print("Finished per 36m")

end = time.time()
print("Time elapsed :" +str((end - start) / 60) + " minutes")

Finished per g
Finished adv
Finished tots
Finished per 36m
Time elapsed :12.929147251447041 minutes


In [16]:
# Check all column names to see what needs to be cleaned

print("totals")
print(list(df_tot))
print("per game")
print(list(df_per_g))
print("per 36 minutes")
print(list(df_per_36m))
print("advanced")
print(list(df_adv))

totals
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'eFG_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
per game
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'eFG_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']
per 36 minutes
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
advanced
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS_perc', '3PAr', 'FTr', 'ORB_perc', 'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc', 'BLK_perc', 'TOV_perc', 'USG_perc', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']


In [17]:
# Label columns properly by adding "_tot" to the end of some column values
df_tot.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]] = \
[df_tot.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]][col] + "_tot" for col in range(9)]

df_tot.columns.values[21:30] = [df_tot.columns.values[21:30][col] + \
"_tot" for col in range(9)]

In [18]:
# Check column titles again
list(df_tot)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot']

In [19]:
# drop _perc columns from per_g and per_36m
# Never mind, drop duplicates later on
# Add _per_g and _per_36m to column values
# Add _per_G to some values in df_per_g
df_per_g.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]] = \
[df_per_g.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]][col] + "_per_G" for col in range(9)]

df_per_g.columns.values[21:29] = [df_per_g.columns.values[21:30][col] + \
"_per_G" for col in range(8)]

# Rename PS/G to PTS_per_G
df_per_g.rename(columns={'PS/G': 'PTS_per_G'}, inplace = True)

In [20]:
df_per_36m.columns.values[[7, 8, 9, 11, 12, 14, 15, 18, 19]]

array(['MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FTA', 'FT_perc'], dtype=object)

In [21]:
# Check if proper values were changed
list(df_per_g)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 'FG_perc',
 '3P_per_G',
 '3PA_per_G',
 '3P_perc',
 '2P_per_G',
 '2PA_per_G',
 '2P_perc',
 'eFG_perc',
 'FT_per_G',
 'FTA_per_G',
 'FT_perc',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G']

In [22]:
df_per_36m.columns.values[[8, 9, 11, 12, 14, 15, 17, 18]] = \
[df_per_36m.columns.values[[8, 9, 11, 12, 14, 15, 17, 18]][col] + "_per_36m" \
for col in range(8)]

df_per_36m.columns.values[20:30] = [df_per_36m.columns.values[20:30][col] + "_per_36m" \
                                   for col in range(9)]

In [23]:
# Check columns were changed properly
list(df_per_36m)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG_per_36m',
 'FGA_per_36m',
 'FG_perc',
 '3P_per_36m',
 '3PA_per_36m',
 '3P_perc',
 '2P_per_36m',
 '2PA_per_36m',
 '2P_perc',
 'FT_per_36m',
 'FTA_per_36m',
 'FT_perc',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m']

In [24]:
# Find where '\xa0' columns are for removal
print(df_adv.columns[-5])
print(df_adv.columns[19])

WS/48
OWS


In [25]:
# Drop '\xa0' columns, last one first
#df_adv.drop(df_adv.columns[-5], axis = 1, inplace = True)
#df_adv.drop(df_adv.columns[19], axis = 1, inplace = True)

In [26]:
list(df_adv)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [27]:
df_adv.rename(columns = {'WS/48' : 'WS_per_48'}, inplace = True)

In [28]:
# Check to see if columns were dropped properly
list(df_adv)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 'WS',
 'WS_per_48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [29]:
# Merge dataframes later on season, player name, and team
# Order of merges: tots, per_g, per_36m, adv
# DFs: df_tot, df_per_g, df_per_36m, df_adv
# Common things: Season, Player, Pos, Age, Tm, G

In [30]:
df_all = pd.merge(df_tot, df_per_g, how = "left", 
                 on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'FT_perc',
                      '3P_perc', '2P_perc', 'FG_perc', 'eFG_perc'])

In [31]:
df_all = pd.merge(df_all, df_per_36m, how = "left",
                 on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'FT_perc',
                      '3P_perc', '2P_perc', 'FG_perc'])

In [32]:
df_all = pd.merge(df_all, df_adv, how = "left",
                on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G'])

In [33]:
# Check columns to make sure they're all right
list(df_all)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'MP_x',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'MP_y',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',

In [34]:
# Try to drop duplicate MP columns
list(df_all.drop(['MP_x', 'MP_y'], axis = 1))

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 

In [35]:
df_all.drop(['MP_x', 'MP_y'], axis = 1, inplace = True)

In [36]:
# Final check of columns
list(df_all)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 

In [64]:
# First check length of dataframe
print(len(df_all))

15234


In [65]:
# Fill Null values with 0
df_all.fillna(0, inplace = True)

In [66]:
# Address ambiguous positions and combination positions
df = df_all.groupby(['Pos'])['Pos'].nunique()
df

Pos
C        1
C-PF     1
C-SF     1
PF       1
PF-C     1
PF-SF    1
PG       1
PG-SF    1
PG-SG    1
SF       1
SF-PF    1
SF-SG    1
SG       1
SG-PF    1
SG-PG    1
SG-SF    1
Name: Pos, dtype: int64

In [67]:
# Remove rows of 0s
df_all = df_all[df_all['Pos'] != 0]

# Check df_all length again
print(len(df_all))

15234


In [41]:
# I think this position is a mistake
# Check the value to see the player
df_all[df_all['Pos'] == 'C-SF']

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
10817,2007-08,Bobby Jones,C-SF,24.0,TOT,47.0,2.0,531.0,60.0,140.0,...,14.7,15.9,0.4,0.4,0.8,0.07,-1.7,-1.5,-3.1,-0.2


In [42]:
# Check Bobby Jones' actual, commonly played position
df_all[df_all['Player'] == 'Bobby Jones']

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
142,1980-81,Bobby Jones,PF,29.0,PHI,81.0,0.0,2046.0,407.0,755.0,...,14.1,20.8,5.4,3.9,9.2,0.217,2.8,2.2,5.0,3.6
448,1981-82,Bobby Jones,PF,30.0,PHI,76.0,73.0,2181.0,416.0,737.0,...,14.1,19.6,4.8,3.4,8.2,0.181,2.0,2.2,4.2,3.4
756,1982-83,Bobby Jones,PF,31.0,PHI,74.0,0.0,1749.0,250.0,460.0,...,16.5,15.0,3.3,3.1,6.4,0.175,1.2,3.3,4.5,2.8
1067,1983-84,Bobby Jones,PF,32.0,PHI,75.0,0.0,1761.0,226.0,432.0,...,16.1,14.7,2.8,3.2,6.0,0.163,0.6,3.8,4.4,2.8
1377,1984-85,Bobby Jones,PF,33.0,PHI,80.0,8.0,1633.0,207.0,385.0,...,19.7,14.7,2.8,2.0,4.8,0.142,1.1,1.9,3.0,2.1
1698,1985-86,Bobby Jones,PF,34.0,PHI,70.0,42.0,1519.0,189.0,338.0,...,18.3,13.0,1.9,1.4,3.2,0.102,-0.6,0.8,0.2,0.9
10354,2006-07,Bobby Jones,SF,23.0,PHI,44.0,5.0,336.0,43.0,93.0,...,13.9,17.7,0.0,0.3,0.3,0.045,-2.8,-1.3,-4.1,-0.2
10817,2007-08,Bobby Jones,C-SF,24.0,TOT,47.0,2.0,531.0,60.0,140.0,...,14.7,15.9,0.4,0.4,0.8,0.07,-1.7,-1.5,-3.1,-0.2


In [43]:
# Continue auditing dual position players
# Look who's a PG-SF

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
9115,2003-04,Jalen Rose,PG-SF,31.0,TOT,66.0,64.0,2497.0,383.0,952.0,...,16.5,24.1,-0.7,2.2,1.5,0.029,-1.2,-0.9,-2.1,-0.1


In [56]:
# Check what is going on with rows with Pos == '0'
df_all[df_all['Pos'] == 0]

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
20,1980-81,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,1981-82,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
639,1982-83,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
954,1983-84,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1264,1984-85,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1582,1985-86,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1905,1986-87,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2240,1987-88,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2572,1988-89,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2925,1989-90,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
df_dual_pos = df_all['Pos'] == 'PG-SF'

In [47]:
df_dual_pos
df_all[df_dual_pos]

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
9115,2003-04,Jalen Rose,PG-SF,31.0,TOT,66.0,64.0,2497.0,383.0,952.0,...,16.5,24.1,-0.7,2.2,1.5,0.029,-1.2,-0.9,-2.1,-0.1


In [49]:
for pos in df_all['Pos']:
    print(pos)

C
SF
C
PF
PG
C
PF
SF
SF
SG
PF
C
PG
PF
SG
SG
SG
SG
SG
PG
0
PF
SG
SF
SF
PG
SF
SG
C
C
PF
PG
C
SG
SF
PG
SG
PF
SF
C
SF
SG
C
PF
PG
SG
PF
SG
SF
SG
SG
SF
SG
C
C
PG
C
C
C
SF
SF
PG
PG
PF
SF
SG
C
PF
C
C
SF
C
SF
PG
C
SG-SF
SG
SG
PG
C
C
C
SF
SF
PG
C
SG
SF
PG
SG
SG
PG
SF
SF
SG
C
PG
C
PF
PG
PF
SG
PF
SG
SF
SF
PG
SF
SG
PF
PF
SG
C
PF
PF
PG
SG
SG
PG
PG
PG
SG
PG
PF
PF
C
PG
C
PG
SF
C
PG
SG
C
SF
PF
SG
SF
PF
SF
PF
PG
PF
C
PF
PF
PF
PG
SF
SF
PF
C
SF
SF
SG
SF
PF
SG
PF
SG
C
PF
C
C
C
PF
PF
C
C
PG
PG
PG
C
SG
PG
PF
SG
PG
PF
C
PG
SF
SF
SF
PF
PG
SG
PG
PF
PF
PG
C
SG
C
SF
SF
SF
C
SG
PG
PG
PG
C
SF
SG
SG
PG
SF
PG
SF
SF
C
SF
PF
C
SG
SF
PF
SG
PF
SG
SG
PF
PG
SG
SF
PG
C
PF
SG
SF
SF
SG
SG
SF
C
PF
PF
PF
PF
C
PG
C
PG
PF
SF
SF
PF
SF
SF
PF
SF
PF
PG
C
PG
PF
SG
SG
PG
PG
SF
PG
SF
PG
SG
SG
SG
C
PF
SG
SF
C
SF
SF
SF
C
PG
SF
PG
PF
PF
C
SF
SG
PG
SG
C
SG
PF
PF
SG
SF
SF
C
SG
PG
PF
SG
SF
SG
SG
PG
PG
C
C
SF
SG
PG
C
C
PF
SF
SF
SF
SG
PF
C
PF
SG
SG
SG
SF
SG
0
SG
PF
PG
PF
SG
SF
SF
SG
SF
SG
C
SF
PG
SG
PG
PG
PF
SF
C
SG
SG
C
PF
PF
PF
PG
C
SG
PG
PF


PG
SF
C
SF
C
SF
C
SG
SG
PF
SF
C
PG
SG
SF
PG
SG
PF
C
C
SF
SF
SF
PG
SG
PG
PG
PF
SG
PF
C
SF
C
C
SF
PF
PF
SF
PG
PF
C
PF
SF
C
PF
SG
C
PF
C
C
SF
PG
PG
C
PG
PF
SF
PG
PF
PF
SF
C
C
PG
PG
PF
PG
SG
PF
SG
PF
SF
SG
PG
SF
C
C
PF
PF
PG
PF
PF
PG
SF
SG
PG
PF
PF
C
SG
SF
SF
PG
PF
C
PG
PG
PF
PG
C
C
SG
C
PF
SG
PG
SF
C
SG
PF
C
PF
SF
PG
PF-SF
SF
PF
SG
SF
C
SG
SG
SF
PG
SF
PG
SF
PF
PG
C
PG
SF
SG
PF
PG
PF
SG
SF
SG
SF
PG
SG
SF
PF
PF
PF
PF
0
SF
PG
SG
SG
C
C
SF
SF
SG
SG
PG
PG
C
PF
C
C
PF
PG
SF
PG
C
SG-SF
PF
SF
PF
SF
C
C
PF
SG
PF
SG
PF
C
C
PF
C
SF
PF
PG
PG
SF
SF
PF
PG
PG
PG
PF
C
SF
C
PF
PF
PF
SG
SG
SF
C
PG
PF
SG
PG
SF
PG
C
C
PG
C
PG
SG
C
C
SG
SG
SG
C
SF
C
SG
SG
SF-SG
SF
SG
C
SG
SF
C
SG
C
SF
PF
PG
PG
PF
SF
SG
SF
PG
SG
PF
PG
PF
SG
PF
SF
C
C
PF
PG
PG
PF
PF
C
SG
PF
PG
PF
PF
SG
PG
PF
PF
SG
PG
PG
SG
SG
PF
SG
PG
PG
SF
SF
SF
PF
PF
SG
C
SF
SG
PG
SG
PG
SG
PF
C
PG
SF
SG
PG
PG
C
SG
C
SG
PF
PG
SF
C
SF
SG
SF
PF
C
C
PG
SG
C
PF
C
PF
C
C
PG
SG
C
PG
PG
SG
PF
SG
SF
SG
C
SF
PF
PF
SG
C
SF
SG
PF
C
PF
SG
PG
PF
PF
SG
PG
SF
C
PF
SF
PF
SF
S

SG
PF
PG
C
PF
SF
PG
PG
PG
PF
PG
PF
SF
SF
SG
C
PG
SF
PG
SF
SG
PF
PG
PG
SG
C
SF
SF
SF
SF
SG
PF
C
PF
C
PF
PF
SG
PG
C
C
SF
PF
SG
SG
PF
PG
SG
C
PF
PG
SG
C
SF
SG
PF
PF
PG
PF
C
PF
SG
SG
C
SG
SG
C
PF
PF
C
PG
SG
SF
PF
SG
C
PG
SG
C
PG
C
PG
SG
C
SG
SG
PF
SG
PG
SG
SG
PG
SF
SF
SF
SF
SF
PF
SF
C
C
PF
SF
PF
SF
PG
PF
C
SF
C
C
PF
PG
PF
C
SG
PF
PG
C
SF
PF
PG
SG
SF
PG
SF
C
SG
SG
PF
PG
SF
SF
C
SF
PF
SF
SG
PG
SG
PG
PF
SG
SG
PF
SF
PF
PG
PF
SF
SF
PF
SG
PF
SG
PF
SG
PF
SF
PG
PG
PG
C
PG
PG
SG
SG
PG
PF
PG
PG
SF
C
PG
SF
PF
SF
SG
PF
SG
SG
C
PF
PF
PG
SF
PG
SG
C
PG
C
SF
PF
C
SF
PG
C
C
SG
SG
C
PG
C
PF
PF
C
SF
SF
SF
SG
C
SG
PF
PG
PF
PG
SF
PG
SF
PF
C
SG
PG
SF
SF
SF
SF
C
PF
SF
C
SF
SG-SF
SG
SG
SF
PF
C
SG
SF
PF
SG
SF
C
SF
SG
C
SF
PG
SF
SF
C
PF
SG
PG
PF
SF
PF
SG
PF
C
PF
C
SG
SG
PG
C
PF
PG
SF
PG
SG
C
PG
SG
C
PF
PG
PF
SG
SG
SG
SG
SF
C
C
SF
PG
C
PG
PG
C
SG
C
PG
PF
PF
SG
SF
SF
PG
SG
SG
C
C
SG
SF
C
SF
PG
SF
PF
PF
SF
PF
C
SF
PF
SF
C
SF
C
C
C
SF
SF
C
SG
SF
C
PG
PG
PF
PF
SG
PF
PF
SG
PG
C
PG
C
SG
SG-SF
SG
SF
PF
C
SG
PG
C
PG
SG
PF
P

PF
SF
PG
C
C
SG
C
C
PF
SF
SF
PG
SF
PG
C
C
PG
SG
PF
PF
C
SF
SF
PF
PF
PG
PG
PF
PF
SG
SG
SG
PF
C
SF
PG
C
PG
C
SG
SF
PG
PG
PF
SG
C
SG-PG
PF
C
C
SF
SF
C
C
PF
SG
PF
SG
SG
PG
C
SG
PG
SF-SG
SF
C
PG
SF
PG
PG
PG
SF
PG
PF-C
0
PG
PG
SF
PG
SG
SG
C
SG
PF
SG
SF
PG
PG
C
PG
PG
C
SG
SG
C
C
PF
C
PF
SF
SF
PG
PF
C
PF
PF
PG
C
SF
SG
SF
PF
PF
PG
PG
SG
SG
SF
PG
SF
C
SF
PF
C
C
PF
SG
SG
PG
SF-SG
PG
C
PG
C
SG
SG
SG
PG
PG
C
C
C
C
C
C
PG
SG
PF
C
SF
C
C
PG
SF
SG
PF
PG
C
PF
SG
C
SG
SG
SF
SG
PG
SF
C
C
SG
C
PG
C
PG
PF
SF
SF
SG
C
C
PG
C
C
PF
SG
SG
PF
SG
PG
PF
PG
C
C
SF
C
SG
PG
PF
PF
C
PG
SG
PF
PF
PF
SF
PG
SG
SG
SG
SG
PG
PF
SG
C
SG
SF
PF
PF
PF
SF
SG
C
C
SF
SF
SF
SG
PF
PG
SG
C
PG
PF
PF
SG
SF
C
PF
C
SF
PG
SG
PF
PG-SG
SG
PF
SF
PF
PG
SG
SF
PF
PF
SG
C
SG
C
C
PG
PG
SG
SG
SF
SF
C
SF
SG-SF
C
SF
PG
PF
PG
PF
SF
SF
PG
SF
PG
SG
C
SF
SF
SG
PG
SF
SG
SF
SF
SF
C
SF
C
SF
PG
PF
SG
PG
PG
SF
PF
SF
SG
PF
PF
C
SG
SF
PG
PG
PG
PG
SF
C
SG
C
PG
PF
PF
PF
PG
PF
PG
SF
PF
PF
C
PF
SF
PG
SG
PG
PF
SF
C
SF
C
PG
PF
C
SF
SG
C
SG
C
C-PF
PF
C-PF
PF
SF
SG
C
C

PF
PG
SG
PF
SG
SF
SG
PF
SF
SF
PG
SG
C
C
PF
PG
PG
PG
C
SF
C
C
SF
SF
PF
SG
SG
SG
PF
PG
SG
PF
C
SF
PF
PF
SG
SG
C
SG
C
SG
PG
PF
C
C
SF
PG
C
PG
PF
PF
SG
C
C
PF
C
PF
PG
SG
PG
C
SF
PF
PF
C
SF
C
C
PF
SF
SF
C
PG
PG
SF
PG
SF
PF
PG
SF
C
C
C
SF
SG
C
SF
PG
PG
SF
C
SF
PF
PF
SG
SG
PF
SG
SF
SG
PG
PG
PG
PG
SF
PG
SF
C
PF
PF
PF
SG
PG
C
PG
PG
SG
SF
SF
PF
C
PG
PF
C
PG
SG
PF
PG
PF
PG
PF
C
SF
SG
SF
C
SG
PF
SG
SF
PG
PG
SG
C
PG
C
PF
PF
PF
C
PF
SG
SG
PF
SG
PG
SF
C
SF
SF
C
SF
C
SG
PG
C
SG
PF
PF
C
SG
SG
SG
PG
PG
C
SF
PF-SF
PF
C
PG
PG
SF
PF
PG
PG
PF
C
C
SF
PG
PF
SG
C
PG
SF
SG
SF
C
SF
C
SG
SF
C
SF
SF
SG
SF
PF
PF
PF
SF
PG
C
PF
C
SG
SG
SF
PF
C
SF
SG-SF
PF
PF
C
SF
PF
C
0
PG
C
PF
PF
PG
PG
PF
SF
SF
C
SG
PF
SF
SF
PG
C
SG
SG
SF
PG
SG
PG
C
SG
C
PF
PG
C
PG
SG
C
PF
PF
PF
C
PG
C
SF
SF
PG
SG
C
SG
SG
SF
SG
SF
SF
PG
PG
PF
SF
SG
SF
SF
PG
C
PF
SF
PF
SF
PG
C
PG
C
PG
PG
SG
PF
C
SG
SG
SF
PF
SG
C
PG
C
SF
PF
C-PF
PF
SF
SG
SF
SF
PG
SG
PF
C
PG
PG
SG
PG
C
SF
PG
C
SF
SF
SF
SG
SG
SG
PF
PF
SG
C
PF
PF
PG
SF
SG-PG
SG
SG
SG
PF
SF
SG-SF
C
PG
C
P

In [50]:
df

Pos
0        1
C        1
C-PF     1
C-SF     1
PF       1
PF-C     1
PF-SF    1
PG       1
PG-SF    1
PG-SG    1
SF       1
SF-PF    1
SF-SG    1
SG       1
SG-PF    1
SG-PG    1
SG-SF    1
Name: Pos, dtype: int64

In [51]:
df = df_all[df_all['Pos'] == 'PG-SF'][['Player', 'Pos']]

In [52]:
df

Unnamed: 0,Player,Pos
9115,Jalen Rose,PG-SF


In [53]:
# Create a DataFrame with top 25 single season scorers 
df_top_25_scorers = df_all.sort_values('PTS_per_G', ascending = False).head(n=25)

# Create a DataFrame with top 50 single season scorers 
df_top_50_scorers = df_all.sort_values('PTS_per_G', ascending = False).head(n=50)

In [54]:
# Write to CSV files and DONE!
#df_all.to_csv("bref_1981_2017_player_data.csv", encoding = 'utf-8', index = False)

In [55]:
#df_top_50_scorers.to_csv("bref_1981_2017_top_50_season_scorers.csv", encoding = "utf-8", index = False)