In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import sys

In [2]:
# Create url templates for each kind of stats
per_g_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_per_game.html"
adv_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_advanced.html"
tot_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_totals.html"
per_36m_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_minute.html"
per_100p_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_poss.html"

# Put all the URL templates into a list
url_template_list = [per_g_url_template, adv_url_template, tot_url_template, 
                     per_36m_url_template,]

In [3]:
# Ask user to input start and end years
# Also checks to see if entry is a number
try:
    user_start_year = int(input("Enter start year in YYYY format: "))
except:
    print('Enter a valid 4 digit year.')
    
try:
    user_end_year = int(input("Enter end year in YYYY format: "))
except:
    print('Enter a valid 4 digit year.')

Enter start year in YYYY format: 1981
Enter end year in YYYY format: 2017


In [4]:
# Check if end year is after start year
if user_end_year >= user_start_year:
    print('Year range accepted.')
else:
    print('Year range is unacceptable.')

# Check if formats are in proper YYYY format
def check_year(user_input_year):
    if user_input_year > 999 and user_input_year < 10000: # Then check if it's 4 digits
        print('Year format accepted.')
    else:
        print('Enter a valid 4 digit year.')
        sys.exit()
    
check_year(user_start_year)
check_year(user_end_year)

Year range accepted.
Year is in numbers.
Year format accepted.
Year is in numbers.
Year format accepted.


In [5]:
# Create empty lists to store data before appending to Dataframe
column_headers = []
player_data = []
# Create empty DataFrame for following functions to fill
df = pd.DataFrame()

In [6]:
# Empty DataFrames for each set of pages
df_adv = pd.DataFrame()
df_per_g = pd.DataFrame()
df_tot = pd.DataFrame()
df_per_36m = pd.DataFrame()
#df_per_100p = pd.DataFrame

# Create df_list of DataFrames for looping
df_list = [df_per_g, df_adv, df_tot, df_per_36m]

In [7]:
# Get column headers from each page
# Assigns a new list of column headers each time this is called
def get_column_headers(soup):
    headers = []
    for th in soup.find('tr').findAll('th'):
        #print th.getText()
        headers.append(th.getText())
    #print headers # this line was for a bug check
    # Assign global variable to headers gathered by function
    return headers    
    #column_headers = [th.getText() for th in soup.find('tr').findAll('th')]

In [8]:
# old function that's a mess
def get_player_data(soup):
    temp_player_data = []
    for i in range(len(soup.findAll('tr')[1:])):
        # temp list to store player data
        player_row = []
        
        # Loop through 'td' tags to extract player data
        for td in soup.findAll('tr')[1:][i].findAll('td'):
            player_row.append(td.getText())
        
        # Append data to a list    
        temp_player_data.append(player_row)
        
        # Replace global variable with gathered player data
    print(temp_player_data)
    player_data = temp_player_data

In [9]:
# Function to get player data from each page
def get_player_data(soup):
    # Temporary list within function to store data
    temp_player_data = []
    
    data_rows = soup.findAll('tr')[1:] # skip first row
    for i in range(len(data_rows)): # loop through each table row
        player_row = [] # empty list for each player row
        for td in data_rows[i].findAll('td'):
            player_row.append(td.getText()) # append separate data points
        temp_player_data.append(player_row) # append player row data
    return temp_player_data

In [10]:
def scrape_page(url):
    r = requests.get(url) # get the url
    soup = BeautifulSoup(r.text, 'html.parser') # Create BS object
    
    # call function to get column headers
    column_headers = get_column_headers(soup)
    
    # call function to get player data
    player_data = get_player_data(soup)
    
    # input data to DataFrame
    # Skip first value of column headers, 'Rk'
    df = pd.DataFrame(player_data, columns = column_headers[1:])
    
    return df

In [11]:
def get_season(input_year):
    first_yr = input_year - 1
    season = str(first_yr) + "-" + str(input_year)[2:]
    return season

In [12]:
# This function drops empty rows an columns, drops duplicates, and changes
# % character in columns
def gen_cleaning(df):
    # Convert values to numeric values first
    df = df.apply(pd.to_numeric, errors = 'ignore')
    
    # Drop columns with no data
    df.dropna(axis = 1, how = "all", inplace = True)
    
    # Drop rows with no data
    df.dropna(axis = 0, how = "all", inplace = True)
    
    # Remove duplicates player inputs; ie. players who were traded
    # I only kept the TOT per game season values
    #df.drop_duplicates(["Player"], keep = "first", inplace = True)
    
    # Change % symbol to _perc
    df.columns = df.columns.str.replace('%', '_perc')
    
    return df

In [13]:
# This function scrapes player data from multiple pages by start and end years
def scrape_pages(url_template, start_year, end_year, output_df):
    count = 0 
    for year in range(start_year, (end_year+1)):
        url = url_template.format(year = year) # grab URL per year
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html5lib') # Create soup item
        
        # Check to grab column headers
        if count == 0: # only append column headers once
            columns = get_column_headers(soup)
            count += 1
            
        # grab player data for each year
        player_data = get_player_data(soup)
        
        # Create temporary DataFrame first for each year
        # Duplicates are removed before putting into bigger DataFrame
        # These duplicates come from players playing on multiple teams in one season
        # This script only keeps the TOT output as Tm
        year_df = pd.DataFrame(player_data, columns = columns[1:])
        year_df.drop_duplicates(['Player'], keep = 'first', inplace = True)
        year_df.insert(0, 'Season', get_season(year)) # insert season year column
        
        # Append to big DataFrame for detailed cleaning
        output_df = output_df.append(year_df, ignore_index = True)
        
    # Do common, general cleaning practices
    output_df = gen_cleaning(output_df)
        
    return output_df

In [14]:
# This bunch of code is just for me to check things as I go

#url = "https://www.basketball-reference.com/leagues/NBA_2006_per_game.html"
#r = requests.get(url)
#soup = BeautifulSoup(r.text, 'html.parser')
#column_headers = get_column_headers(soup)
#player_data = get_player_data(soup)
#df_test = pd.DataFrame(player_data, columns = column_headers[1:])
#df_test = gen_cleaning(df_test)

In [15]:
#df_test.sort_values('PS/G', ascending = False)

In [16]:
#df_test[df_test['Player'] == 'Kobe Bryant']

In [17]:
# Fill each DataFrame with data scraped from their respective pages
# Each print statement is a check for if any pages or functions give issues
# Added timer to check how long this was taking

start = time.time()

df_per_g = scrape_pages(per_g_url_template, user_start_year, user_end_year, df_per_g)
print("Finished per g")
df_adv = scrape_pages(adv_url_template, user_start_year, user_end_year, df_adv)
print("Finished adv")
df_tot = scrape_pages(tot_url_template, user_start_year, user_end_year, df_tot)
print("Finished tots")
df_per_36m = scrape_pages(per_36m_url_template, user_start_year, user_end_year, df_per_36m)
print("Finished per 36m")

end = time.time()
print("Time elapsed :" +str((end - start) / 60) + " minutes")

Finished per g
Finished adv
Finished tots
Finished per 36m
Time elapsed :11.03308045466741 minutes


In [18]:
# Check all column names to see what needs to be cleaned

print("totals")
print(list(df_tot))
print("per game")
print(list(df_per_g))
print("per 36 minutes")
print(list(df_per_36m))
print("advanced")
print(list(df_adv))

totals
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'eFG_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
per game
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'eFG_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']
per 36 minutes
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG_perc', '3P', '3PA', '3P_perc', '2P', '2PA', '2P_perc', 'FT', 'FTA', 'FT_perc', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
advanced
['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS_perc', '3PAr', 'FTr', 'ORB_perc', 'DRB_perc', 'TRB_perc', 'AST_perc', 'STL_perc', 'BLK_perc', 'TOV_perc', 'USG_perc', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']


In [19]:
# Label columns properly by adding "_tot" to the end of some column values
df_tot.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]] = \
[df_tot.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]][col] + "_tot" for col in range(9)]

df_tot.columns.values[21:30] = [df_tot.columns.values[21:30][col] + \
"_tot" for col in range(9)]

In [20]:
# Check column titles again
list(df_tot)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot']

In [21]:
# drop _perc columns from per_g and per_36m
# Never mind, drop duplicates later on
# Add _per_g and _per_36m to column values
# Add _per_G to some values in df_per_g
df_per_g.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]] = \
[df_per_g.columns.values[[7, 8 , 9, 11, 12, 14, 15, 18, 19]][col] + "_per_G" for col in range(9)]

df_per_g.columns.values[21:29] = [df_per_g.columns.values[21:30][col] + \
"_per_G" for col in range(8)]

# Rename PS/G to PTS_per_G
df_per_g.rename(columns={'PS/G': 'PTS_per_G'}, inplace = True)

In [22]:
df_per_36m.columns.values[[7, 8, 9, 11, 12, 14, 15, 18, 19]]

array(['MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FTA', 'FT_perc'], dtype=object)

In [23]:
# Check if proper values were changed
list(df_per_g)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 'FG_perc',
 '3P_per_G',
 '3PA_per_G',
 '3P_perc',
 '2P_per_G',
 '2PA_per_G',
 '2P_perc',
 'eFG_perc',
 'FT_per_G',
 'FTA_per_G',
 'FT_perc',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G']

In [24]:
df_per_36m.columns.values[[8, 9, 11, 12, 14, 15, 17, 18]] = \
[df_per_36m.columns.values[[8, 9, 11, 12, 14, 15, 17, 18]][col] + "_per_36m" \
for col in range(8)]

df_per_36m.columns.values[20:30] = [df_per_36m.columns.values[20:30][col] + "_per_36m" \
                                   for col in range(9)]

In [25]:
# Check columns were changed properly
list(df_per_36m)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG_per_36m',
 'FGA_per_36m',
 'FG_perc',
 '3P_per_36m',
 '3PA_per_36m',
 '3P_perc',
 '2P_per_36m',
 '2PA_per_36m',
 '2P_perc',
 'FT_per_36m',
 'FTA_per_36m',
 'FT_perc',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m']

In [26]:
# Find where '\xa0' columns are for removal
print(df_adv.columns[-5])
print(df_adv.columns[19])

WS/48
OWS


In [27]:
# Drop '\xa0' columns, last one first
#df_adv.drop(df_adv.columns[-5], axis = 1, inplace = True)
#df_adv.drop(df_adv.columns[19], axis = 1, inplace = True)

In [28]:
list(df_adv)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [29]:
df_adv.rename(columns = {'WS/48' : 'WS_per_48'}, inplace = True)

In [30]:
# Check to see if columns were dropped properly
list(df_adv)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 'WS',
 'WS_per_48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [31]:
# Merge dataframes later on season, player name, and team
# Order of merges: tots, per_g, per_36m, adv
# DFs: df_tot, df_per_g, df_per_36m, df_adv
# Common things: Season, Player, Pos, Age, Tm, G

In [32]:
df_all = pd.merge(df_tot, df_per_g, how = "left", 
                 on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'FT_perc',
                      '3P_perc', '2P_perc', 'FG_perc', 'eFG_perc'])

In [33]:
df_all = pd.merge(df_all, df_per_36m, how = "left",
                 on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'FT_perc',
                      '3P_perc', '2P_perc', 'FG_perc'])

In [34]:
df_all = pd.merge(df_all, df_adv, how = "left",
                on = ['Season', 'Player', 'Pos', 'Age', 'Tm', 'G'])

In [35]:
# Check columns to make sure they're all right
list(df_all)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'MP_x',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'MP_y',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',

In [36]:
# Try to drop duplicate MP columns
list(df_all.drop(['MP_x', 'MP_y'], axis = 1))

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 

In [37]:
df_all.drop(['MP_x', 'MP_y'], axis = 1, inplace = True)

In [38]:
# Final check of columns
list(df_all)

['Season',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP_tot',
 'FG_tot',
 'FGA_tot',
 'FG_perc',
 '3P_tot',
 '3PA_tot',
 '3P_perc',
 '2P_tot',
 '2PA_tot',
 '2P_perc',
 'eFG_perc',
 'FT_tot',
 'FTA_tot',
 'FT_perc',
 'ORB_tot',
 'DRB_tot',
 'TRB_tot',
 'AST_tot',
 'STL_tot',
 'BLK_tot',
 'TOV_tot',
 'PF_tot',
 'PTS_tot',
 'MP_per_G',
 'FG_per_G',
 'FGA_per_G',
 '3P_per_G',
 '3PA_per_G',
 '2P_per_G',
 '2PA_per_G',
 'FT_per_G',
 'FTA_per_G',
 'ORB_per_G',
 'DRB_per_G',
 'TRB_per_G',
 'AST_per_G',
 'STL_per_G',
 'BLK_per_G',
 'TOV_per_G',
 'PF_per_G',
 'PTS_per_G',
 'FG_per_36m',
 'FGA_per_36m',
 '3P_per_36m',
 '3PA_per_36m',
 '2P_per_36m',
 '2PA_per_36m',
 'FT_per_36m',
 'FTA_per_36m',
 'ORB_per_36m',
 'DRB_per_36m',
 'TRB_per_36m',
 'AST_per_36m',
 'STL_per_36m',
 'BLK_per_36m',
 'TOV_per_36m',
 'PF_per_36m',
 'PTS_per_36m',
 'PER',
 'TS_perc',
 '3PAr',
 'FTr',
 'ORB_perc',
 'DRB_perc',
 'TRB_perc',
 'AST_perc',
 'STL_perc',
 'BLK_perc',
 'TOV_perc',
 'USG_perc',
 'OWS',
 'DWS',
 

In [39]:
# First check length of dataframe
print(len(df_all))

15271


In [40]:
# Fill Null values with 0
df_all.fillna(0, inplace = True)

In [41]:
# Address ambiguous positions and combination positions
df = df_all.groupby(['Pos'])['Pos'].nunique()
df

Pos
0        1
C        1
C-PF     1
C-SF     1
PF       1
PF-C     1
PF-SF    1
PG       1
PG-SF    1
PG-SG    1
SF       1
SF-PF    1
SF-SG    1
SG       1
SG-PF    1
SG-PG    1
SG-SF    1
Name: Pos, dtype: int64

In [42]:
# Remove where 'Pos' value is 0
df_all = df_all[df_all['Pos'] != 0]

# Then check df_all length again
print(len(df_all))

15234


In [43]:
# I think the PG-SF and C-SF positions are mistakes
# Check the value to see the player
df_all[df_all['Pos'] == 'C-SF']

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
10817,2007-08,Bobby Jones,C-SF,24.0,TOT,47.0,2.0,531.0,60.0,140.0,...,14.7,15.9,0.4,0.4,0.8,0.07,-1.7,-1.5,-3.1,-0.2


In [44]:
# Check Bobby Jones' actual, commonly played position
df_all[df_all['Player'] == 'Bobby Jones']

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
142,1980-81,Bobby Jones,PF,29.0,PHI,81.0,0.0,2046.0,407.0,755.0,...,14.1,20.8,5.4,3.9,9.2,0.217,2.8,2.2,5.0,3.6
448,1981-82,Bobby Jones,PF,30.0,PHI,76.0,73.0,2181.0,416.0,737.0,...,14.1,19.6,4.8,3.4,8.2,0.181,2.0,2.2,4.2,3.4
756,1982-83,Bobby Jones,PF,31.0,PHI,74.0,0.0,1749.0,250.0,460.0,...,16.5,15.0,3.3,3.1,6.4,0.175,1.2,3.3,4.5,2.8
1067,1983-84,Bobby Jones,PF,32.0,PHI,75.0,0.0,1761.0,226.0,432.0,...,16.1,14.7,2.8,3.2,6.0,0.163,0.6,3.8,4.4,2.8
1377,1984-85,Bobby Jones,PF,33.0,PHI,80.0,8.0,1633.0,207.0,385.0,...,19.7,14.7,2.8,2.0,4.8,0.142,1.1,1.9,3.0,2.1
1698,1985-86,Bobby Jones,PF,34.0,PHI,70.0,42.0,1519.0,189.0,338.0,...,18.3,13.0,1.9,1.4,3.2,0.102,-0.6,0.8,0.2,0.9
10354,2006-07,Bobby Jones,SF,23.0,PHI,44.0,5.0,336.0,43.0,93.0,...,13.9,17.7,0.0,0.3,0.3,0.045,-2.8,-1.3,-4.1,-0.2
10817,2007-08,Bobby Jones,C-SF,24.0,TOT,47.0,2.0,531.0,60.0,140.0,...,14.7,15.9,0.4,0.4,0.8,0.07,-1.7,-1.5,-3.1,-0.2


In [45]:
# Create list of dual positions in DataFrame
# Create empty DataFrame to audit dual position values
column_names = list(df_all.columns.values)
dual_pos_rows = []
df_dual_pos = pd.DataFrame(columns = column_names)

In [46]:
# Gather all the dual positions by seeing which ones have a dash
for pos in df_all['Pos']:
    if "-" in pos:
        if pos not in dual_pos_rows:
            dual_pos_rows.append(pos)

In [47]:
# Append all dual position rows to a new DataFrame for auditing
for pos in dual_pos_rows:
    df_dual_pos = df_dual_pos.append(df_all[df_all['Pos'] == pos],
                                    ignore_index = True)

In [48]:
df_dual_pos
# It looks like all these players moved teams before
# Certain players have multiple positions or changed positions

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
0,1980-81,Terry Duerod,SG-SF,24.0,TOT,50.0,0.0,451.0,104.0,234.0,...,12.2,26.4,0.0,0.2,0.2,0.018,-1.6,-3.1,-4.7,-0.3
1,1990-91,Tony Brown,SG-SF,30.0,TOT,30.0,0.0,294.0,30.0,80.0,...,15.1,16.0,-0.1,0.2,0.2,0.026,-2.8,-1.3,-4.2,-0.2
2,1995-96,Michael Curry,SG-SF,27.0,TOT,46.0,1.0,783.0,73.0,161.0,...,11.3,12.9,0.8,0.8,1.6,0.100,-1.1,-0.1,-1.2,0.2
3,1996-97,Jim Jackson,SG-SF,26.0,TOT,77.0,77.0,2831.0,444.0,1029.0,...,15.1,21.6,2.8,1.6,4.4,0.075,0.9,-0.4,0.5,1.8
4,1997-98,Aaron McKie,SG-SF,25.0,TOT,81.0,32.0,1813.0,139.0,381.0,...,15.8,12.0,-0.7,2.0,1.4,0.036,-2.8,1.5,-1.3,0.3
5,1997-98,Jerry Stackhouse,SG-SF,23.0,TOT,79.0,37.0,2545.0,424.0,975.0,...,16.0,25.3,2.4,2.3,4.7,0.089,0.9,-0.5,0.4,1.5
6,1998-99,Rodrick Rhodes,SG-SF,25.0,TOT,13.0,1.0,156.0,13.0,52.0,...,24.1,24.5,-0.7,0.0,-0.7,-0.201,-8.6,-2.8,-11.4,-0.4
7,2002-03,Gordan Giricek,SG-SF,25.0,TOT,76.0,62.0,2148.0,350.0,803.0,...,14.1,21.5,0.9,0.7,1.6,0.036,-0.7,-2.3,-3.0,-0.6
8,2003-04,DeShawn Stevenson,SG-SF,22.0,TOT,80.0,78.0,2444.0,376.0,871.0,...,11.1,20.4,0.8,0.7,1.5,0.029,-1.6,-1.9,-3.5,-0.9
9,2004-05,Casey Jacobsen,SG-SF,23.0,TOT,84.0,1.0,1798.0,165.0,408.0,...,12.6,14.2,2.0,0.7,2.7,0.071,-0.7,-1.3,-2.0,0.0


In [49]:
df_dual_pos.groupby(['Player']).size().reset_index(name = 'Count').sort_values(['Count'], ascending = False).head(n=10)

Unnamed: 0,Player,Count
5,Allen Iverson*,3
25,Brian Skinner,2
143,Ricky Davis,2
109,Lonny Baxter,2
76,Jamie Feick,2
22,Bostjan Nachbar,2
84,Jeremy Richardson,2
105,Larry Hughes,2
171,Tony Brown,2
159,Stephen Jackson,2


In [50]:
# Check what is going on with some players with multiple positions
df_all[df_all['Player'] == 'Allen Iverson*']

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
5870,1996-97,Allen Iverson*,PG,21.0,PHI,76.0,74.0,3045.0,625.0,1504.0,...,16.2,28.9,3.1,1.0,4.1,0.065,3.4,-2.0,1.5,2.7
6320,1997-98,Allen Iverson*,PG,22.0,PHI,80.0,80.0,3150.0,649.0,1407.0,...,12.9,26.9,6.3,2.8,9.0,0.138,4.5,-0.7,3.8,4.6
6751,1998-99,Allen Iverson*,SG,23.0,PHI,48.0,48.0,1990.0,435.0,1056.0,...,11.7,32.6,4.3,2.9,7.2,0.173,4.9,-0.3,4.6,3.3
7201,1999-00,Allen Iverson*,SG,24.0,PHI,70.0,70.0,2853.0,729.0,1733.0,...,10.3,34.4,3.3,3.6,6.9,0.116,3.4,-0.8,2.6,3.3
7640,2000-01,Allen Iverson*,SG,25.0,PHI,71.0,71.0,2979.0,762.0,1813.0,...,10.0,35.9,7.3,4.5,11.8,0.19,5.0,-0.1,4.8,5.1
8087,2001-02,Allen Iverson*,SG,26.0,PHI,60.0,59.0,2622.0,665.0,1669.0,...,11.0,37.8,2.6,4.3,6.9,0.126,3.7,0.2,3.8,3.9
8518,2002-03,Allen Iverson*,SG,27.0,PHI,82.0,82.0,3485.0,804.0,1940.0,...,11.2,32.9,5.0,4.2,9.2,0.127,3.1,-0.3,2.8,4.2
8961,2003-04,Allen Iverson*,SG,28.0,PHI,48.0,47.0,2040.0,435.0,1125.0,...,13.6,35.3,0.5,2.3,2.8,0.066,3.1,-0.6,2.5,2.3
9414,2004-05,Allen Iverson*,PG,29.0,PHI,75.0,75.0,3174.0,771.0,1818.0,...,13.7,35.0,5.3,3.7,9.0,0.136,4.8,-0.5,4.3,5.0
9871,2005-06,Allen Iverson*,PG,30.0,PHI,72.0,72.0,3103.0,815.0,1822.0,...,10.2,35.8,8.8,1.8,10.6,0.165,6.0,-2.5,3.5,4.3


In [51]:
# Find most common position for this player
df_all[df_all['Player'] == 'Allen Iverson*']\
.groupby(['Pos']).size().reset_index(name = 'Count')\
.sort_values(['Count'], ascending = False).iloc[0][0]

'SG'

In [52]:
df_all[df_all['Player'] == 'Jim Jackson'].groupby(['Pos']).size().reset_index(name = 'Count').sort_values(['Count'], ascending = False).iloc[0][0]

'SF'

In [162]:
df_all[df_all['Player'] == 'Jim Jackson'].groupby(['Pos']).size().reset_index(name = 'Count').sort_values(['Count'], ascending = False).iloc[1][0]

'SG'

In [64]:
# Count of seasons played at most common position
df_all[df_all['Player'] == 'Allen Iverson*'].groupby(['Pos']).size().iloc[0]

4

In [77]:
for i, k in df_all[['Player', 'Pos']].head(n=2).iterrows():
    #print("this is for i")
    #print(i)
    print("this is for k")
    print(k.iloc[1])

this is for k
C
this is for k
SF


In [55]:
# Create a smaller, shuffled sample DataFrame to test cleaning function
df_test = df_all.sample(frac = 0.05).copy()
print(len(df_test))

762


In [56]:
df_test.head(n=10)

Unnamed: 0,Season,Player,Pos,Age,Tm,G,GS,MP_tot,FG_tot,FGA_tot,...,TOV_perc,USG_perc,OWS,DWS,WS,WS_per_48,OBPM,DBPM,BPM,VORP
1225,1983-84,Mitchell Wiggins,SG,24.0,CHI,82.0,40.0,2123.0,399.0,890.0,...,12.0,22.5,1.1,2.2,3.3,0.076,-0.1,0.1,0.0,1.1
2297,1987-88,Joe Dumars*,SG,24.0,DET,82.0,82.0,2732.0,453.0,960.0,...,13.6,19.3,3.2,2.7,5.9,0.103,0.2,-0.2,0.0,1.4
3426,1990-91,Scott Haffner,PG,24.0,CHH,7.0,0.0,50.0,8.0,21.0,...,15.5,22.1,-0.1,0.0,-0.1,-0.058,-3.9,-1.4,-5.3,0.0
9017,2003-04,Donyell Marshall,PF,30.0,TOT,82.0,74.0,2988.0,470.0,1020.0,...,9.6,19.5,4.9,5.5,10.4,0.167,1.2,2.0,3.2,3.9
4202,1992-93,Jay Guidinger,C,23.0,CLE,32.0,5.0,215.0,19.0,55.0,...,13.2,15.7,-0.1,0.4,0.3,0.067,-3.7,2.0,-1.7,0.0
632,1982-83,Dave Batton,C,26.0,WSB,54.0,5.0,558.0,85.0,191.0,...,12.4,16.8,-0.2,0.9,0.7,0.062,-3.7,0.8,-2.9,-0.1
5132,1994-95,Sam Perkins,C,33.0,SEA,82.0,37.0,2356.0,346.0,742.0,...,8.2,17.2,5.9,2.6,8.5,0.173,2.8,-0.2,2.6,2.7
2779,1988-89,Hakeem Olajuwon*,C,26.0,HOU,82.0,82.0,3024.0,790.0,1556.0,...,13.0,28.2,4.6,7.8,12.4,0.197,1.3,4.1,5.4,5.6
9192,2003-04,Mike Wilks,PG,24.0,HOU,26.0,0.0,145.0,17.0,36.0,...,14.5,15.9,0.4,0.2,0.5,0.169,0.0,-1.9,-1.9,0.0
11529,2009-10,Raja Bell,SG,33.0,TOT,6.0,5.0,180.0,28.0,61.0,...,7.4,17.3,0.3,0.2,0.5,0.131,0.9,-1.0,-0.1,0.1


In [57]:
df_test.iloc[2][1]

'Scott Haffner'

In [58]:
# This format will change positions
#df_test.loc[df_test['Player'] == 'Glen Davis', 'Pos'] = 'New Pos'

In [174]:
most_common_pos_test = {}

# Use dictionary as key to replace 'Pos' values in the big DataFrame
most_common_pos = {}

# Check if a player has more than one common position

def grab_most_common_pos(df, pos_dict):
    # Loop through a dataframe and assign names and most common positions to a dictionary
    for index, row in df[['Player', 'Pos']].iterrows():
        player_name = row.iloc[0] # Assign player name to variable
        pos = df[df['Player'] == player_name].groupby('Pos').size()\
        .reset_index(name = 'Count')\
        .sort_values(['Count'], ascending = False)\
        .iloc[0][0] # Assign position to variable
        #try:
        #    second_pos = df[df['Player'] == player_name].groupby('Pos').size()\
        #    .reset_index(name = 'Count')\
        #    .sort_values(['Count'], ascending = False)\
        #    .iloc[1][0] # Second most played position
        #except:
        #    continue
        
        #if pos != second_pos:
        #    continue
        #elif pos == second_pos:
        #    print(player_name)
        #    print(pos)
        #    print(second_pos)
        if player_name not in pos_dict.keys(): # Check if name exists first
            pos_dict[player_name] = pos
        else:
            continue
    
    return pos_dict

def clean_pos(df, pos_dict):
    # Loop through rows to check players' positions
    grab_most_common_pos(df, pos_dict)
    # Check if it is indeed their most common position
    
    # If not, replace it with the most common one
    
    # Return DataFrame with cleaned positions

In [175]:
clean_pos(df_test, most_common_pos_test)

In [176]:
most_common_pos_test

{'A.J. English': 'SG',
 'A.J. Price': 'PG',
 'Aaron Harrison': 'SG',
 'Aaron McKie': 'SG',
 'Aaron Williams': 'C',
 'Adonal Foyle': 'C',
 'Al Jefferson': 'C',
 'Al Thornton': 'SF',
 'Alan Anderson': 'SF',
 'Alex Len': 'C',
 'Allen Iverson*': 'PG',
 'Allen Leavell': 'PG',
 'Alonzo Mourning*': 'C',
 'Alton Lister': 'C',
 'Alvan Adams': 'C',
 'Alvin Robertson': 'SG',
 'Amir Johnson': 'SF',
 'Andray Blatche': 'PF',
 'Andre Brown': 'PF',
 'Andre Iguodala': 'SF',
 'Andre Miller': 'PG',
 'Andrei Kirilenko': 'SF',
 'Andrew Bogut': 'C',
 'Andrew DeClercq': 'C',
 'Andrew Goudelock': 'SG',
 'Anfernee Hardaway': 'SG',
 'Anthony Frederick': 'SF',
 'Anthony Johnson': 'PG',
 'Antoine Walker': 'PF',
 'Antonio Daniels': 'SG',
 'Antonio Davis': 'PF',
 'Antonio McDyess': 'PF',
 'Archie Goodwin': 'PG',
 'Armen Gilliam': 'PF',
 'Armon Johnson': 'PG',
 'Armond Hill': 'PG',
 'Askia Jones': 'SG',
 'Austin Rivers': 'SG',
 'Avery Johnson': 'PG',
 'Baron Davis': 'PG',
 'Bart Kofoed': 'SG',
 'Ben McLemore': 'SG',

In [119]:
df_all.iloc[0][2]

'C'

In [112]:
print(most_common_pos_test.keys())

dict_keys(['Name', 'Mitchell Wiggins', 'Joe Dumars*', 'Scott Haffner', 'Donyell Marshall', 'Jay Guidinger', 'Dave Batton', 'Sam Perkins', 'Hakeem Olajuwon*', 'Mike Wilks', 'Raja Bell', 'Andray Blatche', 'Reggie Williams', 'Nenad Krstic', 'Jerry Sichting', 'Tellis Frank', 'Rashard Lewis', 'Terrence Rencher', 'Doug Smith', 'Askia Jones', 'Jarvis Hayes', 'Magic Johnson*', 'Elden Campbell', 'Joey Graham', 'Mike Brown', 'Bruno Sundov', 'Keith Owens', 'Mike Mitchell', 'C.J. Watson', 'Reggie Hanson', 'A.J. English', 'Lorenzen Wright', 'Mike Woodson', 'Chasson Randle', 'Matt Bullard', 'Lester Conner', 'Bart Kofoed', 'Jamaal Tinsley', 'Rafer Alston', 'Cory Jefferson', 'Sam Vincent', 'Cal Bowdler', 'Andre Brown', 'Rodney Stuckey', 'Jeff McInnis', 'Terrence Williams', 'Scotty Hopson', 'Yao Ming*', 'Chris Whitney', 'Andre Iguodala', 'Terrence Jones', 'Adonal Foyle', 'Brook Steppe', 'Joe Kopicki', 'Bobby Phills', 'Bill Hanzlik', 'Terence Stansbury', 'Marc Jackson', 'Chris Anstey', 'Litterial Green'

In [115]:
for index, row in df_test[['Player', 'Pos']].iterrows():
    player_name = row.iloc[0]
    #print(player_name)
    if player_name not in most_common_pos_test.keys():
        pos = df_test[df_test['Player'] == player_name].groupby('Pos').size()\
        .reset_index(name = 'Count')\
        .sort_values(['Count'], ascending = False).iloc[0][0]
        #print(pos)
        most_common_pos_test[player_name] = pos
    else:
        continue
        

In [116]:
most_common_pos_test

{'A.J. English': 'SG',
 'A.J. Price': 'PG',
 'Aaron Harrison': 'SG',
 'Aaron McKie': 'SG',
 'Aaron Williams': 'C',
 'Adonal Foyle': 'C',
 'Al Jefferson': 'C',
 'Al Thornton': 'SF',
 'Alan Anderson': 'SF',
 'Alex Len': 'C',
 'Allen Iverson*': 'PG',
 'Allen Leavell': 'PG',
 'Alonzo Mourning*': 'C',
 'Alton Lister': 'C',
 'Alvan Adams': 'C',
 'Alvin Robertson': 'SG',
 'Amir Johnson': 'SF',
 'Andray Blatche': 'PF',
 'Andre Brown': 'PF',
 'Andre Iguodala': 'SF',
 'Andre Miller': 'PG',
 'Andrei Kirilenko': 'SF',
 'Andrew Bogut': 'C',
 'Andrew DeClercq': 'C',
 'Andrew Goudelock': 'SG',
 'Anfernee Hardaway': 'SG',
 'Anthony Frederick': 'SF',
 'Anthony Johnson': 'PG',
 'Antoine Walker': 'PF',
 'Antonio Daniels': 'SG',
 'Antonio Davis': 'PF',
 'Antonio McDyess': 'PF',
 'Archie Goodwin': 'PG',
 'Armen Gilliam': 'PF',
 'Armon Johnson': 'PG',
 'Armond Hill': 'PG',
 'Askia Jones': 'SG',
 'Austin Rivers': 'SG',
 'Avery Johnson': 'PG',
 'Baron Davis': 'PG',
 'Bart Kofoed': 'SG',
 'Ben McLemore': 'SG',

In [69]:
df_test[['Player', 'Pos']].head(n=2)

Unnamed: 0,Player,Pos
1225,Mitchell Wiggins,SG
2297,Joe Dumars*,SG


In [None]:
# Create a new column 'Rounded Pos' to assign to players' most commonly played position
# PG, SG, SF, PF, C are 1-5, respectively
single_pos = ['PG', 'SG', 'SF', 'PF', 'C']
rounded_pos = ['1', '2', '3', '4', '5']

In [60]:
# Create a DataFrame with top 25 single season scorers 
#df_top_25_scorers = df_all.sort_values('PTS_per_G', ascending = False).head(n=25)

# Create a DataFrame with top 50 single season scorers 
#df_top_50_scorers = df_all.sort_values('PTS_per_G', ascending = False).head(n=50)

In [61]:
# Write to CSV files and DONE!
#df_all.to_csv("bref_1981_2017_player_data.csv", encoding = 'utf-8', index = False)

In [62]:
#df_top_50_scorers.to_csv("bref_1981_2017_top_50_season_scorers.csv", encoding = "utf-8", index = False)