In [1]:
__author__ = "Victor Xu"
__email__ = "victor.c.xu@gmail.com"
__website__ = "victorxu.me"

__copyright__ = "Copyright 2019, Victor Xu"

# Problem Definition

The goal of this analysis is to predict NBA player positions given performance data such the player shot count, shot location, and player defense etc... Each player in the league is assigned a position label such as point guard, shooting guard, and center.

### Why is predicting player position important?
If you are basketball fan, you probably realized that the player positions are inconsistent across data sources. For instance ESPN and the League even uses different position categories altogether.


| ESPN | NBA   |
|------|------|
|Center|Center|
|Point Guard|Guard|
|Shooting Guard|Guard|
|Small Forward|Forward|
|Power Forward|Forward|
|No Direct Translation|Guard-Forward|
|No Direct Translation|Center-Forward|

Though we will not be examining the difference between NBA and ESPN player labeling methodologies, never the less by looking at how ML algos approach classifying player into ESPN categories, we could gain insights into the structure of player labels. Specifically, allowing us to examine if player labels are well separated or not.

In [2]:
import re
import pandas as pd
import numpy as np

from src.espn_scraper import espn_player_scraper
from src.player_stat import get_player_stat, NoDataError

from sqlalchemy import create_engine, Integer
from tqdm import tqdm_notebook
from nba_py import player
from time import sleep

## Data Acqusition
#### Scrape ESPN for player name and position label /w our scraper

In [3]:
# Scrape ESPN pages
teams_overview_url = "http://www.espn.com/nba/players"
scraper = espn_player_scraper()
espn_player_list = scraper.scrape_all_players(teams_overview_url)

100%|██████████| 30/30 [02:59<00:00,  4.69s/it]


In [7]:
import re
s = 'Carsen EdwardsPG'

re.search(re.compile('PG|C|SG|SF|PW'), 'Carsen EdwardsPG')[0]

IndexError: no such group

In [17]:
espn_player_list.head()

Unnamed: 0,name,position,espn_player_id,url
0,Jaylen Brown,SG,3917376,http://www.espn.com/nba/player/_/id/3917376/ja...
1,Carsen EdwardsPG,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...
2,Tacko Fall,C,3904625,http://www.espn.com/nba/player/_/id/3904625/ta...
3,Jonathan Gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...
4,Javonte Green,SG,2596112,http://www.espn.com/nba/player/_/id/2596112/ja...


In [20]:
espn_player_list.head()

Unnamed: 0,name,position,espn_player_id,url
0,Jaylen Brown,SG,3917376,http://www.espn.com/nba/player/_/id/3917376/ja...
1,Carsen EdwardsPG,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...
2,Tacko Fall,C,3904625,http://www.espn.com/nba/player/_/id/3904625/ta...
3,Jonathan Gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...
4,Javonte Green,SG,2596112,http://www.espn.com/nba/player/_/id/2596112/ja...


### Getting player name and NBA ID data from the Official NBA API
Here we will use nba_py by seemethere, a Python wrapper for the offical but unpublished NBA API.

We will be using the player performance data from the NBA API and using it to predict player's ESPN position label.

The official NBA API uses a different set of Player IDs, so we will have to join ESPN and NBA data by cross referencing player names. Also, the NBA API returns some players in the NBA Delevelopment league, which we are not interested in. We will thus use a left join on the ESPN table.

In [7]:
# Loading back the data
espn_player_list = read_data_from_db()

# Getting players for the current season
nba_player_list = player.PlayerList().info()

# Convert upper case letter to upper case letter
nba_player_list.columns = [col.lower() for col in nba_player_list.columns]

# Check if number of players are the same
print("NBA roster has {} players".format(nba_player_list.shape[0]))
print("ESPN roster has {} players".format(espn_player_list.shape[0]))

Successfully read in players data into data frame
NBA roster has 516 players
ESPN roster has 563 players


#### Load NBA player name data from API to db

In [12]:
def load_data_to_db(nba_player_list):
    engine = create_engine("sqlite:///db/nba.db", echo=False)
    with engine.connect() as conn:
        nba_player_list.to_sql("nba_players", conn, dtype={'person_id':Integer}, if_exists="replace", index=False)
        print("Successfully loaded into table nba_players")
        
load_data_to_db(nba_player_list)

Successfully loaded into table nba_players


#### Examine the data

In [34]:
# Selecting columns of interest
nba_player_list = nba_player_list[['person_id','display_first_last']]
nba_player_list.head()

Unnamed: 0,person_id,display_first_last
0,203500,steven adams
1,1628389,bam adebayo
2,1629061,deng adel
3,200746,lamarcus aldridge
4,1629734,kyle alexander


In [35]:
nba_player_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 2 columns):
person_id             516 non-null int64
display_first_last    516 non-null object
dtypes: int64(1), object(1)
memory usage: 8.2+ KB


In [15]:
espn_player_list.head()

Unnamed: 0,name,position,espn_player_id,url
0,Jaylen Brown,SG,3917376,http://www.espn.com/nba/player/_/id/3917376/ja...
1,Carsen EdwardsPG,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...
2,Tacko Fall,C,3904625,http://www.espn.com/nba/player/_/id/3904625/ta...
3,Jonathan Gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...
4,Javonte Green,SG,2596112,http://www.espn.com/nba/player/_/id/2596112/ja...


In [36]:
espn_player_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 4 columns):
name              563 non-null object
position          482 non-null object
espn_player_id    563 non-null int64
url               563 non-null object
dtypes: int64(1), object(3)
memory usage: 17.7+ KB


#### Check duplicates

In [39]:
nba_player_list.duplicated().sum()

0

In [40]:
espn_player_list.duplicated().sum()

3

In [43]:
espn_player_list[espn_player_list.duplicated(keep=False)]

Unnamed: 0,name,position,espn_player_id,url
123,rodney mcgruder,SF,2488826,http://www.espn.com/nba/player/_/id/2488826/ro...
156,jawun evans,PG,3912854,http://www.espn.com/nba/player/_/id/3912854/ja...
385,brandon goodwin,PG,3057198,http://www.espn.com/nba/player/_/id/3057198/br...
427,rodney mcgruder,SF,2488826,http://www.espn.com/nba/player/_/id/2488826/ro...
481,brandon goodwin,PG,3057198,http://www.espn.com/nba/player/_/id/3057198/br...
517,jawun evans,PG,3912854,http://www.espn.com/nba/player/_/id/3912854/ja...


In [46]:
espn_player_list = espn_player_list.drop_duplicates()

In [47]:
espn_player_list.duplicated().sum()

0

#### Check N/A

In [50]:
nba_player_list.isna().sum()

person_id             0
display_first_last    0
dtype: int64

In [51]:
espn_player_list.isna().sum()

name               0
position          81
espn_player_id     0
url                0
dtype: int64

#### Load to db

In [56]:
def load_data_to_db(espn_player_list, nba_player_list):
    """Loads eplayer name and position data to db"""
    
    
    engine = create_engine("sqlite:///db/nba.db", echo=False)
    
    with engine.connect() as conn:
        espn_player_list.to_sql('espn_players', 
                                conn, 
                                dtype={"espn_player_id":Integer}, 
                                if_exists="replace", index=False)
        
        nba_player_list.to_sql('nba_players', 
                                conn, 
                                dtype={"nba_player_id":Integer}, 
                                if_exists="replace", index=False)
        
        print("Successfully loaded into table espn_players & nba_player_id")
        
        
def read_data_from_db():
    """Retrieves player name and position data from db
    
    Returns:
        espn_player_list: pd.DataFrame
            df containing espn player info including 
                -  player name, 
                -  player position, 
                -  espn player id,
                -  url to player profile
            
        nba_player_list: pd.DataFrame
            df containing nba player info including
                -  player name
                -  nba player id
    """
    
    engine = create_engine("sqlite:///db/nba.db", echo=False)
    with engine.connect() as conn:
        espn_player_list = pd.read_sql('espn_players', conn)
        nba_player_list = pd.read_sql('nba_players', conn)
        
        print("Successfully read in players data into dataframes")
        
        return espn_player_list, nba_player_list

In [57]:
load_data_to_db(espn_player_list, nba_player_list)

Successfully loaded into table espn_players & nba_player_id


### Joining 2 datasets together

We need the ESPN player position label which lives in the ESPN table, and the NBA player ID, which lives in the NBA table.

We will later use NBA player ID to call the official NBA API to retrieve player performence data, which is used to train our models.

| name | position | 
|------|------|
|Center|Center|
|Point Guard|Guard|

#### Cleaning before joining 2 data scources

We will be joining on player names, which are different across ESPN and NBA records. As such, cleaning is required



In [58]:
def sanitize_name(name_str):
    """Remove all special characters and player name suffix"""
    sanitized = name_str.lower()
    sanitized = sanitized.replace('-', ' ')
    
    to_remove = [".", "'", "jr", 'sr', 'iii']
    
    for pattern in to_remove:
        sanitized = sanitized.replace(pattern, '')
        
    # Remove special characters and trailing spaces
    sanitized = re.sub(re.compile("\s*$"), '', sanitized)
        
    return sanitized

In [68]:
espn_player_list

Unnamed: 0,name,position,espn_player_id,url
0,jaylen brown,SG,3917376,http://www.espn.com/nba/player/_/id/3917376/ja...
1,carsen edwardspg,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...
2,tacko fall,C,3904625,http://www.espn.com/nba/player/_/id/3904625/ta...
3,jonathan gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...
4,javonte green,SG,2596112,http://www.espn.com/nba/player/_/id/2596112/ja...
5,gordon hayward,SF,4249,http://www.espn.com/nba/player/_/id/4249/gordo...
6,rj hunter,SG,2983727,http://www.espn.com/nba/player/_/id/2983727/rj...
7,enes kanter,C,6447,http://www.espn.com/nba/player/_/id/6447/enes-...
8,romeo langfordsg,,4397008,http://www.espn.com/nba/player/_/id/4397008/ro...
9,semi ojeleye,PF,3056602,http://www.espn.com/nba/player/_/id/3056602/se...


In [59]:
# Getting a list of ESPN player names not in NBA player name list
espn_player_list[~espn_player_list.name.isin(nba_player_list.display_first_last)].head()

Unnamed: 0,name,position,espn_player_id,url
1,carsen edwardspg,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...
3,jonathan gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...
6,rj hunter,SG,2983727,http://www.espn.com/nba/player/_/id/2983727/rj...
8,romeo langfordsg,,4397008,http://www.espn.com/nba/player/_/id/4397008/ro...
17,tremont waterspg,,4278080,http://www.espn.com/nba/player/_/id/4278080/tr...


In [60]:
# Sanitize both dfs
nba_player_list.display_first_last = nba_player_list.display_first_last.astype('str')
espn_player_list.name = espn_player_list.name.astype('str')

espn_player_list.name = espn_player_list.name.apply(sanitize_name)
nba_player_list.display_first_last = nba_player_list.display_first_last.apply(sanitize_name)

In [61]:
# Handle a few special cases where names are different across two data sources
espn_player_list.loc[espn_player_list.espn_player_id == 1713, 'name'] = 'nene'
espn_player_list.loc[espn_player_list.espn_player_id == 4017839, 'name'] = 'juancho hernangomez'
espn_player_list.loc[espn_player_list.espn_player_id == 3056247, 'name'] = 'kendrick nunn'
espn_player_list.loc[espn_player_list.espn_player_id == 2528586, 'name'] = 'walter lemon'
espn_player_list.loc[espn_player_list.espn_player_id == 3133602, 'name'] = 'svi mykhailiuk'

In [62]:
merged_df = espn_player_list.join(nba_player_list.set_index("display_first_last", drop=True),
                                    on='name')

In [63]:
# Check for rows that didn't join correctly
merged_df[merged_df.isnull().any(axis=1)]

Unnamed: 0,name,position,espn_player_id,url,person_id
1,carsen edwardspg,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...,
3,jonathan gibson,PG,2234666,http://www.espn.com/nba/player/_/id/2234666/jo...,
6,rj hunter,SG,2983727,http://www.espn.com/nba/player/_/id/2983727/rj...,
8,romeo langfordsg,,4397008,http://www.espn.com/nba/player/_/id/4397008/ro...,
17,tremont waterspg,,4278080,http://www.espn.com/nba/player/_/id/4278080/tr...,
18,grant williamspf,,4066218,http://www.espn.com/nba/player/_/id/4066218/gr...,
21,nicolas claxtonpf,,4278067,http://www.espn.com/nba/player/_/id/4278067/ni...,
25,jaylen handspg,,4278526,http://www.espn.com/nba/player/_/id/4278526/ja...,
36,alan williams,PF,2579326,http://www.espn.com/nba/player/_/id/2579326/al...,
38,rj barrettsf,,4395625,http://www.espn.com/nba/player/_/id/4395625/rj...,


In [66]:
merged_df[merged_df.position.isnull()]

Unnamed: 0,name,position,espn_player_id,url,person_id
1,carsen edwardspg,,4066407,http://www.espn.com/nba/player/_/id/4066407/ca...,
8,romeo langfordsg,,4397008,http://www.espn.com/nba/player/_/id/4397008/ro...,
17,tremont waterspg,,4278080,http://www.espn.com/nba/player/_/id/4278080/tr...,
18,grant williamspf,,4066218,http://www.espn.com/nba/player/_/id/4066218/gr...,
21,nicolas claxtonpf,,4278067,http://www.espn.com/nba/player/_/id/4278067/ni...,
25,jaylen handspg,,4278526,http://www.espn.com/nba/player/_/id/4278526/ja...,
38,rj barrettsf,,4395625,http://www.espn.com/nba/player/_/id/4395625/rj...,
39,ignas brazdeikissf,,4397205,http://www.espn.com/nba/player/_/id/4397205/ig...,
71,marial shayoksg,,3138201,http://www.espn.com/nba/player/_/id/3138201/ma...,
74,matisse thybullesg,,3907498,http://www.espn.com/nba/player/_/id/3907498/ma...,


In [30]:
# Kendrick Nunn and Cody Demps just got traded and 
# has not played any games in his career, so we're dropping them

merged_df = merged_df.dropna()
merged_df = merged_df.rename(columns={"person_id":"nba_id"})
merged_df.shape

(411, 5)

In [31]:
# Load to DB
with engine.connect() as conn:
    merged_df.to_sql("players", conn, dtype={'nba_id':Integer, 'espn_player_id':Integer}, index=False, if_exists='replace')

NameError: name 'engine' is not defined

In [32]:
merged_df.head()

Unnamed: 0,name,position,espn_player_id,url,nba_id
0,jaylen brown,SG,3917376,http://www.espn.com/nba/player/_/id/3917376/ja...,1627759.0
2,tacko fall,C,3904625,http://www.espn.com/nba/player/_/id/3904625/ta...,1629605.0
4,javonte green,SG,2596112,http://www.espn.com/nba/player/_/id/2596112/ja...,1629750.0
5,gordon hayward,SF,4249,http://www.espn.com/nba/player/_/id/4249/gordo...,202330.0
7,enes kanter,C,6447,http://www.espn.com/nba/player/_/id/6447/enes-...,202683.0


## Loading NBA Player Stat into DB

In [None]:
# Read back the db
with engine.connect() as conn:
    players = pd.read_sql("players", conn)
    
# Getting unique set of player_ids
nba_ids = players.nba_id.unique()

print("Total of {} unique IDs".format(nba_ids.shape[0]))

In [None]:
dfs_to_concat = []
for idx in tqdm_notebook(nba_ids):
    sleep(1) # Prevent from being banned
    try: 
        dfs_to_concat.append(get_player_stat(idx))
    except NoDataError:
        # This is when the player has no data and returns an empty df
        continue

In [None]:
final_df = pd.concat(dfs_to_concat, sort=False)

In [None]:
final_df.to_csv("Finaldf.csv")

In [None]:
final_df.index.name = "nba_id"

In [None]:
final_df.head()

In [None]:
with engine.connect() as conn:
    final_df.to_sql("stat", conn)