In [1]:
__author__ = "Victor Xu"
__email__ = "victor.c.xu@gmail.com"
__website__ = "victorxu.me"

__copyright__ = "Copyright 2019, Victor Xu"

# Problem Definition

The goal of this analysis is to predict NBA player positions given performance data such as shot count, shot location, and player defense. Each player in the league is assigned a position label such as point guard, shooting guard, or center.

### Why is predicting player position important?
If you are a basketball fan, you have probably noticed that player positions are inconsistent across data sources. For instance, ESPN and the League even use different position categories altogether.


| ESPN | NBA   |
|------|------|
|Center|Center|
|Point Guard|Guard|
|Shooting Guard|Guard|
|Small Forward|Forward|
|Power Forward|Forward|
|No Direct Translation|Guard-Forward|
|No Direct Translation|Center-Forward|

Though we will not be examining the difference between NBA and ESPN player labeling methodologies, by looking at how ML algorithms classify players into ESPN categories we can nevertheless gain insight into the structure of player labels. Specifically, this allows us to examine whether player labels are well separated or not.

In [2]:
import re
import pandas as pd
import numpy as np

from src.espn_scraper import espn_player_scraper
from src.player_stat import get_player_stat, NoDataError

from sqlalchemy import create_engine, Integer
from tqdm import tqdm_notebook
from nba_py import player
from time import sleep

## Data Acquisition
#### Scrape ESPN for player names and position labels with our scraper

In [None]:
# Scrape ESPN pages
# URL handed to the project scraper as its starting page.
teams_overview_url = "http://www.espn.com/nba/players"
scraper = espn_player_scraper()
# NOTE(review): presumably returns a DataFrame with one row per player — later
# cells call .head()/.info() on it and read its name, position and
# espn_player_id columns; confirm against src/espn_scraper.
espn_player_list = scraper.scrape_all_players(teams_overview_url)

 87%|████████▋ | 26/30 [01:42<00:15,  3.87s/it]

In [None]:
espn_player_list.head()

In [None]:
espn_player_list.head()

### Getting player name and NBA ID data from the Official NBA API
Here we will use nba_py by seemethere, a Python wrapper for the official but unpublished NBA API.

We will be using the player performance data from the NBA API and using it to predict player's ESPN position label.

The official NBA API uses a different set of player IDs, so we will have to join ESPN and NBA data by cross-referencing player names. Also, the NBA API returns some players from the NBA Development League, which we are not interested in. We will thus use a left join on the ESPN table.

In [None]:
# Pull the NBA player list for each of the last 3 seasons
nba_player_list_17, nba_player_list_18, nba_player_list_19 = (
    player.PlayerList(season=season).info()
    for season in ('2017-18', '2018-19', '2019-20')
)

# Stack the three seasons, drop exact duplicate rows and
# convert the upper-case column names to lower case
nba_player_list = (
    pd.concat([nba_player_list_17, nba_player_list_18, nba_player_list_19])
    .drop_duplicates()
    .rename(columns=str.lower)
)

# Compare how many players each source contributes
print("NBA roster has {} players over past 3 seasons".format(len(nba_player_list)))
print("ESPN roster has {} players in the current season.".format(len(espn_player_list)))

#### Examine the data

In [None]:
# Selecting columns of interest: the NBA player id and the lower-cased display name.
# The earlier drop_duplicates() compared full rows, so a player whose other
# columns differ between seasons may still appear more than once. Once only
# these two columns remain such rows become exact duplicates, and they must be
# dropped here or the later join on player name would repeat ESPN rows.
nba_player_list = (
    nba_player_list[['person_id', 'display_first_last']]
    .assign(display_first_last=lambda df: df['display_first_last'].str.lower())
    .drop_duplicates()
    .reset_index(drop=True)  # concat left repeated index labels behind
)
nba_player_list.head()

In [None]:
nba_player_list.info()

In [None]:
espn_player_list.head()

In [None]:
espn_player_list.info()

#### Check duplicates

In [None]:
nba_player_list.duplicated().sum()

In [None]:
espn_player_list.duplicated().sum()

In [None]:
espn_player_list[espn_player_list.duplicated(keep=False)]

In [None]:
espn_player_list = espn_player_list.drop_duplicates()

In [None]:
espn_player_list.duplicated().sum()

#### Check N/A

A small number of newly drafted rookies will not have a position assigned to them yet. Their positions will be N/As

In [None]:
nba_player_list.isna().sum()

In [None]:
espn_player_list.isna().sum()

In [None]:
espn_player_list = espn_player_list.dropna()

#### Load to db

In [None]:
def load_data_to_db(espn_player_list, nba_player_list):
    """Loads player name and position data to db

    Writes the two frames to the SQLite database at db/nba.db, replacing the
    tables if they already exist.

    Args:
        espn_player_list: pd.DataFrame
            ESPN player info, stored in table `espn_players`
            (`espn_player_id` is stored as an integer column)

        nba_player_list: pd.DataFrame
            NBA player info, stored in table `nba_players`
            (`person_id` is stored as an integer column)
    """

    engine = create_engine("sqlite:///db/nba.db", echo=False)

    try:
        with engine.connect() as conn:
            espn_player_list.to_sql('espn_players',
                                    conn,
                                    dtype={"espn_player_id": Integer},
                                    if_exists="replace", index=False)

            # The id column of the NBA frame is `person_id`; a dtype key that
            # matches no column is silently ignored by to_sql.
            nba_player_list.to_sql('nba_players',
                                   conn,
                                   dtype={"person_id": Integer},
                                   if_exists="replace", index=False)
    finally:
        # Release the pooled connection even when a write fails
        engine.dispose()

    print("Successfully loaded into table espn_players & nba_players")
        
        
def read_data_from_db():
    """Retrieves player name and position data from db

    Reads tables `espn_players` and `nba_players` from the SQLite database
    at db/nba.db.

    Returns:
        espn_player_list: pd.DataFrame
            df containing espn player info including
                -  player name,
                -  player position,
                -  espn player id,
                -  url to player profile

        nba_player_list: pd.DataFrame
            df containing nba player info including
                -  player name
                -  nba player id
    """

    engine = create_engine("sqlite:///db/nba.db", echo=False)
    try:
        with engine.connect() as conn:
            espn_player_list = pd.read_sql('espn_players', conn)
            nba_player_list = pd.read_sql('nba_players', conn)
    finally:
        # Release the pooled connection, matching load_joined_data_to_db
        engine.dispose()

    print("Successfully read in players data into dataframes")

    return espn_player_list, nba_player_list

In [None]:
load_data_to_db(espn_player_list, nba_player_list)

### Joining 2 datasets together

We need the ESPN player position label which lives in the ESPN table, and the NBA player ID, which lives in the NBA table.

We will later use the NBA player ID to call the official NBA API to retrieve player performance data, which is used to train our models.

| name | position | 
|------|------|
|Center|Center|
|Point Guard|Guard|

#### Cleaning before joining 2 data sources

We will be joining on player names, which are written differently across ESPN and NBA records. As such, cleaning is required.



In [None]:
def sanitize_name(name_str):
    """Normalise a player name so it can be used as a join key.

    The name is lower-cased, hyphens become spaces, periods and apostrophes
    are dropped, whitespace is collapsed, and a trailing generational suffix
    (jr, sr, iii) is removed.

    Args:
        name_str: str
            raw player name, e.g. "Dennis Smith Jr."

    Returns:
        str: sanitized name, e.g. "dennis smith"
    """
    suffixes = {"jr", "sr", "iii"}

    sanitized = name_str.lower().replace('-', ' ')
    for char in (".", "'"):
        sanitized = sanitized.replace(char, '')

    # Work on whole tokens: removing "jr"/"sr"/"iii" as substrings would also
    # eat them out of real names ("J.R. Smith" -> " smith", "Israel" -> "iael").
    tokens = sanitized.split()

    # Only strip suffixes from the end, and never strip the last remaining token
    while len(tokens) > 1 and tokens[-1] in suffixes:
        tokens.pop()

    # Joining on a single space also removes leading/trailing/repeated spaces
    return ' '.join(tokens)

In [None]:
# Getting a list of ESPN player names not in NBA player name list
espn_player_list[~espn_player_list.name.isin(nba_player_list.display_first_last)].head()

In [None]:
# Normalise the join key on both frames: cast to str, then run every
# value through sanitize_name so the two sources can be matched on name
espn_player_list['name'] = (
    espn_player_list['name'].astype('str').apply(sanitize_name)
)
nba_player_list['display_first_last'] = (
    nba_player_list['display_first_last'].astype('str').apply(sanitize_name)
)

In [None]:
# Handle a few special cases where names are different across two data sources.
# Maps ESPN player id -> the name to use when matching against the NBA records.
name_overrides = {
    '1713': 'nene',
    '4017839': 'juancho hernangomez',
    '3056247': 'kendrick nunn',
    '2528586': 'walter lemon',
    '3133602': 'svi mykhailiuk',
    '4066508': 'charles brown',
    '4395627': 'cameron reddish',
}

for espn_id, nba_name in name_overrides.items():
    espn_player_list.loc[espn_player_list.espn_player_id == espn_id, 'name'] = nba_name

In [None]:
# DataFrame.join defaults to a left join: every ESPN row is kept and the NBA
# person_id is attached where the sanitized names match; ESPN players without
# an NBA match end up with a null person_id.
merged_df = espn_player_list.join(nba_player_list.set_index("display_first_last", drop=True),
                                    on='name')

In [None]:
# Check for rows that didn't join correctly
merged_df[merged_df.isnull().any(axis=1)]

In [None]:
merged_df[merged_df.position.isnull()]

In [None]:
# Tyler Cook and Cody Demps were just traded and have not played any games
# in their careers, so we're dropping them (dropna removes every row that
# still contains a null), then keeping only the renamed columns we need
merged_df = (
    merged_df
    .dropna()
    .rename(columns={"person_id": "nba_id", 'position': 'espn_position'})
    .loc[:, ['name', 'espn_position', 'nba_id']]
)

In [None]:
merged_df.head()

#### Loading joined df to db

In [None]:
def load_joined_data_to_db(merged_df):
    """Loads final player identity and position data to db

    Writes the frame to table `players` in the SQLite database at db/nba.db,
    replacing the table if it already exists.

    Args:
        merged_df: pd.DataFrame
            joined ESPN/NBA player data; `nba_id` is stored as an
            integer column
    """

    engine = create_engine("sqlite:///db/nba.db", echo=False)

    try:
        with engine.connect() as conn:
            # Write the frame passed in, not the notebook-level espn_player_list
            merged_df.to_sql('players',
                             conn,
                             dtype={"nba_id": Integer},
                             if_exists="replace", index=False)
    finally:
        # Dispose after the connection is closed, and also when the write fails
        engine.dispose()

    print("Successfully loaded into table players")
        
load_joined_data_to_db(merged_df)

In [None]:
# Getting unique set of player_ids
nba_ids = merged_df.nba_id.unique()

print("Total of {} unique IDs".format(nba_ids.shape[0]))

In [None]:
dfs_to_concat = []
for nba_id in tqdm_notebook(nba_ids):
    # Pause before every request so we don't get banned
    sleep(2)
    try:
        player_stat = get_player_stat(nba_id)
    except NoDataError:
        # The player has no data (empty df), so there is nothing to keep
        continue
    dfs_to_concat.append(player_stat)

In [None]:
final_df = pd.concat(dfs_to_concat, sort=False)

In [None]:
final_df.to_csv("Finaldf.csv")

In [None]:
final_df.index.name = "nba_id"

In [None]:
final_df.head()

In [None]:
# `engine` only exists as a local variable inside the db helper functions, so
# a fresh kernel would raise NameError here; build the engine explicitly.
engine = create_engine("sqlite:///db/nba.db", echo=False)
try:
    with engine.connect() as conn:
        # Replace the table so re-running the notebook does not fail once
        # `stat` already exists (same policy as the other tables)
        final_df.to_sql("stat", conn, if_exists="replace")
finally:
    engine.dispose()