In [5]:
import sys
sys.path.append('/Users/victor/Documents/code/nba_api')

import re
import pandas as pd
import numpy as np

from src.espn_scraper import espn_player_scraper
from src.player_stat import get_player_stat, NoDataError

from sqlalchemy import create_engine, Integer
from tqdm import tqdm_notebook
from nba_py import player
from time import sleep

## Scrape ESPN for Player Position and Load to DB

In [6]:
# Initialize the db. Relatively small dataset, so we'll use SQLite.
# The URL uses a relative path, so db/nba.db is resolved against the
# notebook's working directory. echo=False turns off SQLAlchemy's SQL logging.
engine = create_engine("sqlite:///db/nba.db", echo=False)

In [None]:
# Scrape ESPN pages
# Entry point for the scrape: ESPN's NBA players landing page.
teams_overview_url = "http://www.espn.com/nba/players"
scraper = espn_player_scraper()
# The result is handled as a DataFrame downstream (it is written with .to_sql).
espn_player_list = scraper.scrape_all_players(teams_overview_url)

In [None]:
# Persist the scraped ESPN roster, replacing any earlier copy of the table
with engine.connect() as conn:
    espn_player_list.to_sql(
        'espn_players',
        conn,
        if_exists="replace",
        index=False,
        dtype={"espn_player_id": Integer},
    )

## Getting player performance data from NBA API
Here we will use nba_py by seemethere, a Python wrapper for the unpublished official NBA API.

The official NBA API uses a different set of player IDs, so we will have to join the ESPN and NBA data by cross-referencing player names. Also, the NBA API returns some players in the NBA Development League, whom we are not interested in. We will thus use a left join on the ESPN table.

In [None]:
# Read the ESPN roster back from the database
with engine.connect() as conn:
    espn_player_list = pd.read_sql('espn_players', conn)

# Fetch the NBA API player list for the current season
nba_player_list = player.PlayerList().info()

# Convert the NBA column names to lower case
nba_player_list.columns = [column_name.lower() for column_name in nba_player_list.columns]

# Compare the number of players reported by each source
print("NBA roster has {} players".format(len(nba_player_list)))
print("ESPN roster has {} players".format(len(espn_player_list)))

In [None]:
# Persist the raw NBA player list, replacing any earlier copy of the table
with engine.connect() as conn:
    nba_player_list.to_sql(
        "nba_players",
        conn,
        if_exists="replace",
        index=False,
        dtype={'person_id': Integer},
    )

In [None]:
# Preview the first rows of the NBA player list (all columns)
nba_player_list.head()

In [None]:
# Selecting columns of interest.
# .copy() yields an independent frame, so the column assignments performed
# later while sanitizing names do not trigger pandas' SettingWithCopyWarning.
nba_player_list = nba_player_list[['person_id','display_first_last']].copy()
nba_player_list.head()

In [None]:
# Preview the first rows of the ESPN player list
espn_player_list.head()

### Cleaning before join

In [None]:
# Cleaning the data a bit for joins
def sanitize_name(name_str):
    """Normalize a player name so ESPN and NBA spellings can be matched.

    The name is lower-cased, hyphens are turned into spaces, periods and
    apostrophes are dropped, a trailing generational suffix ("jr", "sr",
    "iii") is removed, and surrounding whitespace is stripped.

    The suffix is only removed when it is a separate final word. Removing it
    as a plain substring would mangle names that merely contain those letters
    (e.g. "J.R. Smith" would collapse to " smith").

    Parameters
    ----------
    name_str : str
        Raw player name.

    Returns
    -------
    str
        The normalized name.
    """
    sanitized = name_str.lower().replace('-', ' ')

    # Punctuation that differs between the two sources
    for char in (".", "'"):
        sanitized = sanitized.replace(char, '')

    # Drop a generational suffix only when it is the last whitespace-separated token
    sanitized = re.sub(r"\s+(?:jr|sr|iii)\s*$", '', sanitized)

    # Remove leading and trailing whitespace
    return sanitized.strip()

In [None]:
# Getting a list of ESPN player names not in NBA player name list.
# This compares the names exactly as they currently are in both frames and
# shows only the first rows of the mismatches.
espn_player_list[~espn_player_list.name.isin(nba_player_list.display_first_last)].head()

In [None]:
# Sanitize both dfs: cast each name column to str, then normalize it with sanitize_name
espn_player_list['name'] = espn_player_list['name'].astype('str').apply(sanitize_name)
nba_player_list['display_first_last'] = (
    nba_player_list['display_first_last'].astype('str').apply(sanitize_name)
)

In [None]:
# Handle a few special cases where names differ across the two sources.
# Keys are ESPN player ids; values are the names to use for the join.
name_overrides = {
    1713: 'nene',
    4017839: 'juancho hernangomez',
    3056247: 'kendrick nunn',
    2528586: 'walter lemon',
    3133602: 'svi mykhailiuk',
}
for espn_id, corrected_name in name_overrides.items():
    espn_player_list.loc[espn_player_list.espn_player_id == espn_id, 'name'] = corrected_name

In [None]:
# Left join: keep every ESPN player and attach the NBA person_id by sanitized name
merged_df = espn_player_list.join(
    nba_player_list.set_index("display_first_last"),
    on='name',
    how='left',
)

In [None]:
# Check for rows that didn't join correctly:
# show every row that has a missing value in at least one column
merged_df[merged_df.isnull().any(axis=1)]

In [None]:
# Show the rows whose position is missing
merged_df[merged_df.position.isnull()]

In [None]:
# Kendrick Nunn and Cody Demps were just traded and have not played any
# games in their careers, so we're dropping them.
# Note: dropna() removes every row that has a missing value in any column.
merged_df = merged_df.dropna().rename(columns={"person_id": "nba_id"})
merged_df.shape

In [None]:
# Persist the joined player table, replacing any earlier copy
with engine.connect() as conn:
    merged_df.to_sql(
        "players",
        conn,
        if_exists='replace',
        index=False,
        dtype={'nba_id': Integer, 'espn_player_id': Integer},
    )

In [None]:
# Preview the first rows of the joined player table
merged_df.head()

## Loading NBA Player Stats into DB

In [7]:
# Load the joined player table back from the database
with engine.connect() as conn:
    players = pd.read_sql("players", conn)

# Distinct NBA ids to request stats for
nba_ids = players["nba_id"].unique()

print("Total of {} unique IDs".format(len(nba_ids)))

Total of 489 unique IDs


In [11]:
# Collect one stat frame per player; players without data are skipped
dfs_to_concat = []
for idx in tqdm_notebook(nba_ids):
    sleep(1)  # Throttle requests so we don't get banned
    try:
        player_stat = get_player_stat(idx)
    except NoDataError:
        # Raised when the player has no data (an empty df comes back)
        continue
    dfs_to_concat.append(player_stat)

HBox(children=(IntProgress(value=0, max=489), HTML(value='')))




In [12]:
# Stack the per-player frames into one table; sort=False keeps the columns in
# the order they are encountered instead of sorting them
final_df = pd.concat(dfs_to_concat, sort=False)

In [13]:
# Write the combined stats to a CSV file in the working directory.
# NOTE(review): the index is only named "nba_id" in a later cell, so the index
# column of this CSV may have no header unless get_player_stat already names it.
final_df.to_csv("Finaldf.csv")

In [17]:
# Label the index so it is written out under the name "nba_id"
final_df.index.name = "nba_id"

In [19]:
# Preview the first rows of the combined stat table
final_df.head()

Unnamed: 0_level_0,shot_res,shot_in_paint,shot_mid_range,shot_lcorner_3,shot_rcorner_3,shot_above_3,fga,block_res,block_in_paint,block_mid_range,block_lcorner_3,block_rcorner_3,block_above_3,blka,oreb,dreb,ast,stl,min
nba_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
203382,0.515695,0.125561,0.0852018,0.0224215,0.0269058,0.224215,223,0.749999,0.25,0.0,0.0,0.0,0.0,12,9.3,18.7,6.3,1.4,2305.288333
1627759,0.367424,0.156566,0.127525,0.0328283,0.0643939,0.251263,792,0.674418,0.27907,0.0,0.0,0.0232558,0.0232558,43,3.1,12.4,5.0,3.3,4065.096667
1628408,0.333333,0.190476,0.095238,0.0,0.095238,0.285714,21,0.0,0.0,0.0,0.0,0.0,0.0,0,5.7,25.5,4.7,1.9,54.031667
202330,0.261417,0.15748,0.218898,0.0503937,0.0598425,0.251968,635,0.629629,0.259259,0.037037,0.037037,0.0,0.037037,27,1.3,16.1,6.3,1.6,1868.588333
201143,0.285319,0.198061,0.235457,0.0124654,0.0166205,0.252078,722,0.586207,0.37931,0.0344827,0.0,0.0,0.0,29,5.1,17.2,14.0,2.3,4249.708333


In [18]:
# Persist the combined stat table (the index is written as a column).
# if_exists="replace" matches the other tables written by this notebook and
# lets the cell be re-run; the default ("fail") raises once the table exists.
with engine.connect() as conn:
    final_df.to_sql("stat", conn, if_exists="replace")