# Unsupervised Learning for Clustering NBA Players
---
## Web Scraping
Data for this project was scraped from the popular NBA stats-tracking website https://www.basketball-reference.com/.

Remy Shea, May 2019, https://github.com/RemShea/nba-player-clustering

---
# Import Libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import requests
from bs4 import BeautifulSoup
import time

%matplotlib inline

In [5]:
# Fetch and parse the Basketball Reference home page; the team-URL cell
# below reads the conference standings tables out of `soup`.
url = 'https://www.basketball-reference.com/'
# A timeout keeps the notebook from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail here with a clear HTTP error (e.g. 429 rate limit) instead of later
# with an AttributeError on a table that was never in the response.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

---
# Gathering Team URLs

In [7]:
# Collect every team's page URL from the two conference standings tables
# on the home page (West first, then East, matching the original order).
west = soup.find('table', {'id': 'confs_standings_W'})
east = soup.find('table', {'id': 'confs_standings_E'})
teams = []
team_urls = []
for conf_id, conf_table in (('confs_standings_W', west),
                            ('confs_standings_E', east)):
    if conf_table is None:
        # Most likely `soup` is not the home page (hidden notebook state) or
        # the site layout changed; say so instead of raising AttributeError.
        raise ValueError(f'Standings table {conf_id!r} not found in `soup`')
    # The first <tr> is skipped -- presumably the header row.
    for row in conf_table.find_all('tr')[1:]:
        anchor = row.find('a')
        if anchor is None or not anchor.get('href'):
            # Row without a team link (e.g. a divider row): nothing to record.
            continue
        team_link = anchor['href']
        team_url = 'https://www.basketball-reference.com' + team_link
        team_urls.append(team_url)

---
# Gathering Player URLs

In [8]:
%%time
# Build `players`: display name -> {'url': player page URL}, by visiting
# each team page and reading the links in its first table.
# Loop-local names are deliberately distinct from `url`/`res`/`soup` so the
# parsed home page is not overwritten while this cell runs.
players = dict()
for team_url in team_urls:
    print(f'Scraping data from {team_url}...')
    roster_res = requests.get(team_url, timeout=30)
    roster_res.raise_for_status()
    roster_soup = BeautifulSoup(roster_res.content, 'lxml')
    # The first <table> on a team page is taken to be the roster -- TODO confirm.
    roster_table = roster_soup.find('table')
    if roster_table is None:
        print(f'No table found at {team_url}; skipping this team.')
        time.sleep(1)
        continue
    # href=True skips anchors that carry no link at all.
    for anchor in roster_table.find_all('a', href=True):
        link = anchor['href']
        name = anchor.text
        # Only links containing 'html' are kept -- presumably the player
        # pages, as opposed to other links in the same table.
        if 'html' in link:
            # Keyed by display name, so a player listed on several rosters
            # collapses into a single entry.
            players[name] = {'url': 'https://www.basketball-reference.com' + link}
    # Pause between team pages to stay polite to the server.
    time.sleep(1)
print('Finished collecting roster data!\n'+'-'*60)

Scraping data from https://www.basketball-reference.com/teams/GSW/2019.html...
Scraping data from https://www.basketball-reference.com/teams/DEN/2019.html...
Scraping data from https://www.basketball-reference.com/teams/POR/2019.html...
Scraping data from https://www.basketball-reference.com/teams/HOU/2019.html...
Scraping data from https://www.basketball-reference.com/teams/UTA/2019.html...
Scraping data from https://www.basketball-reference.com/teams/OKC/2019.html...
Scraping data from https://www.basketball-reference.com/teams/SAS/2019.html...
Scraping data from https://www.basketball-reference.com/teams/LAC/2019.html...
Scraping data from https://www.basketball-reference.com/teams/SAC/2019.html...
Scraping data from https://www.basketball-reference.com/teams/LAL/2019.html...
Scraping data from https://www.basketball-reference.com/teams/MIN/2019.html...
Scraping data from https://www.basketball-reference.com/teams/MEM/2019.html...
Scraping data from https://www.basketball-reference.

---
# Gathering Player Data

In [10]:
MIN_GAMES=10
MIN_MINUTES=500
SEASON = 2019        # season-ending year used in row ids such as 'per_game.2019'
REQUEST_DELAY = 3    # seconds to wait after each player-page request


def _season_cells(page_soup, table_id, season=SEASON):
    """Return the right-aligned <td> cells of one season row, or None.

    Looks for ``<table id=table_id>`` directly in ``page_soup``. If it is not
    there, falls back to the ``all_<table_id>`` wrapper div with its HTML
    comment opener stripped, since some tables are only reachable that way.
    Returns None when the table or its ``<table_id>.<season>`` row is missing.
    """
    table = page_soup.find('table', {'id': table_id})
    if table is None:
        wrapper = page_soup.find('div', {'id': f'all_{table_id}'})
        if wrapper is None:
            return None
        uncommented = str(wrapper).replace('<!--', '')
        # Explicit parser: same one used for the page itself.
        table = BeautifulSoup(uncommented, 'lxml').find('table', {'id': table_id})
        if table is None:
            return None
    row = table.find('tr', {'id': f'{table_id}.{season}'})
    if row is None:
        return None
    return row.find_all('td', {'class': 'right'})


def _parse_cells(cells):
    """Map each non-empty cell's ``data-stat`` name to its value as a float.

    A trailing '%' is removed before conversion. Any other non-numeric text
    raises ValueError.
    """
    parsed = {}
    for cell in cells:
        text = cell.text
        if text == '':
            continue
        if text.endswith('%'):
            text = text.replace('%', '')
        parsed[cell['data-stat']] = float(text)
    return parsed


# Visit every player page and merge the per-game, advanced, shooting and
# play-by-play season rows into that player's dict. Players with no season
# row, or below the games/minutes minimums, are recorded in
# `failed_to_qualify`.
failed_to_qualify=[]
for i,player in enumerate(players):
    start = time.time()
    url = players[player]['url']
    players[player]['name']=player
    print(f'Collecting stats for {player} from {url} ...')
    res=requests.get(url, timeout=30)
    # An error page (e.g. 429) must not be mistaken for "player has no stats".
    res.raise_for_status()
    # NOTE: the data-dictionary cell further down reads `soup`, so this name
    # must keep pointing at the most recently parsed player page.
    soup=BeautifulSoup(res.content,'lxml')

    per_game_stats = _season_cells(soup, 'per_game')
    if per_game_stats is None:
        print(f'{player} didn\'t play in an NBA game in {SEASON}! YEET!')
        failed_to_qualify.append(player)
        # Still pause: a request was made even though no stats were found.
        time.sleep(REQUEST_DELAY)
        continue
    players[player].update(_parse_cells(per_game_stats))

    # Same order as before, so later tables still overwrite earlier ones on
    # any shared stat name.
    for table_id in ('advanced', 'shooting', 'pbp'):
        season_cells = _season_cells(soup, table_id)
        if season_cells is None:
            # Previously this aborted the whole scrape with AttributeError.
            print(f'No {SEASON} {table_id} row found for {player}; skipping that table.')
            continue
        players[player].update(_parse_cells(season_cells))

    # Missing 'g'/'mp' count as 0: a player whose minimums cannot be verified
    # is treated as not qualifying rather than raising KeyError.
    if players[player].get('g', 0) < MIN_GAMES or players[player].get('mp', 0) < MIN_MINUTES:
        print(f'{player} failed to meet for minutes or games played minimum! YEET!')
        failed_to_qualify.append(player)

    time.sleep(REQUEST_DELAY)
    end=time.time()
    # Rough estimate: remaining players times the duration of this iteration.
    print(f'ETA: {(len(players)-(i+1))*(end-start)} seconds...')
print('Finished collecting player data!\n'+'-'*60)

Collecting stats for Klay Thompson from https://www.basketball-reference.com/players/t/thompkl01.html ...
ETA: 1828.7703475952148 seconds...
Collecting stats for Draymond Green from https://www.basketball-reference.com/players/g/greendr01.html ...
ETA: 1825.1189517974854 seconds...
Collecting stats for Damian Jones from https://www.basketball-reference.com/players/j/jonesda03.html ...
Damian Jones failed to meet for minutes or games played minimum! YEET!
ETA: 1755.4769048690796 seconds...
Collecting stats for Jordan Bell from https://www.basketball-reference.com/players/b/belljo01.html ...
ETA: 1750.8032658100128 seconds...
Collecting stats for Stephen Curry from https://www.basketball-reference.com/players/c/curryst01.html ...
ETA: 1874.4748106002808 seconds...
Collecting stats for Andre Iguodala from https://www.basketball-reference.com/players/i/iguodan01.html ...
ETA: 1843.2811691761017 seconds...
Collecting stats for Shaun Livingston from https://www.basketball-reference.com/playe

ETA: 1646.0064613819122 seconds...
Collecting stats for Nene from https://www.basketball-reference.com/players/h/hilarne01.html ...
ETA: 1753.7365064620972 seconds...
Collecting stats for P.J. Tucker from https://www.basketball-reference.com/players/t/tuckepj01.html ...
ETA: 1658.8625783920288 seconds...
Collecting stats for Chris Paul from https://www.basketball-reference.com/players/p/paulch01.html ...
ETA: 1692.1095480918884 seconds...
Collecting stats for Gerald Green from https://www.basketball-reference.com/players/g/greenge01.html ...
ETA: 1704.437667131424 seconds...
Collecting stats for Trevon Duval from https://www.basketball-reference.com/players/d/duvaltr01.html ...
Trevon Duval failed to meet for minutes or games played minimum! YEET!
ETA: 1525.2911758422852 seconds...
Collecting stats for Isaiah Hartenstein from https://www.basketball-reference.com/players/h/harteis01.html ...
Isaiah Hartenstein failed to meet for minutes or games played minimum! YEET!
ETA: 1561.490623950

ETA: 1408.3901381492615 seconds...
Collecting stats for Derrick White from https://www.basketball-reference.com/players/w/whitede01.html ...
ETA: 1396.4788134098053 seconds...
Collecting stats for Patty Mills from https://www.basketball-reference.com/players/m/millspa02.html ...
ETA: 1439.1676120758057 seconds...
Collecting stats for Rudy Gay from https://www.basketball-reference.com/players/g/gayru01.html ...
ETA: 1520.1684784889221 seconds...
Collecting stats for Davis Bertans from https://www.basketball-reference.com/players/b/bertada01.html ...
ETA: 1444.4398641586304 seconds...
Collecting stats for Lonnie Walker from https://www.basketball-reference.com/players/w/walkelo01.html ...
Lonnie Walker failed to meet for minutes or games played minimum! YEET!
ETA: 1393.2516553401947 seconds...
Collecting stats for Dante Cunningham from https://www.basketball-reference.com/players/c/cunnida01.html ...
ETA: 1462.7845067977905 seconds...
Collecting stats for Marco Belinelli from https://www

ETA: 1278.5688779354095 seconds...
Collecting stats for Mike Muscala from https://www.basketball-reference.com/players/m/muscami01.html ...
ETA: 1261.6697630882263 seconds...
Collecting stats for Moritz Wagner from https://www.basketball-reference.com/players/w/wagnemo01.html ...
Moritz Wagner failed to meet for minutes or games played minimum! YEET!
ETA: 1210.6430730819702 seconds...
Collecting stats for Isaac Bonga from https://www.basketball-reference.com/players/b/bongais01.html ...
Isaac Bonga failed to meet for minutes or games played minimum! YEET!
ETA: 1190.860071182251 seconds...
Collecting stats for Kentavious Caldwell-Pope from https://www.basketball-reference.com/players/c/caldwke01.html ...
ETA: 1220.082329750061 seconds...
Collecting stats for Rajon Rondo from https://www.basketball-reference.com/players/r/rondora01.html ...
ETA: 1276.8142309188843 seconds...
Collecting stats for LeBron James from https://www.basketball-reference.com/players/j/jamesle01.html ...
ETA: 1353

ETA: 1096.0339698791504 seconds...
Collecting stats for Cheick Diallo from https://www.basketball-reference.com/players/d/diallch01.html ...
ETA: 1031.4082517623901 seconds...
Collecting stats for Solomon Hill from https://www.basketball-reference.com/players/h/hillso01.html ...
ETA: 1059.8189687728882 seconds...
Collecting stats for Jrue Holiday from https://www.basketball-reference.com/players/h/holidjr01.html ...
ETA: 1070.5124304294586 seconds...
Collecting stats for Frank Jackson from https://www.basketball-reference.com/players/j/jacksfr01.html ...
ETA: 1009.4780044555664 seconds...
Collecting stats for Darius Miller from https://www.basketball-reference.com/players/m/milleda01.html ...
ETA: 1037.515061378479 seconds...
Collecting stats for Elfrid Payton from https://www.basketball-reference.com/players/p/paytoel01.html ...
ETA: 1080.5784029960632 seconds...
Collecting stats for Julius Randle from https://www.basketball-reference.com/players/r/randlju01.html ...
ETA: 1028.9699685

ETA: 870.4873900413513 seconds...
Collecting stats for Donte DiVincenzo from https://www.basketball-reference.com/players/d/divindo01.html ...
Donte DiVincenzo failed to meet for minutes or games played minimum! YEET!
ETA: 823.6817157268524 seconds...
Collecting stats for Ersan Ilyasova from https://www.basketball-reference.com/players/i/ilyaser01.html ...
ETA: 917.1958541870117 seconds...
Collecting stats for Brook Lopez from https://www.basketball-reference.com/players/l/lopezbr01.html ...
ETA: 865.3120589256287 seconds...
Collecting stats for Pat Connaughton from https://www.basketball-reference.com/players/c/connapa01.html ...
ETA: 845.2438144683838 seconds...
Collecting stats for Bonzie Colson from https://www.basketball-reference.com/players/c/colsobo01.html ...
Bonzie Colson failed to meet for minutes or games played minimum! YEET!
ETA: 816.2460570335388 seconds...
Collecting stats for Pau Gasol from https://www.basketball-reference.com/players/g/gasolpa01.html ...
Pau Gasol fai

Robert Williams failed to meet for minutes or games played minimum! YEET!
ETA: 668.5143070220947 seconds...
Collecting stats for Aron Baynes from https://www.basketball-reference.com/players/b/baynear01.html ...
ETA: 694.2565298080444 seconds...
Collecting stats for Marcus Smart from https://www.basketball-reference.com/players/s/smartma01.html ...
ETA: 674.0590076446533 seconds...
Collecting stats for PJ Dozier from https://www.basketball-reference.com/players/d/doziepj01.html ...
PJ Dozier failed to meet for minutes or games played minimum! YEET!
ETA: 639.393789768219 seconds...
Collecting stats for R.J. Hunter from https://www.basketball-reference.com/players/h/hunterj01.html ...
R.J. Hunter failed to meet for minutes or games played minimum! YEET!
ETA: 652.6146764755249 seconds...
Collecting stats for Jonathan Gibson from https://www.basketball-reference.com/players/g/gibsojo01.html ...
Jonathan Gibson didn't play in an NBA game in 2019! YEET!
Collecting stats for Victor Oladipo fr

Troy Caupain failed to meet for minutes or games played minimum! YEET!
ETA: 465.5567307472229 seconds...
Collecting stats for Amile Jefferson from https://www.basketball-reference.com/players/j/jeffeam01.html ...
Amile Jefferson failed to meet for minutes or games played minimum! YEET!
ETA: 462.9602138996124 seconds...
Collecting stats for Michael Carter-Williams from https://www.basketball-reference.com/players/c/cartemi01.html ...
Michael Carter-Williams failed to meet for minutes or games played minimum! YEET!
ETA: 486.8083162307739 seconds...
Collecting stats for Reggie Jackson from https://www.basketball-reference.com/players/j/jacksre01.html ...
ETA: 482.48925256729126 seconds...
Collecting stats for Jon Leuer from https://www.basketball-reference.com/players/l/leuerjo01.html ...
Jon Leuer failed to meet for minutes or games played minimum! YEET!
ETA: 488.4938359260559 seconds...
Collecting stats for Ish Smith from https://www.basketball-reference.com/players/s/smithis01.html ...

ETA: 299.0358829498291 seconds...
Collecting stats for Sam Dekker from https://www.basketball-reference.com/players/d/dekkesa01.html ...
ETA: 290.41001200675964 seconds...
Collecting stats for Bobby Portis from https://www.basketball-reference.com/players/p/portibo01.html ...
ETA: 295.1656723022461 seconds...
Collecting stats for Ian Mahinmi from https://www.basketball-reference.com/players/m/mahinia01.html ...
Ian Mahinmi failed to meet for minutes or games played minimum! YEET!
ETA: 292.3168423175812 seconds...
Collecting stats for Tomas Satoransky from https://www.basketball-reference.com/players/s/satorto01.html ...
ETA: 276.98033380508423 seconds...
Collecting stats for Bradley Beal from https://www.basketball-reference.com/players/b/bealbr01.html ...
ETA: 280.5291659832001 seconds...
Collecting stats for Devin Robinson from https://www.basketball-reference.com/players/r/robinde01.html ...
Devin Robinson failed to meet for minutes or games played minimum! YEET!
ETA: 265.1611070632

ETA: 114.10638785362244 seconds...
Collecting stats for Kevin Love from https://www.basketball-reference.com/players/l/loveke01.html ...
ETA: 111.6877555847168 seconds...
Collecting stats for Tristan Thompson from https://www.basketball-reference.com/players/t/thomptr01.html ...
ETA: 105.97687029838562 seconds...
Collecting stats for Jordan Clarkson from https://www.basketball-reference.com/players/c/clarkjo01.html ...
ETA: 100.40110969543457 seconds...
Collecting stats for Matthew Dellavedova from https://www.basketball-reference.com/players/d/dellama01.html ...
ETA: 99.18914937973022 seconds...
Collecting stats for Marquese Chriss from https://www.basketball-reference.com/players/c/chrisma01.html ...
Marquese Chriss failed to meet for minutes or games played minimum! YEET!
ETA: 92.87187051773071 seconds...
Collecting stats for J.R. Smith from https://www.basketball-reference.com/players/s/smithjr01.html ...
J.R. Smith failed to meet for minutes or games played minimum! YEET!
ETA: 97.

---
# Data Dictionary Mock-Up

In [335]:
# Build `data_dict`: stat variable name -> human-readable label, with the
# tooltip description appended when the header cell carries one.
# NOTE: this reads `soup`, i.e. whichever page was parsed most recently
# (the last player page when run right after the scraping loop).
data_dict=dict()
tables = ['all_per_game','all_advanced','all_shooting','all_pbp']
for table in tables:
    wrapper = soup.find('div',{'id':table})
    if wrapper is None:
        # Without this guard the literal string 'None' would be parsed and
        # the table silently contribute nothing.
        print(f'Wrapper div {table!r} not found in `soup`; skipping.')
        continue
    # Strip the comment opener so the markup inside the wrapper is parsed;
    # name the parser explicitly so results do not depend on what is installed.
    my_soup = BeautifulSoup(str(wrapper).replace('<!--',''), 'lxml')
    for th in my_soup.find_all('th',{'class':'poptip'}):
        var_name = th['data-stat']
        stat_name = th['aria-label']
        stat_desc = th.get('data-tip')
        if stat_desc is None:
            data_dict[var_name]=stat_name
        else:
            data_dict[var_name]=stat_name + ' : ' + stat_desc

In [340]:
# Show every entry of the data dictionary: the variable name on one line,
# its description on the next, then a dashed rule and a blank line.
rule = '-' * 60 + '\n'
for var_name, description in data_dict.items():
    print(var_name, description, sep='\n')
    print(rule)

season
If listed as single number, the year the season ended.★ - Indicates All-Star for league.Only on regular season tables. : If listed as single number, the year the season ended.<br>★ - Indicates All-Star for league.<br>Only on regular season tables.
------------------------------------------------------------

age
Age of Player at the start of February 1st of that season. : Age of Player at the start of February 1st of that season.
------------------------------------------------------------

team_id
Team : Team
------------------------------------------------------------

lg_id
League : League
------------------------------------------------------------

pos
Position : Position
------------------------------------------------------------

g
Games : Games
------------------------------------------------------------

gs
Games Started : Games Started
------------------------------------------------------------

mp_per_g
Minutes Played Per Game : Minutes Played Per Game
-------------

---
# Exporting Scraped Player Data

In [4]:
# One row per player: the scraped dict-of-dicts has players as outer keys,
# so the frame is built column-wise and then transposed.
player_db = pd.DataFrame(players).transpose()

# Full, unfiltered export first.
player_db.to_csv('../data/player_raw.csv', index=False)

# Then remove every player recorded in `failed_to_qualify` and export the
# filtered table. Rebinding the name (rather than dropping in place) leaves
# `player_db` in the same final state.
player_db = player_db.drop(index=failed_to_qualify)
player_db.to_csv('../data/player_db.csv', index=False)