In [2]:
# NBA Stats scraping
#
# R. Andrew Fowler 
# Last update: March 14, 2016

from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np
import time
import re

####################################################

# SCRAPING FUNCTIONS

# Get the website text, with a built-in retry in case
# the connection fails.  Returns a plain string instead
# of a soup once the designated maximum number of
# tries has been used up.

def make_soup(url, maxTries=20):
    """Download ``url`` and parse it with BeautifulSoup, retrying on failure.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    maxTries : int, optional
        Number of failed attempts tolerated before giving up (default 20).
        Values below 1 are treated as a single attempt.

    Returns
    -------
    BeautifulSoup or str
        The parsed page on success.  If every attempt fails, the string
        'The soup is ruined!!' is returned instead, so callers should
        check the type before calling soup methods on the result.
    """
    tries = 0

    while True:
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(url) as response:
                html = response.read()
            # Return straight away: a success must never be counted as a
            # try, otherwise a fetch that works on the last permitted
            # attempt would be replaced by the failure string.
            return BeautifulSoup(html, 'html.parser')

        except Exception:
            # `Exception` (not a bare `except`) so that a kernel interrupt
            # can still stop the retry loop.
            tries += 1
            if tries >= maxTries:
                return 'The soup is ruined!!'
            # Short pause before the next attempt.
            time.sleep(0.5)

####################################################
#
# FUNCTION TESTING
#
# This area is a scratch pad for testing functions 
# after they have been written or changed.

# Smoke test: run make_soup once against the mobile teams page.
g = 'http://m.bkref.com/m?p=XXteamsXX'
soupySoup = make_soup(g)

# Only a fixed message is printed; the soup itself is not displayed.
print("Dear god I can't read this horrible horse hockey!")


Dear god I can't read this horrible horse hockey!


In [2]:
# Build parallel lists of team names and abbreviations

# The mobile site is used because its markup is simpler
teamUrl = 'http://m.bkref.com/m?p=XXteamsXX'
teamSoup = make_soup(teamUrl)

# The first <ul> on the page holds the team entries
teams = teamSoup.find('ul')

# Only teams that are still active carry a hyperlink
activeTeams = teams.find_all('a')

# The name is the link text; the abbreviation is characters 13-15 of the href
teamNames = [anchor.get_text() for anchor in activeTeams]
teamAbrev = [anchor['href'][13:16] for anchor in activeTeams]

# Collect both columns in a single dataframe
teamFrame = pd.DataFrame(dict(Name=teamNames, Abrv=teamAbrev))

print(teamFrame)

   Abrv                    Name
0   ATL           Atlanta Hawks
1   BOS          Boston Celtics
2   NJN           Brooklyn Nets
3   CHA       Charlotte Hornets
4   CHI           Chicago Bulls
5   CLE     Cleveland Cavaliers
6   DAL        Dallas Mavericks
7   DEN          Denver Nuggets
8   DET         Detroit Pistons
9   GSW   Golden State Warriors
10  HOU         Houston Rockets
11  IND          Indiana Pacers
12  LAC    Los Angeles Clippers
13  LAL      Los Angeles Lakers
14  MEM       Memphis Grizzlies
15  MIA              Miami Heat
16  MIL         Milwaukee Bucks
17  MIN  Minnesota Timberwolves
18  NOH    New Orleans Pelicans
19  NYK         New York Knicks
20  OKC   Oklahoma City Thunder
21  ORL           Orlando Magic
22  PHI      Philadelphia 76ers
23  PHO            Phoenix Suns
24  POR  Portland Trail Blazers
25  SAC        Sacramento Kings
26  SAS       San Antonio Spurs
27  TOR         Toronto Raptors
28  UTA               Utah Jazz
29  WAS      Washington Wizards


In [3]:
# Get a link list for games:
cavsUrl ='http://www.basketball-reference.com/teams/CLE/2015_games.html'
cavSoup = make_soup(cavsUrl)

# Note that this will get all regular season and post season games
# for the cavs, this was 102 games in 2015
gameTable = cavSoup.find_all('tr')

linkList = list()

for game in gameTable:
    currGame = game.find_all('td')

    # Divider lines have too few cells.  The box-score link is read from
    # the fifth cell, so require at least five before indexing.
    if len(currGame) > 4:
        # Boxscore link
        boxAnchor = currGame[4].find('a')
        if boxAnchor is None:
            # No box-score link in this row, so there is nothing to collect
            continue
        currLink = boxAnchor['href']

        # Redirect the box-score link to the play-by-play page by inserting
        # a 'pbp' directory in front of the file name, e.g.
        # '/boxscores/X.html' -> '/boxscores/pbp/X.html'.  (The previous
        # slice `currLink[10:0]` was always empty, so the game id was lost.)
        boxDir, slash, boxFile = currLink.rpartition('/')
        pbpLink = boxDir + '/pbp/' + boxFile
        linkList.append(pbpLink)

print(len(linkList))

102


In [4]:
# Play by play analysis:
#
# Take a link from the link list and stick it
# on the end of the basketball-reference.com address
# (a single game is hard-coded here).

pbpUrl = 'http://www.basketball-reference.com/boxscores/pbp/201410300CLE.html'
pbpSoup = make_soup(pbpUrl).find_all('td',{'class':'align_right'})

# Regex which finds game clock times such as '12:00.0': optional digits,
# a colon, two digits, a dot and one more digit.  Written as a raw string
# so the backslashes are not treated as (invalid) string escapes.
alwaysTime = re.compile(r'\d*:\d{2}\.\d')

# Text of every right-aligned cell; not all of them are clock times
maybeTime = [cell.get_text() for cell in pbpSoup]

# Keep only the cell texts that contain a clock time
standardTime = [text for text in maybeTime if alwaysTime.search(text)]

print(len(standardTime))


410


In [7]:
# Fetch and parse one play-by-play page
pbpUrl = 'http://www.basketball-reference.com/boxscores/pbp/201410300CLE.html'
pbpSoup = make_soup(pbpUrl)

# Collect every <table> matching the class filter below
pbpTable = pbpSoup.find_all('table', attrs={'class': 'no_highlight stats_table'})

print(pbpTable)


[<table class="no_highlight stats_table">
<tr id="q1">
<th colspan="6">1st Quarter</th></tr></table>]


In [6]:
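# Loads the data from a saved CSV (which version of the file is still
# unconfirmed).  The load is commented out, so `fullDat` must already be in
# the kernel for the cells below to run.  NOTE: DataFrame.from_csv is
# deprecated in pandas 0.21+; its documented replacement is
# pd.read_csv(path, index_col=0, parse_dates=True).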

# fullDat = pd.DataFrame.from_csv('PBPComplete_20012015.csv')

In [3]:
# Show the row index of fullDat.
# NOTE(review): fullDat is not defined by any live code in view - confirm
# it is loaded before this cell runs.
print(fullDat.index)

DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', '2001-01-01',
               '2001-01-01', '2001-01-01', '2001-01-01', '2001-01-01',
               '2001-01-01', '2001-01-01',
               ...
               '2015-01-01', '2015-01-01', '2015-01-01', '2015-01-01',
               '2015-01-01', '2015-01-01', '2015-01-01', '2015-01-01',
               '2015-01-01', '2015-01-01'],
              dtype='datetime64[ns]', name='year', length=8466106, freq=None)


In [4]:
# List the column labels of fullDat.
print(fullDat.columns)

Index(['game_id', 'away_team', 'home_team', 'quarter',
       'time_remaining_quarter', 'time_remaining_game',
       'time_remaining_in_seconds', 'score', 'away_score', 'home_score',
       'total_differential', 'score_change', 'timeout', 'timeout_called',
       'basket_made', 'basket_missed', 'rebound', 'assist', 'turnover', 'foul',
       'distance_from_basket', 'event'],
      dtype='object')


In [5]:
# Count the entries in fullDat's index (one per row).
print(len(fullDat.index))

8466106


In [7]:
# Copy every entry of fullDat's index into a plain Python tuple.
indexTup = tuple(fullDat.index)

# Report how many entries the tuple holds.
print(len(indexTup))

8466106


In [None]:
g = pd.DataFrame.