# NBA Data Scrape

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import csv
import logging
import re
import time
import datetime
from datetime import date
from dateutil.parser import parse

In [2]:
# Open the output CSV and wrap it in a csv.writer.
# NOTE: the handle stays open so later cells can keep writing rows through
# `nbawriter`; nothing in this cell closes it, so call nbafile.close() once
# the scrape has finished.
nbafile = open(
    "NBA_data/NBA_Season_avgs_2015.csv",
    mode="w",
    newline='',
    encoding='utf-8',
)
nbawriter = csv.writer(nbafile)

# Header row, grouped by the kind of column.
nbawriter.writerow([
    # identification and record
    'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf',
    # minutes and field goals
    'MP', 'FG', 'FGA', 'FG%',
    # three-pointers
    '3P', '3PA', '3P%',
    # two-pointers
    '2P', '2PA', '2P%',
    # free throws
    'FT', 'FTA', 'FT%',
    # rebounds and remaining box-score stats
    'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
])

103

## Select Start and End Year

In [3]:
# First and last season to scrape. Both ends are inclusive, and the year is
# inserted into the league page URL (".../leagues/NBA_<year>.html").
start_year, end_year = 2015, 2015

In [4]:
# Create model to scrape data

class NBAscrapy:
    """Scrape per-game team averages from basketball-reference.com into a CSV.

    Instantiating the class starts the scrape immediately: ``__init__`` calls
    ``main()``, which walks the league pages from the module-level
    ``start_year`` through ``end_year`` (inclusive) and writes one row per
    team and season through the module-level ``nbawriter``.
    """

    # Number of league pages requested so far; shown in the progress banner.
    runcount = 1
    # Value written to the RK column; restarts at 1 for every season page.
    rk = 1
    # Season currently being scraped; main() resets it to start_year.
    year = 2015

    # Common prefix of every league page URL; the season year and ".html"
    # are appended to it.
    _LEAGUE_URL = "https://www.basketball-reference.com/leagues/NBA_"

    # data-stat keys read from the 'Team Per Game Stats' table, in the order
    # of the CSV columns that follow 'Conf'. 'trb' is intentionally absent:
    # the CSV header has no TRB column.
    _STAT_KEYS = (
        'mp', 'fg', 'fga', 'fg_pct',
        'fg3', 'fg3a', 'fg3_pct',
        'fg2', 'fg2a', 'fg2_pct',
        'ft', 'fta', 'ft_pct',
        'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
    )

    # Separator line printed after every row and after every season.
    _SEPARATOR = "--------------------------------------------------------"

    # __init__ required to initialize the scrape tool
    def __init__(self):
        self.main()

    def main(self):
        """Scrape every season from ``start_year`` to ``end_year``."""
        # Keep the Year column in step with the URL: without this the class
        # default (2015) would be written even when start_year differs.
        self.year = start_year
        self.getyearteamdata(self._season_url(start_year))
        # Push buffered rows to disk so the CSV is complete even though the
        # file handle is left open for further writes.
        nbafile.flush()

    def getyearteamdata(self, url):
        """Scrape the page at *url*, then every following season.

        Args:
            url: league page for the season currently held in ``self.year``.

        After the first page, ``self.year`` is advanced one season at a time
        until it reaches ``end_year``; a row is written to ``nbawriter`` for
        every team found on each page.
        """
        while True:
            self._scrape_season(url)

            print(self._SEPARATOR)
            print(" ")

            # move to next page to scrape if under year boundaries
            if self.year >= end_year:
                break
            self.runcount += 1
            self.year += 1
            url = self._season_url(self.year)

    def check_E_W(self, E_W_teams, teamname):
        """Return the standings entry for *teamname*, or ``"error"``.

        Args:
            E_W_teams: list of dicts with the keys 'index', 'teamname',
                'W' and 'L'.
            teamname: team name as shown in the stats table; any ``*``
                characters in it are removed before comparing.

        Returns:
            The first entry whose 'teamname' equals the cleaned name, or the
            string ``"error"`` when no entry matches.
        """
        wanted = teamname.replace("*", "")
        for entry in E_W_teams:
            if entry['teamname'] == wanted:
                return entry
        return "error"

    @classmethod
    def _season_url(cls, season):
        """Build the league page URL for *season*."""
        return cls._LEAGUE_URL + str(season) + ".html"

    @staticmethod
    def _strip_comment_markers(html):
        """Remove HTML comment delimiters so commented-out markup is parsed."""
        return html.replace("<!--", "").replace("-->", "")

    @staticmethod
    def _cell(row, stat):
        """Return the stripped text of the ``<td>`` whose data-stat is *stat*."""
        return row.find('td', {'data-stat': stat}).text.strip()

    def _conference_records(self, soup, table_id, conference):
        """Collect wins and losses from one 'Conference Standings' table.

        Returns a list of dicts ('index', 'teamname', 'W', 'L'); the list is
        empty when the table is not present on the page.
        """
        table = soup.find('table', {'id': table_id})
        if table is None:
            return []
        return [
            {
                'index': conference,  # Index the data by conference
                'teamname': row.find('a').text.strip(),
                'W': self._cell(row, 'wins'),
                'L': self._cell(row, 'losses'),
            }
            for row in table.findAll('tr', {'class': 'full_table'})
        ]

    def _scrape_season(self, url):
        """Fetch one league page and write a CSV row for each team on it."""
        # rank will reset to 1 once we move to a new year
        self.rk = 1
        print("--------------year: " + str(self.year) + " |" + " row: " + str(self.runcount) + "-----------------")

        # A timeout keeps a stalled connection from hanging the run forever.
        page = requests.get(url, timeout=30)
        if not page.ok:
            # An error page contains none of the tables, so say so instead of
            # silently producing no rows for this season.
            print("request for " + url + " failed with HTTP status "
                  + str(page.status_code) + "; no rows written for "
                  + str(self.year))
            return

        # Decode the body as UTF-8 text. str(page.content) would yield the
        # "b'...'" repr of the bytes, with newlines and non-ASCII characters
        # left as backslash escapes.
        page.encoding = "UTF-8"
        # NOTE(review): the original code strips the comment markers before
        # parsing, presumably because some tables are delivered inside HTML
        # comments - confirm against the live page. Newlines are left alone;
        # every cell value is strip()ped when it is read.
        soup = BeautifulSoup(self._strip_comment_markers(page.text), 'html.parser')

        # Wins, losses and conference come from the 'Conference Standings'
        # tables (East first, then West).
        standings = (
            self._conference_records(soup, 'divs_standings_E', 'East')
            + self._conference_records(soup, 'divs_standings_W', 'West')
        )

        # Everything else comes from the 'Team Per Game Stats' table.
        stats_table = soup.find('table', {'id': 'per_game-team'})
        if stats_table is None:
            return

        for row in stats_table.findAll('tr'):
            # Only body rows carry a <th scope="row"> cell.
            if row.find('th', {'scope': 'row'}) is None:
                continue
            team = self._cell(row, 'team')
            if team == "League Average":
                continue

            # Confirm the Win, Loss, and Conference. They are blanked for
            # every row so an unmatched team never inherits the values of
            # the team written before it.
            wins = losses = conference = ""
            record = self.check_E_W(standings, team)
            if record != "error":
                wins = record['W']
                losses = record['L']
                conference = record['index']

            sheet = [self.rk, team, self.year, self._cell(row, 'g'),
                     wins, losses, conference]
            sheet.extend(self._cell(row, stat) for stat in self._STAT_KEYS)
            self.rk += 1

            print(sheet)
            nbawriter.writerow(sheet)
            print(self._SEPARATOR)
            print(" ")
# Instantiating the class runs the whole scrape: __init__ calls main().
NBAscrapy()

--------------year: 2015 | row: 1-----------------
[1, 'Golden State Warriors*', 2015, '82', '67', '15', 'West', '240.6', '41.6', '87.0', '.478', '10.8', '27.0', '.398', '30.8', '60.0', '.514', '16.0', '20.8', '.768', '10.4', '34.3', '27.4', '9.3', '6.0', '14.5', '19.9', '110.0']
--------------------------------------------------------
 
[2, 'Los Angeles Clippers*', 2015, '82', '56', '26', 'West', '240.6', '39.4', '83.3', '.473', '10.1', '26.9', '.376', '29.3', '56.4', '.519', '17.9', '25.2', '.710', '9.6', '33.1', '24.8', '7.8', '5.0', '12.3', '21.3', '106.7']
--------------------------------------------------------
 
[3, 'Dallas Mavericks*', 2015, '82', '50', '32', 'West', '242.4', '39.7', '85.8', '.463', '8.9', '25.4', '.352', '30.8', '60.4', '.509', '16.9', '22.5', '.752', '10.5', '31.8', '22.5', '8.1', '4.5', '13.0', '20.0', '105.2']
--------------------------------------------------------
 
[4, 'Oklahoma City Thunder', 2015, '82', '45', '37', 'West', '241.8', '38.8', '86.8', '.44

<__main__.NBAscrapy at 0x237ff697670>