## Mapeamento do Google Drive

In [None]:
# Mount Google Drive at /content/aqui; the DATA_DIR path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/aqui')

Mounted at /content/aqui


## Instalação de pacotes

In [None]:
!pip install playwright --upgrade



In [None]:
# Playwright CLI step; presumably installs the OS-level libraries the browsers need (confirm against the Playwright CLI docs).
!playwright install-deps

In [None]:
# Playwright CLI step; presumably downloads the browser binaries (get_html launches Firefox) - confirm against the Playwright CLI docs.
!playwright install

## Bibliotecas

In [None]:
import asyncio
import os
import re
import time
from io import StringIO

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [None]:
# Season identifiers to scrape: 2018 through 2024 inclusive (range's upper bound is exclusive).
SEASONS = [*range(2018, 2025)]
SEASONS

[2018, 2019, 2020, 2021, 2022, 2023, 2024]

In [None]:
# Root folder on the mounted Drive, plus one sub-folder per kind of downloaded page.
DATA_DIR = '/content/aqui/MyDrive/WebMining_Scraping_DE05/AULA2'
CLASSIFICACAO_DIR, PONTUACAO_DIR = (
  os.path.join(DATA_DIR, subdir) for subdir in ('classificacao', 'pontuacao')
)

In [None]:
# Create the output folders from Python rather than through an unquoted shell interpolation,
# so paths containing spaces or shell metacharacters are handled correctly.
# exist_ok=True keeps the cell idempotent, like `mkdir -p`.
os.makedirs(CLASSIFICACAO_DIR, exist_ok=True)
os.makedirs(PONTUACAO_DIR, exist_ok=True)

## ASYNC

In [None]:
# Test: minimal coroutine used to try out async/await.
async def soma(x,y):
  """Coroutine that returns the result of ``x + y``."""
  total = x + y
  return total

In [None]:
# Called without `await`, so the cell displays a coroutine object instead of the sum.
soma(12, 15)

<coroutine object soma at 0x7ed39bc0d620>

## CAPTURA HTML

In [None]:
async def get_html(url, selector, sleep=5, retries=3):
  """Load ``url`` in a headless Firefox and return the inner HTML of ``selector``.

  Args:
    url: Page address to open.
    selector: CSS selector of the element whose inner HTML is returned.
    sleep: Base back-off in seconds; after the i-th timed-out attempt the
      coroutine waits ``sleep * i`` seconds before trying again.
    retries: Maximum number of attempts.

  Returns:
    The inner HTML string, or None when every attempt timed out (or when
    ``retries`` is smaller than 1).
  """
  html = None
  for attempt in range(1, retries + 1):
    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch() #chromium
        try:
          page = await browser.new_page()
          await page.goto(url)
          print(await page.title())
          html = await page.inner_html(selector)
        finally:
          # Close the browser explicitly instead of relying on the context manager teardown.
          await browser.close()
    except PlaywrightTimeout:
      print(f'Timeout em {url}')
      # Back off before the next attempt; asyncio.sleep keeps the event loop free
      # (time.sleep would block it). No wait after the final attempt.
      if attempt < retries:
        await asyncio.sleep(sleep * attempt)
    else:
      break

  return html

In [None]:
# Single-season example: build the schedule-page URL for one season to try get_html on.
season = 2024

url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'

In [None]:
# Display the URL built for the chosen season.
url

'https://www.basketball-reference.com/leagues/NBA_2024_games.html'

In [None]:
# Fetch the inner HTML of the element matched by '#content .filter' on the season schedule page.
html = await get_html(url, '#content .filter')

2023-24 NBA Schedule | Basketball-Reference.com


In [None]:
# Inspect the fetched fragment.
html

'\n\n<div class="">\n\t<a href="/leagues/NBA_2024_games-october.html">October</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-november.html">November</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-december.html">December</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-january.html">January</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-february.html">February</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-march.html">March</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-april.html">April</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-may.html">May</a>\n</div><div class="">\n\t<a href="/leagues/NBA_2024_games-june.html">June</a>\n</div>'

## CAPTURA TEMPORADAS

In [None]:
async def scrape_season(season):
  """Download every schedule page linked from one season's page into CLASSIFICACAO_DIR.

  Args:
    season: Value interpolated into the ``NBA_{season}_games.html`` URL.

  Pages whose target file already exists are skipped. A page for which
  get_html returns nothing is skipped as well, so no empty file is left
  behind to be mistaken for a finished download on the next run.
  """
  url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'
  html = await get_html(url, '#content .filter')
  if not html:
    # get_html returns None when all of its attempts timed out.
    print(f'Sem HTML para a temporada {season}; pulando.')
    return

  soup = BeautifulSoup(html)
  # href=True ignores anchors without an href attribute.
  hrefs = [a['href'] for a in soup.find_all('a', href=True)]
  standings_pages = [f'https://www.basketball-reference.com{h}' for h in hrefs]

  for page_url in standings_pages:
    save_path = os.path.join(CLASSIFICACAO_DIR, page_url.split('/')[-1])
    if os.path.exists(save_path):
      continue

    page_html = await get_html(page_url, '#all_schedule')
    if not page_html:
      # Same guard scrape_game uses: do not create the file unless there is content.
      continue

    with open(save_path, 'w+') as f:
      f.write(page_html)

In [None]:
# Scrape the seasons one after another; note that this loop rebinds the notebook-level `season` variable.
for season in SEASONS:
  await scrape_season(season)

2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2022-23 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com


In [None]:
# Keep only the saved .html pages and sort them: os.listdir returns entries in
# arbitrary order and may include non-page entries that scrape_game cannot open as HTML.
classificacao_files = sorted(
  name for name in os.listdir(CLASSIFICACAO_DIR) if name.endswith('.html')
)
classificacao_files

['NBA_2024_games-october.html',
 'NBA_2024_games-november.html',
 'NBA_2024_games-december.html',
 'NBA_2024_games-january.html',
 'NBA_2024_games-february.html',
 'NBA_2024_games-march.html',
 'NBA_2024_games-april.html',
 'NBA_2018_games-october.html',
 'NBA_2018_games-november.html',
 'NBA_2018_games-december.html',
 'NBA_2018_games-january.html',
 'NBA_2018_games-february.html',
 'NBA_2018_games-march.html',
 'NBA_2018_games-april.html',
 'NBA_2018_games-may.html',
 'NBA_2018_games-june.html',
 'NBA_2019_games-october.html',
 'NBA_2019_games-november.html',
 'NBA_2019_games-december.html',
 'NBA_2019_games-january.html',
 'NBA_2019_games-february.html',
 'NBA_2019_games-march.html',
 'NBA_2019_games-april.html',
 'NBA_2019_games-may.html',
 'NBA_2019_games-june.html',
 'NBA_2020_games-october-2019.html',
 'NBA_2020_games-november.html',
 'NBA_2020_games-december.html',
 'NBA_2020_games-january.html',
 'NBA_2020_games-february.html',
 'NBA_2020_games-march.html',
 'NBA_2020_games-ju

## CAPTURA PARTIDAS

In [None]:
async def scrape_game(classif_arq):
  """Download every box-score page linked from one saved schedule file.

  Args:
    classif_arq: Path of a schedule HTML file previously written to disk.

  Each link whose href matches ``/boxscores/...html`` is fetched with
  get_html and stored in PONTUACAO_DIR under the link's file name. Files that
  already exist are skipped, as are pages for which get_html returns nothing.
  """
  with open(classif_arq, 'r') as f:
    html = f.read()

  soup = BeautifulSoup(html)

  boxscores = []
  for link in soup.find_all('a', href=True):
    # Raw string avoids the invalid '\/' escape; matching on the href alone keeps
    # the greedy '.*' from running into other attributes or the link text.
    match = re.search(r'/boxscores/.*\.html', link['href'])
    if match:
      boxscores.append(match.group())

  boxscores = [f'https://www.basketball-reference.com{path}' for path in boxscores]

  for url in boxscores:
    save_path = os.path.join(PONTUACAO_DIR, url.split('/')[-1])
    if os.path.exists(save_path):
      continue

    html = await get_html(url, '#content')
    if not html:
      continue

    with open(save_path, 'w+') as f:
      f.write(html)

In [None]:
# For each saved schedule page, download the box scores it links to, one file at a time.
for c in classificacao_files:
  c_file = os.path.join(CLASSIFICACAO_DIR, c)

  await scrape_game(c_file)

76ers vs Trail Blazers, February 11, 2021 | Basketball-Reference.com
Timberwolves vs Hornets, February 12, 2021 | Basketball-Reference.com
Knicks vs Wizards, February 12, 2021 | Basketball-Reference.com
Spurs vs Hawks, February 12, 2021 | Basketball-Reference.com
Pelicans vs Mavericks, February 12, 2021 | Basketball-Reference.com
Pistons vs Celtics, February 12, 2021 | Basketball-Reference.com
Clippers vs Bulls, February 12, 2021 | Basketball-Reference.com
Thunder vs Nuggets, February 12, 2021 | Basketball-Reference.com
Bucks vs Jazz, February 12, 2021 | Basketball-Reference.com


CancelledError: 

In [None]:
# List what has been downloaded so far; os.listdir raises FileNotFoundError if PONTUACAO_DIR does not exist.
pontuacao_files = os.listdir(PONTUACAO_DIR)
pontuacao_files

FileNotFoundError: [Errno 2] No such file or directory: '/content/aqui/MyDrive/WebMining_Scraping_DE05/AULA2/pontuacao'

## Scraping Pontuação

In [None]:
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Full paths of the saved box-score pages. Sorted because os.listdir order is
# arbitrary and the first file processed later decides the reference column set;
# filtered so only .html files reach parse_html.
boxscores = sorted(
  os.path.join(PONTUACAO_DIR, name)
  for name in os.listdir(PONTUACAO_DIR)
  if name.endswith('.html')
)
# Show a sample only; the full list has thousands of entries.
boxscores[:5]

['/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001020LAC.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030BOS.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030ORL.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030WAS.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030HOU.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030PHO.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001030LAL.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001040LAC.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001040BRK.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001040ORL.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001040ATL.html',
 '/content/aqui/MyDrive/WebMining_Scraping_DE04/AULA2/pontuacao/202001040CLE

In [None]:
def parse_html(box):
  """Parse the HTML file at ``box`` and return the soup without header rows.

  Rows matching ``tr.over_header`` and ``tr.thead`` are removed from the tree
  before the soup is returned.
  """
  with open(box) as fh:
    soup = BeautifulSoup(fh.read())

  for selector in ('tr.over_header', 'tr.thead'):
    for row in soup.select(selector):
      row.decompose()

  return soup

In [None]:
def read_linha_pontuacao(soup):
  """Extract the ``line_score`` table as a two-column frame.

  Args:
    soup: Parsed page; it is converted with ``str()`` before being handed to pandas.

  Returns:
    DataFrame with the table's first column renamed to ``time`` and its last
    column renamed to ``total``; every other column is dropped.
  """
  # Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated in recent pandas.
  df_pont = pd.read_html(StringIO(str(soup)), attrs={'id':'line_score'})[0]
  cols = list(df_pont.columns)
  cols[0] = 'time'
  cols[-1] = 'total'
  df_pont.columns = cols
  return df_pont[['time','total']]

In [None]:
def read_estatisticas(soup, team, estat):
  """Read the table with id ``box-{team}-game-{estat}`` as a numeric frame.

  Args:
    soup: Parsed page; it is converted with ``str()`` before being handed to pandas.
    team: Team identifier interpolated into the table id.
    estat: Table kind interpolated into the table id (the notebook passes
      'basic' and 'advanced').

  Returns:
    DataFrame indexed by the table's first column, with every cell coerced to
    a number; cells that cannot be converted become NaN.
  """
  # Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated in recent pandas.
  df_estat = pd.read_html(
    StringIO(str(soup)),
    attrs={'id': f'box-{team}-game-{estat}'},
    index_col=0,
  )[0]
  df_estat = df_estat.apply(pd.to_numeric, errors='coerce')

  return df_estat

In [None]:
def read_season_info(soup):
  """Return the text after the last comma of the page's first <h1>, with spaces removed.

  NOTE(review): if the heading ends with the game date, this is the calendar
  year of the game rather than an NBA season label - confirm against the pages.
  """
  heading = soup.find('h1').text
  # rpartition yields the whole heading as the tail when there is no comma.
  _, _, tail = heading.rpartition(',')
  return tail.replace(' ', '')

In [None]:
# Build one two-row frame per box score (one row per team, with the opponent's
# values appended under an `_opp` suffix) and collect the frames in `partidas`.
base_cols = None  # reference column list, fixed by the first team processed
partidas = []

for boxscore in boxscores:
  soup_resp = parse_html(boxscore)
  linha_pontos = read_linha_pontuacao(soup_resp)
  times = list(linha_pontos['time'])

  sumarios = []
  # The loop variable is `team`, not `time`: naming it `time` rebinds the
  # imported `time` module for the rest of the kernel session.
  for team in times:
    basic = read_estatisticas(soup_resp, team, 'basic')
    advanced = read_estatisticas(soup_resp, team, 'advanced')

    # Last row of each table (presumably the team totals row - confirm).
    totais = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
    totais.index = totais.index.str.lower() + '_tot'

    # Column-wise maximum over every row except the last one.
    maximos = pd.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
    maximos.index = maximos.index.str.lower() + '_max'

    sumario_time = pd.concat([totais, maximos])

    if base_cols is None:
      base_cols = list(sumario_time.index.drop_duplicates(keep='first'))
      base_cols = [b for b in base_cols if 'bpm' not in b]

    # Restrict every summary to the reference columns so the frames line up.
    sumario_time = sumario_time[base_cols]

    sumarios.append(sumario_time)

  sumario = pd.concat(sumarios, axis=1).T

  partida = pd.concat([sumario, linha_pontos], axis=1)

  # Assumes exactly two rows, with the second one being the home team - TODO confirm.
  partida['home'] = [0, 1]
  # Same rows in reverse order, so each team's row is paired with its opponent's.
  partida_opp = partida.iloc[::-1].reset_index(drop=True)
  partida_opp.columns += '_opp'

  partida_completa = pd.concat([partida, partida_opp], axis=1)

  partida_completa['season'] = read_season_info(soup_resp)

  # The first 8 characters of the file name are parsed as a YYYYMMDD date.
  partida_completa['data'] = os.path.basename(boxscore)[:8]
  partida_completa['data'] = pd.to_datetime(partida_completa['data'], format='%Y%m%d')

  partida_completa['ganhador'] = partida_completa['total'] > partida_completa['total_opp']

  partidas.append(partida_completa)

  if len(partidas) % 10 == 0:
    print(f'{len(partidas)} / {len(boxscores)}')

ERROR:asyncio:Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


10 / 4828
20 / 4828
30 / 4828
40 / 4828
50 / 4828
60 / 4828
70 / 4828
80 / 4828
90 / 4828
100 / 4828
110 / 4828
120 / 4828
130 / 4828
140 / 4828
150 / 4828


KeyboardInterrupt: 

In [None]:
# Stack the per-game frames into one table; ignore_index=True replaces the repeated per-game row labels with a fresh index.
partidas_df = pd.concat(partidas, ignore_index=True)

In [None]:
# Display the combined table.
partidas_df

In [None]:
# Sanity check: list the column count of every per-game frame whose width is not 149.
[width for width in (jogo.shape[1] for jogo in partidas) if width != 149]

In [None]:
# Persist the combined table under DATA_DIR. The index is written too (pandas default), so the CSV starts with an unnamed index column.
partidas_df.to_csv(os.path.join(DATA_DIR, 'nba_estats.csv'))