In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import json


def cbf_scrap(championship, ano):
    """Scrape the match list of one Campeonato Brasileiro season from cbf.com.br.

    Args:
        championship: URL suffix of the division, appended to the base
            address after a dash (the notebook uses "serie-a" and "serie-b").
        ano: Season year; becomes the last path segment of the URL.

    Returns:
        A list with one dict per match found on the page, holding the keys
        "dtt" (kick-off as "%Y-%m-%dT%H:%M:%SZ"), "home", "away",
        "hscore", "ascore", "stadium" and "state".

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
        IndexError / ValueError: if the page layout does not provide the
            expected team, score, date or location for every match.
    """
    cbf = "https://www.cbf.com.br/futebol-brasileiro/competicoes/campeonato-brasileiro"

    # A timeout keeps a stalled connection from hanging the kernel forever,
    # and raise_for_status() stops an error page from being parsed as if it
    # were an (empty) season.
    r = requests.get(f"{cbf}-{championship}/{ano}", timeout=30)
    r.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits GuessedAtParserWarning.
    soup = BeautifulSoup(r.text, "html.parser")

    teams = soup.find_all("span", {"class": "time-sigla"})
    scores = soup.find_all("span", {"class": "bg-blue color-white label-2"})
    datetimes = soup.find_all("span", {"class": "partida-desc text-1 color-lightgray p-b-15 block uppercase text-center"})
    locations = soup.find_all("span", {"class": "partida-desc text-1 color-lightgray block uppercase text-center"})

    # NOTE(review): the four lists are matched purely by position (two team
    # spans per match, one score/date/location span per match). This assumes
    # every listed match has all of them -- confirm for unplayed fixtures.
    games = []
    for i, dtt in enumerate(datetimes):
        # The 2nd and 3rd whitespace-separated tokens are the date and the time.
        # Adding 3 hours presumably converts Brasilia time (UTC-3) to UTC,
        # since the result is formatted with a "Z" suffix -- TODO confirm.
        dtt_utf0 = datetime.strptime("T".join(dtt.text.split()[1:3]), "%d/%m/%YT%H:%M") + timedelta(hours=3)
        # Tokens 0 and 2 are the two scores; token 1 is the separator between them.
        hscore, ascore = scores[i].text.split()[0::2]
        # First two " - "-separated fields of the location text. The second
        # one is stored under "state", although the recorded output shows
        # city names there.
        stadium, state = locations[i].text.split(" - ")[:2]
        game_info = {
            "dtt": dtt_utf0.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "home": teams[i*2].text,
            "away": teams[i*2 + 1].text,
            "hscore": int(hscore),
            "ascore": int(ascore),
            "stadium": stadium.strip(),
            "state": state.strip()
        }
        games.append(game_info)
    return games

In [2]:
# Divisions and seasons to download; range(2020, 2023) covers 2020, 2021 and 2022.
championships = ["serie-a", "serie-b"]
years = range(2020, 2023)

# Nested mapping: division -> season year -> list of match dicts.
data = {}
for championship in championships:
    # Register the division first so seasons already fetched remain
    # reachable through `data` even if a later request fails.
    by_year = data[championship] = {}
    for year in years:
        by_year[year] = cbf_scrap(championship, year)

data

{'serie-a': {2020: [{'dtt': '2020-08-08T22:00:00Z',
    'home': 'FOR',
    'away': 'ATH',
    'hscore': 2,
    'ascore': 1,
    'stadium': 'Arena Castelão',
    'state': 'Fortaleza'},
   {'dtt': '2020-08-08T22:30:00Z',
    'home': 'COR',
    'away': 'INT',
    'hscore': 0,
    'ascore': 0,
    'stadium': 'Couto Pereira',
    'state': 'Curitiba'},
   {'dtt': '2020-08-09T00:00:00Z',
    'home': 'SPO',
    'away': 'CEA',
    'hscore': 2,
    'ascore': 0,
    'stadium': 'Ilha do Retiro',
    'state': 'Recife'},
   {'dtt': '2020-08-09T19:00:00Z',
    'home': 'FLA',
    'away': 'ATL',
    'hscore': 2,
    'ascore': 1,
    'stadium': 'Maracanã',
    'state': 'Rio de Janeiro'},
   {'dtt': '2020-08-09T19:00:00Z',
    'home': 'SAN',
    'away': 'RED',
    'hscore': 2,
    'ascore': 0,
    'stadium': 'Vila Belmiro',
    'state': 'Santos'},
   {'dtt': '2020-08-09T22:00:00Z',
    'home': 'GRE',
    'away': 'FLU',
    'hscore': 1,
    'ascore': 0,
    'stadium': 'Arena do Grêmio',
    'state': 'Port

In [3]:
# Persist the scraped matches as JSON. The integer year keys are written as
# strings, because JSON object keys must be strings.
with open("cbf_data.json", "w+") as json_file:
    serialized = json.dumps(data)
    json_file.write(serialized)