#

## Scraping.ipynb

---

Notebook tem como objetivo extrair dados de partidas do site https://football.esportsbattle.com/en/ e salvar em um arquivo .csv

In [1]:
import pandas as pd
import requests, time, random, json
from requests.adapters import HTTPAdapter
from datetime import datetime, timedelta
from urllib3.util.retry import Retry



##### gerar_url()

Função que recebe uma data de referência e gera uma lista de URLs que busca os torneios

In [2]:
def gerar_url(data_inicial, data_final = datetime.now()):

    datas = [
        [(data_inicial + timedelta(days=n)).year,
        (data_inicial + timedelta(days=n)).month,
        (data_inicial + timedelta(days=n)).day]
        for n in range((data_final - data_inicial).days + 1)
    ]   

    datas_url = []
    pag = 1 
    for i in range(len(datas)):
        ano = datas[i][0]
        mes = datas[i][1]
        dia = datas[i][2]
        if mes in [1,3,5,7,8,10,12] and dia == 31:
            datas_url.append(f"https://football.esportsbattle.com/api/tournaments?page={pag}&dateFrom={ano}%2F{mes}%2F{dia}+11%3A00&dateTo={ano}%2F{mes}%2F{1}+02%3A59")
        elif mes == 2 and dia == 28:
            datas_url.append(f"https://football.esportsbattle.com/api/tournaments?page={pag}&dateFrom={ano}%2F{mes}%2F{dia}+11%3A00&dateTo={ano}%2F{mes}%2F{1}+02%3A59")
        else:
            datas_url.append(f"https://football.esportsbattle.com/api/tournaments?page={pag}&dateFrom={ano}%2F{mes}%2F{dia}+11%3A00&dateTo={ano}%2F{mes}%2F{dia+1}+02%3A59")


    return datas_url

In [3]:
def pega_torneios(datas_url):
    lista_torneios = []  # Lista de todos os torneios disponíveis por página

    for url in datas_url:
        try:
            response = requests.get(url)
            response.raise_for_status()  # Lança erro se status != 200
            json_torneios = response.json()

            paginas = json_torneios.get('totalPages', 0)
            torneios = json_torneios.get('tournaments', [])

            for pag in range(paginas):
                for torneio in torneios:
                    lista_torneios.append([pag + 1, torneio])

        except Exception as e:
            print(f"[AVISO] Erro ao processar URL: {url}")
            print(f"Motivo: {e}\n")
            continue

    return lista_torneios


##### Função que pega todas as partidas por torneio

In [4]:
def pega_partidas(lista_torneios):
    
    lista_partidas = []
    for i in range(len(lista_torneios)):
        # pag = lista_torneios[i][0]
        id_torneio = lista_torneios[i][1]['id']

        lista_partidas.append(f'https://football.esportsbattle.com/api/tournaments/{id_torneio}/matches')

    return lista_partidas

In [5]:
# ---- sessão global robusta (1x por processo) ----
_session = requests.Session()
_session.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Connection": "keep-alive",
    "Referer": "https://football.esportsbattle.com/"
})
_retry = Retry(
    total=5,                      # nº máximo de tentativas
    backoff_factor=0.7,           # 0.7s, 1.4s, 2.8s, 5.6s, ...
    status_forcelist=(429, 500, 502, 503, 504),
    allowed_methods=["GET"],
    raise_on_status=False
)
_adapter = HTTPAdapter(max_retries=_retry, pool_connections=20, pool_maxsize=50)
_session.mount("http://", _adapter)
_session.mount("https://", _adapter)

In [6]:
def _get_json(url, timeout=(5, 20), tries_extra=3):
    """GET + JSON com proteção a desconexão sem resposta."""
    for t in range(tries_extra):
        try:
            r = _session.get(url, timeout=timeout)
            if r.status_code != 200:
                # algumas APIs retornam HTML de erro; log mínimo e retry
                raise requests.HTTPError(f"status={r.status_code}")
            # às vezes a resposta vem vazia/HTML e quebra .json()
            return r.json()
        except (requests.exceptions.JSONDecodeError,
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.HTTPError) as e:
            # backoff exponencial com jitter
            wait = (0.5 * (2 ** t)) + random.uniform(0, 0.4)
            print(f"[AVISO] Falha em {url} ({type(e).__name__}). Tentando de novo em {wait:.1f}s...")
            time.sleep(wait)
    # desistiu desta URL
    print(f"[ERRO] Desisti de {url}")
    return None

In [7]:
def pega_resultados(lista_partidas):
    """
    Mesma assinatura e lógica geral:
    - Faz GET de cada URL
    - Ignora se não conseguir JSON
    - Achata com pandas.json_normalize
    - Concatena tudo num único DataFrame
    """
    # aceitar string única também
    if isinstance(lista_partidas, str):
        lista_partidas = [lista_partidas]

    dfs = []
    for url in lista_partidas:
        js = _get_json(url)
        if not js:
            continue

        # endpoint às vezes é lista ou dict com 'matches'
        if isinstance(js, dict) and "matches" in js:
            payload = js["matches"]
        else:
            payload = js

        if not payload:
            continue

        df_json = pd.json_normalize(payload)
        dfs.append(df_json)

        # opcional: pequeno intervalo pra não sobrecarregar servidor
        time.sleep(0.15)

    if dfs:
        return pd.concat(dfs, ignore_index=True)
    return pd.DataFrame()

In [8]:
# urls = gerar_url(datetime(2025,1,1), datetime(2025,7,31))
# torneios = pega_torneios(urls)
# partidas = pega_partidas(torneios)
# jogos = pega_resultados(partidas)

In [9]:
if __name__ == "__main__":
    urls = gerar_url(datetime(2025,1,1), datetime(2025,7,31))
    torneios = pega_torneios(urls)
    partidas = pega_partidas(torneios)
    jogos = pega_resultados(partidas)


In [10]:
jogos.to_csv("../../data/raw/base_partidas_raw.csv", index=False, encoding='utf-8')

In [11]:
jogos

Unnamed: 0,id,date,status_id,tournament.id,tournament.token,tournament.token_international,tournament.status_id,console.id,console.token,console.token_international,...,participant1.prevPeriodsScores,participant2.id,participant2.nickname,participant2.score,participant2.photo,participant2.team.logo,participant2.team.id,participant2.team.token,participant2.team.token_international,participant2.prevPeriodsScores
0,1404256,2025-01-01T11:50:00Z,3,181773,Лига чемпионов А 2х6 2025-01-01,Champions League A 2x6 2025-01-01,4,136,Anfield-1,Anfield-1,...,[1],603265,Kinshiki,3,L2dhbWVycy81LzU1LzU1OS8xNzA1NTk5NDkyLXBob3RvLm...,L3RlYW1zLzYvNi82L2xvZ28ucG5n,6,Манчестер Сити,Manchester City,[2]
1,1404257,2025-01-01T11:50:00Z,3,181773,Лига чемпионов А 2х6 2025-01-01,Champions League A 2x6 2025-01-01,4,137,Anfield-2,Anfield-2,...,[1],603264,Decade,4,L2dhbWVycy81LzU2LzU2Ni8xNzA5NTcwMTcxLXBob3RvLm...,L3RlYW1zLzIvMi8yLzE2OTE1ODQ4NjMtbG9nby5wbmc=,2,Ливерпуль,Liverpool,[2]
2,1404258,2025-01-01T12:08:00Z,3,181773,Лига чемпионов А 2х6 2025-01-01,Champions League A 2x6 2025-01-01,4,136,Anfield-1,Anfield-1,...,[2],603262,flamez,3,L2dhbWVycy81LzUzLzUzOS8xNzAwNTk0MDE4LXBob3RvLm...,L3RlYW1zLzIvMjEvMjEvbG9nby5wbmc=,21,Пари Сен Жермен,Paris Saint-Germain F.C.,[1]
3,1404259,2025-01-01T12:08:00Z,3,181773,Лига чемпионов А 2х6 2025-01-01,Champions League A 2x6 2025-01-01,4,137,Anfield-2,Anfield-2,...,[4],603266,SpeciAL,6,L2dhbWVycy81LzU0LzU0OC8xNzAwNTkzODg0LXBob3RvLm...,L3RlYW1zLzkvOS85L2xvZ28ucG5n,9,Бавария,Bayern Munich,[3]
4,1404260,2025-01-01T12:26:00Z,3,181773,Лига чемпионов А 2х6 2025-01-01,Champions League A 2x6 2025-01-01,4,136,Anfield-1,Anfield-1,...,[1],603264,Decade,3,L2dhbWVycy81LzU2LzU2Ni8xNzA5NTcwMTcxLXBob3RvLm...,L3RlYW1zLzIvMi8yLzE2OTE1ODQ4NjMtbG9nby5wbmc=,2,Ливерпуль,Liverpool,[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158407,1675357,2025-07-30T20:42:00Z,3,220193,Вольта Лига Чемпионов А 2025-07-30,Volta Champions League A 2025-07-30,4,140,Hillsborough-2,Hillsborough-2,...,[0],718276,Andrew,2,L2dhbWVycy80LzQ2LzQ2Ni8xNjk1MzkzMzM1LXBob3RvLm...,L3RlYW1zLzIvMjAvMjAvbG9nby5wbmc=,20,Реал Мадрид,Real Madrid,[1]
158408,1675358,2025-07-30T20:51:00Z,3,220193,Вольта Лига Чемпионов А 2025-07-30,Volta Champions League A 2025-07-30,4,139,Hillsborough-1,Hillsborough-1,...,[0],718277,Fireball,3,L2dhbWVycy80LzQ4LzQ4Mi8xNzIwMTA5MDA5LXBob3RvLm...,L3RlYW1zLzIvMi8yLzE2OTE1ODQ4NjMtbG9nby5wbmc=,2,Ливерпуль,Liverpool,[1]
158409,1675359,2025-07-30T20:51:00Z,3,220193,Вольта Лига Чемпионов А 2025-07-30,Volta Champions League A 2025-07-30,4,140,Hillsborough-2,Hillsborough-2,...,[2],718278,fantazer,2,L2dhbWVycy80LzQ2LzQ2Ny8xNzA4NTEzNTAxLXBob3RvLm...,L3RlYW1zLzIvMjEvMjEvbG9nby5wbmc=,21,Пари Сен Жермен,Paris Saint-Germain F.C.,[2]
158410,1674820,2025-07-30T20:20:00Z,3,220157,Чемпионат Мира A 2025-07-30,International A 2025-07-30,4,1,1,1,...,[3],718146,Samurai,1,L2dhbWVycy82LzY4LzY4Ny8xNzQxMDE1NDEyLXBob3RvLm...,L3RlYW1zLzMvMzIvMzIvbG9nby5wbmc=,32,Германия,Germany,[0]
