In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import json

# Headers

In [3]:
# Build the timestamp from UTC: the HTTP-date format below is labelled "GMT",
# so formatting the local wall-clock time would describe the wrong instant on
# any machine whose timezone is not UTC.
current_datetime = datetime(*time.gmtime()[:6])
formatted_datetime = current_datetime.strftime("%a, %d %b %Y %H:%M:%S GMT")

# Browser-like request headers used as the default for the SofaScore API calls.
# NOTE(review): 'if-none-match' and 'If-Modified-Since' are conditional-request
# headers; a server that honours them may reply 304 with an empty body, which
# would break any later `.json()` call — confirm they are really wanted.
HEADERS = {
    'authority': 'api.sofascore.com',
    'accept': '*/*',
    'accept-language': 'pt-BR,pt;q=0.6',
    'cache-control': 'max-age=0',
    'if-none-match': 'W/"7854d9f830"',
    'origin': 'https://www.sofascore.com',
    'referer': 'https://www.sofascore.com/',
    'sec-ch-ua': '"Brave";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'If-Modified-Since': formatted_datetime
}

In [4]:

# Statistic fields requested from the API, grouped by stat category.
# Each value is the comma-separated string sent as the `fields` query parameter.
ATRB = dict(
    attack='goals,successfulDribblesPercentage,blockedShots,penaltyWon,goalsFromOutsideTheBox,hitWoodwork,expectedGoals,totalShots,goalConversionPercentage,shotFromSetPiece,headedGoals,offsides,bigChancesMissed,shotsOnTarget,penaltiesTaken,freeKickGoal,leftFootGoals,penaltyConversion,successfulDribbles,shotsOffTarget,penaltyGoals,goalsFromInsideTheBox,rightFootGoals,setPieceConversion,rating',
    defense='tackles,errorLeadToGoal,cleanSheet,interceptions,errorLeadToShot,penaltyConceded,ownGoals,clearances,dribbledPast,rating',
    passing='bigChancesCreated,totalPasses,accurateFinalThirdPasses,accurateLongBalls,assists,accuratePassesPercentage,keyPasses,accurateLongBallsPercentage,accuratePasses,accurateOwnHalfPasses,accurateCrosses,passToAssist,inaccuratePasses,accurateOppositionHalfPasses,accurateCrossesPercentage,rating',
    keepers='saves,savedShotsFromInsideTheBox,punches,crossesNotClaimed,cleanSheet,savedShotsFromOutsideTheBox,runsOut,penaltyFaced,goalsConcededInsideTheBox,successfulRunsOut,penaltySave,goalsConcededOutsideTheBox,highClaims,rating',
    others='yellowCards,aerialDuelsWon,minutesPlayed,possessionLost,redCards,aerialDuelsWonPercentage,wasFouled,appearances,groundDuelsWon,totalDuelsWon,fouls,matchesStarted,groundDuelsWonPercentage,totalDuelsWonPercentage,dispossessed,rating',
)

# Default value for the `filters` query parameter.
FILTERS = 'position.in.G~D~M~F'

# Positional index -> stat category name, following the key order of ATRB.
FILES_NAMES = dict(enumerate(ATRB))


In [9]:
# The API is queried 20 rows at a time; raise `offset` in steps of 20 to walk
# through every page of the table.
def scrape_tournament_players_stats(id, season_id, headers=HEADERS, atr='attack', filters=FILTERS, offset=0, timeout=30):
    """Request one page of player statistics for a tournament season.

    Args:
        id: Tournament id, interpolated into the `unique-tournament/{id}` URL segment.
        season_id: Season id, interpolated into the `season/{season_id}` URL segment.
        headers: HTTP headers sent with the request.
        atr: Key of ATRB selecting which field list is requested.
        filters: Value of the `filters` query parameter.
        offset: Number of rows to skip; only sent when greater than zero.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        The `requests.Response` as received; the status code is not checked here.

    Raises:
        KeyError: If `atr` is not a key of ATRB.
    """
    params = {
        'limit': '20',
        'order': '-rating',
        'accumulation': 'total',
        'fields': ATRB[atr],
        'filters': filters,
    }

    if offset > 0:
        params['offset'] = str(offset)

    url = f'https://api.sofascore.com/api/v1/unique-tournament/{id}/season/{season_id}/statistics'
    # Forward the caller's `headers` argument (not the module-level HEADERS) so
    # that overriding the headers actually takes effect.
    return requests.get(url, params=params, headers=headers, timeout=timeout)

In [66]:
def scrape_all_pages(id, season_id, atr='attack', pages=37, delay=1):
    """Fetch consecutive pages of one stat category for a tournament season.

    Offsets 0, 20, ..., pages * 20 are requested in order. The upper bound is
    inclusive, so `pages + 1` responses are collected; this is kept as-is so
    existing pulls keep returning the same set of pages.

    Args:
        id: Tournament id forwarded to scrape_tournament_players_stats.
        season_id: Season id forwarded to scrape_tournament_players_stats.
        atr: Key of ATRB selecting the stat category.
        pages: Index of the last page to fetch (page 0 is the first). A
            negative value yields an empty list without any request.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        List of responses, one per page, in page order.
    """
    page_size = 20  # must match the 'limit' sent by scrape_tournament_players_stats
    response_list = []
    for offset in range(0, pages * page_size + 1, page_size):
        # Pause only between requests: nothing to wait for before the first
        # one, and no request is issued past the last wanted page.
        if response_list:
            time.sleep(delay)
        response_list.append(
            scrape_tournament_players_stats(id, season_id, atr=atr, offset=offset)
        )
    return response_list

def scrape_all_atr(id, season_id, pages=37):
    """Scrape every stat category in ATRB for a tournament season.

    Returns a list with one entry per ATRB key, in ATRB's key order; each
    entry is the list of page responses produced by scrape_all_pages.
    """
    return [
        scrape_all_pages(id, season_id, atr=category, pages=pages)
        for category in ATRB
    ]


In [82]:
def generate_dfs(response_list: list):
    """Turn each response into a DataFrame built from its 'results' entry.

    Every item must expose `.json()` returning a mapping with a 'results'
    key; nested records are flattened by `pd.json_normalize`. The output list
    keeps the order of `response_list`.
    """
    return [
        pd.json_normalize(page.json()['results'])
        for page in response_list
    ]

def generate_single_df(response):
    """Build one DataFrame from the 'results' entry of a response's JSON body."""
    payload = response.json()
    records = payload['results']
    return pd.json_normalize(records)


def join_same_atr_df(dfs: list):
    """Stack the given frames row-wise into one frame with a fresh 0..n-1 index."""
    stacked = pd.concat(dfs, axis=0, ignore_index=True)
    return stacked

In [57]:
import os

# Output folders, created up front; RAW_DIR is where create_all_stats_files writes.
BASE_DIR = 'data'
RAW_DIR, ENGINEERED_DIR = (
    os.path.join(BASE_DIR, leaf) for leaf in ('raw', 'engineered')
)

# exist_ok=True keeps the cell safe to re-run.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(ENGINEERED_DIR, exist_ok=True)

In [58]:
# Files are named <base_name>_<id>_<season_id>.<ext>
def create_all_stats_files(dfs: list, filenames: dict, id: int, season_id: int, ext: str, out_dir=None):
    """Write one file per entry of `filenames`.

    Args:
        dfs: Sequence of DataFrames, indexed by the keys of `filenames`.
        filenames: Mapping of index into `dfs` -> base file name.
        id: Tournament id, used only in the file name.
        season_id: Season id, used only in the file name.
        ext: 'csv', 'xls' or 'xlsx' (without the dot).
        out_dir: Target directory; defaults to RAW_DIR when omitted.

    Raises:
        TypeError: If `ext` is not supported. Raised before anything is
            written, even when `filenames` is empty.
    """
    # Validate once, up front, instead of on every loop iteration.
    # 'xlsx' is accepted alongside 'xls' because recent pandas releases can no
    # longer write the legacy .xls format.
    if ext not in ('csv', 'xls', 'xlsx'):
        raise TypeError('Unsupported File Type')

    target_dir = RAW_DIR if out_dir is None else out_dir
    for i, base_name in filenames.items():
        df = dfs[i]
        filepath = os.path.join(target_dir, f'{base_name}_{id}_{season_id}.{ext}')
        if ext == 'csv':
            df.to_csv(filepath, index=False)
        else:
            # pandas picks the Excel writer engine from the file extension.
            df.to_excel(filepath, index=False)
    

# Scraping From Brazil

In [67]:
# Tournament / season ids passed to the scraper for this pull.
# NOTE(review): presumably the 2023 Brazilian league, going by the variable name — confirm.
brazil_2023_ids = {'id': 325, 'season_id': 48982}
brazil_players_stats_2023 = scrape_all_atr(**brazil_2023_ids)

In [89]:
# One combined DataFrame per stat category: every page response is converted
# to a frame, then the pages of a category are stacked together.
brazil_players_stats_2023_dataframes = [
    join_same_atr_df(generate_dfs(category_responses))
    for category_responses in brazil_players_stats_2023
]
    

In [93]:

# Export every category frame to the raw data folder.
# NOTE(review): writing '.xls' depends on the legacy xlwt engine, which recent
# pandas releases no longer support — confirm the target pandas version.
create_all_stats_files(
    dfs=brazil_players_stats_2023_dataframes,
    filenames=FILES_NAMES,
    id=325,
    season_id=48982,
    ext='xls',
)

  df.to_excel(filepath, index=False)
