In [37]:
import time
from datetime import datetime, timezone
from email.utils import format_datetime
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from tqdm.notebook import tqdm

In [38]:
import os

# Folder layout used by this notebook: everything lives under BASE_DIR,
# split into a raw area and an engineered area (which holds player images).
BASE_DIR = 'data'
RAW_DIR = os.path.join(BASE_DIR, 'raw')
ENGINEERED_DIR = os.path.join(BASE_DIR, 'engineered')
IMAGES_DIR = os.path.join(ENGINEERED_DIR, 'players_images')

# Create every folder up front; exist_ok=True keeps the cell safe to re-run.
for _required_dir in (RAW_DIR, ENGINEERED_DIR, IMAGES_DIR):
    os.makedirs(_required_dir, exist_ok=True)
del _required_dir  # do not leave the loop variable in the notebook namespace

In [39]:
# Timestamp used for the conditional-request header below. The header value is
# labelled "GMT", so the time must be taken in UTC rather than local time.
current_datetime = datetime.now(timezone.utc)
# format_datetime(..., usegmt=True) produces "Day, DD Mon YYYY HH:MM:SS GMT"
# with English day/month names, independent of the process locale
# (strftime's %a / %b follow the locale and can emit non-English names).
formatted_datetime = format_datetime(current_datetime, usegmt=True)

# Browser-like headers sent with every request made by the scrape_* helpers.
HEADERS = {
    'authority': 'api.sofascore.com',
    'accept': '*/*',
    'accept-language': 'pt-BR,pt;q=0.6',
    'cache-control': 'max-age=0',
    'if-none-match': 'W/"7854d9f830"',
    'origin': 'https://www.sofascore.com',
    'referer': 'https://www.sofascore.com/',
    'sec-ch-ua': '"Brave";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'If-Modified-Since': formatted_datetime
}

In [40]:
def scrape_player_info(player_id, headers=HEADERS, timeout=30):
    """Request a single player's record from the SofaScore API.

    Parameters
    ----------
    player_id
        Identifier interpolated into the endpoint URL.
    headers : dict, optional
        HTTP headers to send; defaults to the notebook-level ``HEADERS``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw response. Neither the status code nor the body is checked
        here; that is left to the caller.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including ``Timeout`` once
        ``timeout`` elapses).
    """
    params = {
        'limit': '20',
        'order': '-rating',
    }
    url = f'https://api.sofascore.app/api/v1/player/{player_id}'
    return requests.get(url, params=params, headers=headers, timeout=timeout)

In [41]:
def scrape_player_image(player_id, league_id, headers=HEADERS, timeout=30):
    """Download a player's picture and store it as a PNG.

    The file is written to ``IMAGES_DIR/<league_id>/<player_id>.png``; the
    league folder is created if needed.

    Parameters
    ----------
    player_id
        Identifier interpolated into the endpoint URL and the file name.
    league_id
        Used as the name of the sub-folder under ``IMAGES_DIR``.
    headers : dict, optional
        HTTP headers to send; defaults to the notebook-level ``HEADERS``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the run.

    Returns
    -------
    str or None
        Path of the saved image, or ``None`` when nothing was saved (error
        status, or a body that Pillow cannot identify as an image). Saving is
        best-effort: a missing picture never raises.
    """
    params = {
        'limit': '20',
        'order': '-rating',
    }

    response = requests.get(
        f'https://api.sofascore.app/api/v1/player/{player_id}/image',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    league_image_dir = os.path.join(IMAGES_DIR, f'{league_id}')
    os.makedirs(league_image_dir, exist_ok=True)
    filename = os.path.join(league_image_dir, f'{player_id}.png')

    # An error status cannot carry a usable picture; skip decoding it.
    if not response.ok:
        return None

    try:
        # Context manager releases the decoder's resources after saving.
        with Image.open(BytesIO(response.content)) as player_image:
            player_image.save(filename)
    except Image.UnidentifiedImageError:
        # Body was not a recognisable image (e.g. empty response).
        return None
    return filename



In [42]:
def scrape_all_league_players(players_id: list, league_id: int, delay: float = 0.5):
    """Fetch the info payload and picture of every player in ``players_id``.

    For each id the player record is requested with ``scrape_player_info`` and
    the picture with ``scrape_player_image``. Players whose response is an
    error status, is not valid JSON, or lacks a ``'player'`` object are
    reported and skipped, so one bad record does not discard the whole run.

    Parameters
    ----------
    players_id : list
        Player identifiers to scrape.
    league_id : int
        Forwarded to ``scrape_player_image`` (selects the image sub-folder).
    delay : float, optional
        Seconds to sleep after each player; defaults to the original 0.5.

    Returns
    -------
    list of dict
        The decoded JSON payloads, in input order, each containing a
        ``'player'`` dict. If that dict holds a ``'team'`` dict, its
        ``'tournament'`` entry has been removed.
    """
    raw_list = []
    for player_id in tqdm(players_id):
        player_response = scrape_player_info(player_id)
        scrape_player_image(player_id, league_id)

        data = None
        if player_response.ok:
            try:
                data = player_response.json()
            except ValueError:
                # JSON decode errors are ValueError subclasses (e.g. empty body).
                data = None

        if isinstance(data, dict) and isinstance(data.get('player'), dict):
            team = data['player'].get('team')
            # Not every payload is guaranteed to carry a team object.
            if isinstance(team, dict):
                team.pop('tournament', None)
            raw_list.append(data)
        else:
            tqdm.write(
                f'Skipping player {player_id}: '
                f'HTTP {player_response.status_code}, no usable payload'
            )

        time.sleep(delay)
    return raw_list

In [43]:
def generate_dfs(response_list: list):
    """Turn each payload's ``'player'`` entry into its own flattened DataFrame.

    Every element of ``response_list`` must be a mapping with a ``'player'``
    key; that value is passed to ``pd.json_normalize``. Returns the resulting
    frames as a list, in input order.
    """
    return [pd.json_normalize(payload['player']) for payload in response_list]

def concat_dfs(raw_dfs: list):
    """Stack the given DataFrames row-wise into one frame with a fresh 0..n-1 index."""
    combined = pd.concat(objs=raw_dfs, ignore_index=True)
    return combined


In [44]:
def create_all_files(df: pd.DataFrame, league_id: int, ext: str = 'csv'):
    """Write ``df`` to ``RAW_DIR/<league_id>/players_info/players_info_<league_id>.<ext>``.

    Parameters
    ----------
    df : pd.DataFrame
        Table to persist; the index is not written.
    league_id : int
        Used for both the folder name and the file name.
    ext : str, optional
        ``'csv'`` (default) writes with ``DataFrame.to_csv``; ``'xls'`` and
        ``'xlsx'`` write with ``DataFrame.to_excel``. Whether a given Excel
        extension works depends on the Excel writer engines installed.

    Raises
    ------
    ValueError
        If ``ext`` is not one of the supported extensions. Previously such a
        call created the folders but silently wrote no file.
    """
    supported = ('csv', 'xls', 'xlsx')
    if ext not in supported:
        raise ValueError(
            f'Unsupported extension {ext!r}; expected one of {supported}'
        )

    # makedirs creates the intermediate league folder as well.
    player_dir = os.path.join(RAW_DIR, f'{league_id}', 'players_info')
    os.makedirs(player_dir, exist_ok=True)
    filepath = os.path.join(player_dir, f'players_info_{league_id}.{ext}')

    if ext == 'csv':
        df.to_csv(filepath, index=False)
    else:
        df.to_excel(filepath, index=False)


# Scrape player info for the Brazilian league

In [45]:
# Load the league's attack-stats table; its 'player.id' column supplies the
# ids to scrape. The path is built with os.path.join from RAW_DIR so it also
# resolves on non-Windows systems (a literal backslash path does not).
df = pd.read_csv(os.path.join(RAW_DIR, '325', 'attack_325_48982.csv'))
df.head(5)

Unnamed: 0,goals,successfulDribblesPercentage,blockedShots,penaltyWon,goalsFromOutsideTheBox,hitWoodwork,expectedGoals,totalShots,goalConversionPercentage,shotFromSetPiece,...,player.id,team.name,team.slug,team.shortName,team.userCount,team.type,team.id,team.teamColors.primary,team.teamColors.secondary,team.teamColors.text
0,0,0.0,0,0,0,0,0.0,0,0.0,0,...,145054,Bahia,bahia,Bahia,0,0,1955,#52b030,#52b030,#ffffff
1,0,0.0,0,0,0,0,0.0,0,0.0,0,...,1507616,Coritiba,coritiba,Coritiba,0,0,1982,#52b030,#52b030,#ffffff
2,0,100.0,0,0,0,0,0.0,0,0.0,0,...,840103,Internacional,internacional,Internacional,0,0,1966,#52b030,#52b030,#ffffff
3,9,50.98,25,0,3,1,7.07,84,10.71,13,...,840020,Palmeiras,palmeiras,Palmeiras,0,0,1963,#52b030,#52b030,#ffffff
4,14,55.48,29,2,3,3,11.83,117,11.97,32,...,34705,Atlético Mineiro,atletico-mineiro,Atlético Mineiro,0,0,1977,#52b030,#52b030,#ffffff


In [46]:
# League being scraped, and the player ids read from the stats table above.
league_id = 325
players_list = df['player.id'].tolist()

In [47]:
# Long-running cell: one info request and one image request per player,
# with a pause between players.
data = scrape_all_league_players(players_id=players_list, league_id=league_id)

  0%|          | 0/723 [00:00<?, ?it/s]

In [48]:
# One flattened frame per scraped player, then a single combined table.
raw_dfs = generate_dfs(response_list=data)
final_df = concat_dfs(raw_dfs=raw_dfs)

In [49]:
# Persist the combined player table for this league (CSV by default).
create_all_files(df=final_df, league_id=league_id)