In [13]:
from datetime import datetime
from email.utils import formatdate
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from tqdm.notebook import tqdm

In [14]:
# Timestamp captured once, when this cell runs; every request that uses
# HEADERS sends this same value.
current_datetime = datetime.now()
# HTTP dates must be GMT with English day/month abbreviations.  The previous
# strftime() call rendered the *local* wall-clock time with locale-dependent
# names and merely appended the text "GMT".  formatdate() converts the POSIX
# timestamp to a real GMT date in the same "Mon, 01 Jan 2024 00:00:00 GMT"
# layout, independent of the machine's timezone and locale.
formatted_datetime = formatdate(current_datetime.timestamp(), usegmt=True)

# Browser-like request headers used as the default for the scraping helpers.
# NOTE(review): 'if-none-match' carries a hard-coded ETag and is sent together
# with 'If-Modified-Since'; both are conditional-request headers, so a server
# honouring them may answer 304 with an empty body -- confirm this is intended.
HEADERS = {
    'authority': 'api.sofascore.com',
    'accept': '*/*',
    'accept-language': 'pt-BR,pt;q=0.6',
    'cache-control': 'max-age=0',
    'if-none-match': 'W/"7854d9f830"',
    'origin': 'https://www.sofascore.com',
    'referer': 'https://www.sofascore.com/',
    'sec-ch-ua': '"Brave";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'sec-gpc': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'If-Modified-Since': formatted_datetime
}

In [15]:
import os

# On-disk layout, relative to the notebook's working directory:
#   data/raw                      -> RAW_DIR
#   data/engineered               -> ENGINEERED_DIR
#   data/engineered/teams_images  -> IMAGES_DIR
BASE_DIR = 'data'
RAW_DIR = os.path.join(BASE_DIR, 'raw')
ENGINEERED_DIR = os.path.join(BASE_DIR, 'engineered')
IMAGES_DIR = os.path.join(ENGINEERED_DIR, 'teams_images')

# Create every directory up front; exist_ok makes re-running the cell safe.
for _directory in (RAW_DIR, ENGINEERED_DIR, IMAGES_DIR):
    os.makedirs(_directory, exist_ok=True)

In [16]:
def scrape_team_info(team_id, headers=HEADERS, timeout=30):
    """Request the SofaScore team endpoint for one team.

    Args:
        team_id: Identifier interpolated into the ``/api/v1/team/{team_id}`` URL.
        headers: HTTP headers sent with the request; defaults to ``HEADERS``.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error.  Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.get``.
        The HTTP status is not checked here; that is left to the caller.
    """
    # NOTE(review): 'limit'/'order' look carried over from a ranking-style
    # endpoint -- confirm the team endpoint actually uses them.
    params = {
        'limit': '20',
        'order': '-rating',
    }

    return requests.get(
        f'https://api.sofascore.app/api/v1/team/{team_id}',
        params=params,
        headers=headers,
        timeout=timeout,
    )

In [17]:
def scrape_team_image(team_id, league_id, headers=HEADERS, timeout=30):
    """Download a team's image and store it as a PNG file.

    The file is written to ``<IMAGES_DIR>/<league_id>/<team_id>.png``; the
    league sub-directory is created when missing.

    Args:
        team_id: Identifier interpolated into the
            ``/api/v1/team/{team_id}/image`` URL and used as the file name.
        league_id: Used only to choose the sub-directory under ``IMAGES_DIR``.
        headers: HTTP headers sent with the request; defaults to ``HEADERS``.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error, so a stalled connection cannot hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # NOTE(review): 'limit'/'order' look carried over from a ranking-style
    # endpoint -- confirm the image endpoint actually uses them.
    params = {
        'limit': '20',
        'order': '-rating',
    }

    response = requests.get(
        f'https://api.sofascore.app/api/v1/team/{team_id}/image',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    # Surface the HTTP status instead of handing an error body to Pillow,
    # which would otherwise fail with an opaque "cannot identify image" error.
    response.raise_for_status()

    league_image_dir = os.path.join(IMAGES_DIR, f'{league_id}')
    os.makedirs(league_image_dir, exist_ok=True)
    filename = os.path.join(league_image_dir, f'{team_id}.png')
    # The context manager releases the decoder's resources once saved.
    with Image.open(BytesIO(response.content)) as team_image:
        team_image.save(filename)

In [18]:
def scrape_all_league_teams(teams_id: list, league_id: int):
    """Fetch info and image for every team and collect the decoded JSON.

    For each entry of ``teams_id`` (shown with a tqdm progress bar) the team
    info is requested, the team image is downloaded into the ``league_id``
    image folder, and the info response is decoded with ``.json()``.

    Args:
        teams_id: Iterable of team identifiers.
        league_id: Forwarded to ``scrape_team_image``.

    Returns:
        A list with one decoded JSON payload per team, in input order.
    """
    payloads = []
    for team_id in tqdm(teams_id):
        info_response = scrape_team_info(team_id)
        scrape_team_image(team_id, league_id)
        payloads.append(info_response.json())
    return payloads

In [19]:
def generate_dfs(response_list: list):
    """Flatten the ``'team'`` entry of each payload into its own DataFrame.

    Args:
        response_list: Sequence of mappings that each contain a ``'team'`` key
            (``run`` passes the decoded JSON payloads here).

    Returns:
        A list of DataFrames, one per payload, produced by
        ``pd.json_normalize`` on the payload's ``'team'`` value.
    """
    return [pd.json_normalize(payload['team']) for payload in response_list]

def concat_dfs(raw_dfs: list):
    """Stack the given DataFrames vertically under a fresh 0..n-1 index.

    Args:
        raw_dfs: List of DataFrames to concatenate.

    Returns:
        A single DataFrame produced by ``pd.concat(..., ignore_index=True)``.
    """
    combined = pd.concat(raw_dfs, ignore_index=True)
    return combined


In [20]:
def create_all_files(df: pd.DataFrame, league_id: int, ext: str = 'csv', raw_dir=None):
    """Write ``df`` to ``<raw_dir>/<league_id>/teams_info/teams_info_<league_id>.<ext>``.

    Args:
        df: DataFrame to persist (written without its index).
        league_id: Used for both the sub-directory and the file name.
        ext: Output format: ``'csv'`` (default), ``'xls'`` or ``'xlsx'``.
            The Excel formats go through ``DataFrame.to_excel``, which needs
            a writer engine installed for the chosen extension (recent pandas
            releases can no longer write legacy ``.xls`` -- prefer ``'xlsx'``).
        raw_dir: Root output directory; ``None`` (default) means ``RAW_DIR``.

    Raises:
        ValueError: If ``ext`` is not a supported format.  Previously an
            unknown extension created the directories and then silently
            wrote nothing.
    """
    supported = ('csv', 'xls', 'xlsx')
    if ext not in supported:
        raise ValueError(f'Unsupported extension {ext!r}; expected one of {supported}')

    root = RAW_DIR if raw_dir is None else raw_dir
    teams_dir = os.path.join(root, f'{league_id}', 'teams_info')
    # makedirs also creates the intermediate <league_id> directory.
    os.makedirs(teams_dir, exist_ok=True)

    filepath = os.path.join(teams_dir, f'teams_info_{league_id}.{ext}')
    if ext == 'csv':
        df.to_csv(filepath, index=False)
    else:
        df.to_excel(filepath, index=False)

In [21]:
def load_file(league_id, season_id, ext='csv', raw_dir=None):
    """Load ``<raw_dir>/<league_id>/<season_id>/attack_<league_id>_<season_id>.<ext>``.

    The path is assembled with ``os.path.join`` so it resolves on every
    operating system; the previous hard-coded backslash path only worked on
    Windows.

    Args:
        league_id: League identifier used in the directory and file name.
        season_id: Season identifier used in the directory and file name.
        ext: File extension.  ``'xls'`` and ``'xlsx'`` are read with
            ``pd.read_excel``; anything else is read with ``pd.read_csv``.
        raw_dir: Root directory of the raw data; ``None`` (default) means
            ``RAW_DIR``.

    Returns:
        The file contents as a DataFrame.
    """
    root = RAW_DIR if raw_dir is None else raw_dir
    filepath = os.path.join(
        root,
        f'{league_id}',
        f'{season_id}',
        f'attack_{league_id}_{season_id}.{ext}',
    )
    if ext in ('xls', 'xlsx'):
        return pd.read_excel(filepath)
    return pd.read_csv(filepath)

def run(df: pd.DataFrame, league_id):
    """Scrape every distinct team listed in ``df`` and return one combined frame.

    Args:
        df: DataFrame with a ``'team.id'`` column.
        league_id: Forwarded to ``scrape_all_league_teams``.

    Returns:
        The DataFrame obtained by normalising each scraped payload with
        ``generate_dfs`` and concatenating the results with ``concat_dfs``.
    """
    unique_team_ids = list(df['team.id'].unique())
    payloads = scrape_all_league_teams(unique_team_ids, league_id)
    return concat_dfs(generate_dfs(payloads))

In [22]:
# League/season identifiers for the data set loaded into brazil_df.
league_id, season_id = 325, 48982

brazil_df = load_file(league_id=league_id, season_id=season_id)
brazil_df_final = run(df=brazil_df, league_id=league_id)

  0%|          | 0/20 [00:00<?, ?it/s]

In [23]:
# Persist the scraped team info for the league currently held in league_id.
create_all_files(df=brazil_df_final, league_id=league_id)

# Primera Division Argentina

In [24]:
# League/season identifiers for the data set loaded into argentina.
league_id, season_id = 155, 47647

argentina = load_file(league_id=league_id, season_id=season_id)
argentina_final = run(df=argentina, league_id=league_id)

  0%|          | 0/28 [00:00<?, ?it/s]

In [25]:
# Persist the scraped team info for the league currently held in league_id.
create_all_files(df=argentina_final, league_id=league_id)

# Bolivia

In [26]:
# League/season identifiers for the data set loaded into bol.
league_id, season_id = 16736, 48353

bol = load_file(league_id=league_id, season_id=season_id)
bol_final = run(df=bol, league_id=league_id)

  0%|          | 0/17 [00:00<?, ?it/s]

In [27]:
# Persist the scraped team info for the league currently held in league_id.
create_all_files(df=bol_final, league_id=league_id)

# Ecuador

In [28]:
# FIXME: league_id and season_id below are identical to the values used in the
# Bolivia cell (16736 / 48353), so this cell reloads and re-scrapes the same
# data and the following create_all_files call overwrites the same output
# file.  Looks like a copy-paste slip -- replace with Ecuador's real IDs.
league_id = 16736
season_id = 48353

ecuador = load_file(league_id, season_id)
ecuador_final = run(ecuador, league_id)

  0%|          | 0/17 [00:00<?, ?it/s]

In [29]:
# Persist the scraped team info for the league currently held in league_id.
create_all_files(df=ecuador_final, league_id=league_id)