In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
from glob import glob 
from os.path import join

# Root output directory: countries.json and the per-league JSON files are written here.
folder = "scoreboard-players"
# Scheme and host that every site-relative link is appended to.
base_url = "https://www.scoreboard.com"
# Sub-folder holding one JSON file per team.
teams_folder = os.path.join(folder, "teams")

# makedirs also creates `folder` itself; a plain os.mkdir raised
# FileNotFoundError on a fresh checkout where the parent did not exist yet.
os.makedirs(teams_folder, exist_ok=True)

## Get countries

In [None]:
# Download and parse the soccer landing page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status stops us from parsing an HTTP error page as if it were real content.
soccer_page = requests.get(base_url + "/en/soccer/", timeout=30)
soccer_page.raise_for_status()
soccer_soup = BeautifulSoup(soccer_page.content, "html5lib")

In [None]:
# The entries are read from the direct <ul> children of the "left-content" div.
left_content = soccer_soup.find("div", {"class": "left-content"})
if left_content is None:
    # Fail with a readable message instead of "'NoneType' has no attribute ...".
    raise RuntimeError('no <div class="left-content"> found on the soccer page')
uls = left_content.find_all("ul", recursive=False)

# Each entry is [link text, href, numeric id taken from the <li> id attribute].
countries_links = []
region_lis = [ul.find_all("li", recursive=False) for ul in uls]
for region in region_lis:
    for li in region:
        a = li.find('a')
        if a is None or not a.get('href'):
            continue
        href = a.get('href')
        # The number starts at the 7th character of the id attribute
        # (presumably after a fixed 6-character prefix -- TODO confirm against the page).
        # Items with a missing or non-numeric id are skipped rather than
        # aborting the whole cell.
        try:
            id_ = int((li.get('id') or '')[6:])
        except ValueError:
            continue
        countries_links.append([a.text.strip(), href, id_])

countries_links[:10]

## Get leagues per country

In [None]:
def get_leagues(string):
    """Extract every non-empty value that follows a ``U÷`` marker.

    The text is scanned for fields of the form ``...U÷<value>¬``: a value
    starts right after a ``÷`` whose preceding character is ``U`` and ends
    at the next ``¬``.

    Parameters
    ----------
    string : str
        Raw response text to scan.

    Returns
    -------
    list of str
        The extracted values, in order of appearance. Empty values and a
        trailing value that is never closed by ``¬`` are not returned.
    """
    leagues = []
    buffer = []          # characters of the value currently being read
    in_value = False
    previous = ''        # explicit look-behind; string[i-1] wrapped around at i == 0
    for char in string:
        if char == "÷" and previous == "U":
            in_value = True
        elif char == "¬" and in_value:
            # Always close the field here. Before, an empty value ("U÷¬")
            # fell through to the append branch, so the delimiter and the
            # following record leaked into the result.
            in_value = False
            if buffer:
                leagues.append(''.join(buffer))
            buffer = []
        elif in_value:
            buffer.append(char)
        previous = char
    return leagues

In [None]:
# For every [name, link, id] entry, download the text returned by the
# "/en/x/req/m_1_<id>" endpoint and pull the league values out of it.
countries = []
for name, link, country_id in countries_links:
    # Build the URL from base_url instead of repeating the host literal, and
    # set a timeout so one stalled request cannot hang the whole loop.
    country_page = requests.get(base_url + "/en/x/req/m_1_" + str(country_id), timeout=30)
    text = country_page.content.decode('UTF-8')
    countries.append({
        'name': name,
        'link': link,
        'leagues': get_leagues(text),
    })
print(len(countries))

In [None]:
# Save the collected country records as indented JSON inside the output folder.
countries_path = os.path.join(folder, "countries.json")
with open(countries_path, "w") as out_file:
    json.dump(countries, out_file, indent=2)

## Get squads

In [None]:
def is_int(s):
    """Tell whether ``int(s)`` would succeed.

    Parameters
    ----------
    s : object
        Value to probe, typically the text of a table cell.

    Returns
    -------
    bool
        True if ``int(s)`` succeeds. False if it raises ValueError (e.g. a
        non-numeric string) or TypeError (e.g. ``None``); the latter used to
        escape as an exception.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True

def get_team_id(url):
    """Build a team id from the last two "/"-separated segments of ``url``.

    Returns ``"<second-to-last>_<last>"``. Raises IndexError when ``url``
    contains no "/" at all.
    """
    segments = url.split("/")
    slug, code = segments[-2], segments[-1]
    return "{}_{}".format(slug, code)

def get_squad(squad_soup):
    """Extract players and coaches from a parsed squad page.

    Parameters
    ----------
    squad_soup : BeautifulSoup
        Parsed squad page; must contain a table whose class attribute
        contains ``squad-table``.

    Returns
    -------
    list of dict
        One dict per table row whose first CSS class is ``player`` or
        ``coach``, with keys:

        - ``jersey``: int from the first cell, or -1 if it is not an integer
        - ``kind``: ``"player"`` or ``"coach"``
        - ``type``: text of the most recent ``player-type-title`` row, or
          None if no such row came before
        - ``name`` / ``link``: text and href of the anchor in the second cell
          (cell text and None when the cell has no anchor)
        - ``age``: int from the third cell, or -1 if it is not an integer
        - ``country``: ``title`` of the ``<span>`` in the second cell, or
          None when there is no span

    Raises
    ------
    IndexError
        If the page contains no matching squad table.
    """
    squad_table = squad_soup.select("table[class*='squad-table']")[0]
    type_title = None
    squad = []
    for tr in squad_table.find_all("tr"):
        # Rows without a class attribute (tr.get returns None) used to raise
        # TypeError here and throw away the whole squad.
        tr_class = (tr.get("class") or [None])[0]
        if tr_class == "player-type-title":
            type_title = tr.text.strip()
        elif tr_class in ("player", "coach"):
            tds = tr.find_all("td")
            if len(tds) < 3:
                # Row does not have the number / name / age cells; skip it.
                continue
            jersey_number = int(tds[0].text) if is_int(tds[0].text) else -1
            age = int(tds[2].text) if is_int(tds[2].text) else -1

            # The span and the anchor are both optional: a missing one must
            # not abort the other rows.
            flag = tds[1].find('span')
            country = flag.get("title") if flag is not None else None
            a = tds[1].find('a')
            if a is not None:
                name, link = a.text.strip(), a.get("href")
            else:
                name, link = tds[1].text.strip(), None

            squad.append({
                'jersey': jersey_number,
                'kind': tr_class,
                'type': type_title,
                'name': name,
                'age': age,
                'country': country,
                'link': link,
            })

    return squad

In [None]:
# Flatten the per-country league values into one list of absolute league URLs:
# base_url + the country's link + the league value.
leagues = []

for country_entry in countries:
    country_url = base_url + country_entry['link']
    leagues.extend(country_url + league_path
                   for league_path in country_entry['leagues'])

In [None]:
# Ids of the teams already saved in teams_folder (file name without ".json"),
# so a re-run does not download them again. basename/splitext instead of
# split("/") keeps this working when glob returns backslash-separated paths.
downloaded_squads = {
    os.path.splitext(os.path.basename(path))[0]
    for path in glob(os.path.join(teams_folder, "*_*.json"))
}

# Seconds; without a timeout a single stalled connection hangs the cell forever.
request_timeout = 30

for league_url in leagues:
    parts = league_url.split("/")
    country = parts[-2]
    league_id = parts[-1]
    # The league file is written last, after all of its teams were processed,
    # so an existing file means this league is already done.
    file = os.path.join(folder, country + "_" + league_id + ".json")
    if os.path.exists(file):
        continue

    teams_page = requests.get(league_url + "/teams/", timeout=request_timeout)
    teams_soup = BeautifulSoup(teams_page.content, "html5lib")
    participants = teams_soup.find("div", id="tournament-page-participants")
    if participants is None:
        # This used to raise AttributeError, which stopped the loop and, since
        # no file is written, blocked every later league on each re-run.
        print("no participants found, skipping", league_url)
        continue
    teams_a = [tr.find("a")
               for tr in participants.find_all("tr")
               if tr.get("id", "").startswith("participant_")]

    name_div = teams_soup.find("div", {'class': 'tournament-name'})
    league = {
        'link': league_url,
        'name': name_div.text.strip() if name_div is not None else None,
        'country': country
    }

    teams = []
    for a in teams_a:
        # A participant row without a usable link cannot be resolved to a team.
        if a is None or not a.get("href"):
            continue
        team_href = a.get("href")
        team_id = get_team_id(team_href)
        teams.append(team_id)
        if team_id not in downloaded_squads:
            squad_link = base_url + team_href + "/squad"
            squad_page = requests.get(squad_link, timeout=request_timeout)
            squad_soup = BeautifulSoup(squad_page.content, "html5lib")
            team = {
                'id': team_id,
                'link': base_url + team_href,
                'name': a.text.strip(),
                'country': country
            }
            try:
                team["squad"] = get_squad(squad_soup)
            except Exception as exc:
                # Best effort, as before: the team is still saved, just without
                # a "squad" key. Unlike the bare except this reports the problem
                # and lets KeyboardInterrupt through.
                print("could not parse squad for", team_id, "-", repr(exc))
            with open(join(teams_folder, team_id + ".json"), "w") as w:
                json.dump(team, w, indent=4)
            downloaded_squads.add(team["id"])

    league["teams"] = teams
    with open(file, "w") as w:
        json.dump(league, w, indent=1)
        print(file)