In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from glob import glob 
from os.path import join

# Output locations: league files go directly into `folder`, one file per team
# goes into `teams_folder`.
folder = "scoreboard-players"
base_url = "https://www.scoreboard.com"
teams_folder = os.path.join(folder, "teams")

# makedirs also creates the parent `folder` (plain mkdir fails on a fresh
# checkout where it does not exist yet) and exist_ok makes re-running this
# cell harmless.
os.makedirs(teams_folder, exist_ok=True)

## Get countries

In [None]:
# Download the soccer landing page and parse it.
# The timeout keeps the notebook from hanging forever on a stalled connection;
# raise_for_status() fails right here on an HTTP error instead of letting an
# error page be parsed as if it were the real content.
soccer_page = requests.get(base_url + "/en/soccer/", timeout=30)
soccer_page.raise_for_status()
soccer_soup = BeautifulSoup(soccer_page.content, "html5lib")

In [62]:
# The menu consists of the <ul> lists that are direct children of the
# "left-content" div; each direct <li> child of those lists is one entry.
left_content = soccer_soup.find("div", {"class":"left-content"})
uls = left_content.findAll("ul", recursive=False)
region_lis = [ul.findAll("li", recursive=False) for ul in uls]

# Collect [name, href, numeric id] triples, keeping only entries that link somewhere.
countries_links = []
for entries in region_lis:
    for entry in entries:
        # The number follows a six-character prefix in the <li> id attribute.
        id_ = int(entry.get('id')[6:])
        anchor = entry.find('a')
        target = anchor.get('href') if anchor else None
        if target:
            countries_links.append([anchor.text.strip(), target, id_])

countries_links[:10]

[['Bermuda', '/en/soccer/bermuda/', 230],
 ['Canada', '/en/soccer/canada/', 47],
 ['Costa Rica', '/en/soccer/costa-rica/', 57],
 ['Dominican Republic', '/en/soccer/dominican-republic/', 66],
 ['El Salvador', '/en/soccer/el-salvador/', 70],
 ['Guatemala', '/en/soccer/guatemala/', 85],
 ['Haiti', '/en/soccer/haiti/', 89],
 ['Honduras', '/en/soccer/honduras/', 90],
 ['Jamaica', '/en/soccer/jamaica/', 99],
 ['Mexico', '/en/soccer/mexico/', 128]]

## Get leagues per country

In [60]:
def get_leagues(string):
    """Extract the values of all ``U``-keyed fields from a delimited feed.

    A value starts right after a "÷" that is immediately preceded by "U"
    and runs up to the next "¬". Everything outside such a span is ignored.

    Args:
        string: The raw feed text.

    Returns:
        list[str]: The non-empty values, in order of appearance. A value that
        is not terminated by "¬" before the end of the text is dropped, and
        empty values ("U÷¬") are skipped.
    """
    leagues = []
    current_league = []
    in_string = False
    # Track the previous character explicitly: indexing string[i - 1] at
    # i == 0 would wrap around and look at the LAST character of the text.
    previous = ''
    for c in string:
        if c == "÷" and previous == "U":
            in_string = True
        elif c == "¬" and in_string:
            # Close the field even when it is empty; otherwise the delimiter
            # and the following fields would leak into the next value.
            if current_league:
                leagues.append(''.join(current_league))
            current_league = []
            in_string = False
        elif in_string:
            current_league.append(c)
        previous = c
    return leagues

In [69]:
# For every [name, href, id] entry scraped from the menu, download the
# per-country feed addressed by the numeric id and pull the league values
# out of it with get_leagues().
countries = []
for name, link, country_id in countries_links:
    # Build the URL from base_url so the host is defined in one place only;
    # the timeout prevents one stalled request from blocking the whole loop.
    country_page = requests.get(
        base_url + "/en/x/req/m_1_" + str(country_id),
        timeout=30,
    )
    text = country_page.content.decode('UTF-8')
    countries.append({
        'name': name,
        'link': link,
        'leagues': get_leagues(text),
    })
print(len(countries))

168


In [71]:
# Persist the country/league index next to the other scraped files.
countries_path = os.path.join(folder, "countries.json")
with open(countries_path, "w") as out_file:
    json.dump(countries, out_file, indent=2)

## Get squads

In [80]:
def is_int(s):
    """Report whether ``int(s)`` succeeds.

    Args:
        s: The value to probe, typically the text of a table cell.

    Returns:
        bool: True when ``int(s)`` does not raise, False otherwise. Values
        that ``int`` rejects with a TypeError (such as None) also yield
        False instead of propagating the exception.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True

def get_team_id(url):
    """Build a team id from the last two path segments of ``url``.

    Args:
        url: A "/"-separated link with at least two segments.

    Returns:
        str: The last two segments joined by "_". A trailing "/" is ignored,
        so it cannot produce an empty final segment.

    Raises:
        IndexError: If ``url`` has fewer than two segments.
    """
    v = url.rstrip("/").split("/")
    return v[-2] + "_" + v[-1]

def _parse_int(text, default=-1):
    """Return ``int(text)``, or ``default`` when ``text`` is not an integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def get_squad(squad_soup):
    """Extract the squad members from a parsed squad page.

    Walks the rows of the first table whose class contains "squad-table".
    Rows with class "player-type-title" set the group name that is attached
    to the "player" / "coach" rows that follow them.

    Args:
        squad_soup: Parsed page offering ``select()``, as returned by
            BeautifulSoup.

    Returns:
        list[dict]: One dict per player/coach row with the keys ``jersey``,
        ``kind``, ``type``, ``name``, ``age``, ``country`` and ``link``.
        ``jersey`` and ``age`` are -1 when the cell is not an integer;
        ``country`` is None when the row has no ``<span>``.

    Raises:
        IndexError: If the page contains no squad table.
    """
    squad_table = squad_soup.select("table[class*='squad-table']")[0]
    type_title = None
    squad = []
    for tr in squad_table.findAll("tr"):
        # Rows without a class attribute must not abort the whole parse.
        classes = tr.get("class") or []
        tr_class = classes[0] if classes else None
        if tr_class == "player-type-title":
            type_title = tr.text.strip()
        elif tr_class in ("player", "coach"):
            tds = tr.findAll("td")
            if len(tds) < 3:
                # Not a regular jersey / name / age row; nothing to extract.
                continue
            a = tds[1].find('a')
            if a is None:
                # Without the link there is neither a name nor a target.
                continue
            flag = tds[1].find('span')

            squad.append({
                'jersey': _parse_int(tds[0].text),
                'kind': tr_class,
                'type': type_title,
                'name': a.text.strip(),
                'age': _parse_int(tds[2].text),
                'country': flag.get("title") if flag is not None else None,
                'link': a.get("href"),
            })

    return squad

In [95]:
# Flatten the per-country league lists into one list of absolute league URLs.
leagues = []

for country_entry in countries:
    country_url = base_url + country_entry['link']
    leagues.extend(country_url + league_slug for league_slug in country_entry['leagues'])

In [98]:
# Ids of the teams whose squad file already exists, so an interrupted run can
# be resumed without downloading them again. basename/splitext strip the
# directory and the ".json" suffix independently of the path separator.
downloaded_squads = {
    os.path.splitext(os.path.basename(f))[0]
    for f in glob(os.path.join(teams_folder, "*_*.json"))
}

for league_url in leagues:
    parts = league_url.split("/")
    country = parts[-2]
    league_id = parts[-1]
    file = os.path.join(folder, country + "_" + league_id + ".json")
    # The league file is written last, so its presence means the league is done.
    if os.path.exists(file):
        continue

    teams_page = requests.get(league_url + "/teams/", timeout=30)
    teams_soup = BeautifulSoup(teams_page.content, "html5lib")
    participants = teams_soup.find("div", id="tournament-page-participants")
    name_div = teams_soup.find("div", {'class': 'tournament-name'})
    if participants is None or name_div is None:
        # Without these elements there is nothing to record; skip the league
        # (no file is written, so it is retried on the next run) instead of
        # aborting the whole download with an AttributeError.
        print("skipping (unexpected page layout): " + league_url)
        continue

    # One link per participant row; rows without a usable link are ignored.
    teams_a = []
    for tr in participants.findAll("tr"):
        if not tr.get("id", "").startswith("participant_"):
            continue
        a = tr.find("a")
        if a is not None and a.get("href"):
            teams_a.append(a)

    league = {
        'link': league_url,
        'name': name_div.text.strip(),
        'country': country
    }

    teams = []
    for a in teams_a:
        team_id = get_team_id(a.get("href"))
        teams.append(team_id)
        if team_id not in downloaded_squads:
            squad_link = base_url + a.get("href") + "/squad"
            squad_page = requests.get(squad_link, timeout=30)
            squad_soup = BeautifulSoup(squad_page.content, "html5lib")
            team = {
                'id': team_id,
                'link': base_url + a.get("href"),
                'name': a.text.strip(),
                'country': country
            }
            try:
                team["squad"] = get_squad(squad_soup)
            except Exception as exc:
                # Best effort: the team is still stored, just without a
                # "squad" key. Report the failure instead of hiding it, and
                # let KeyboardInterrupt / SystemExit propagate.
                print("no squad for " + team_id + ": " + repr(exc))
            with open(join(teams_folder, team_id + ".json"), "w") as w:
                json.dump(team, w, indent=4)
            downloaded_squads.add(team["id"])

    league["teams"] = teams
    with open(file, "w") as w:
        json.dump(league, w, indent=1)
        print(file)

scoreboard-players/bermuda_premier-league.json
scoreboard-players/canada_championship.json
scoreboard-players/canada_csl.json
scoreboard-players/canada_u-sports.json
scoreboard-players/costa-rica_primera-division.json
scoreboard-players/costa-rica_copa-costa-rica.json
scoreboard-players/costa-rica_super-cup.json
scoreboard-players/dominican-republic_ldf.json
scoreboard-players/el-salvador_primera-division.json
scoreboard-players/guatemala_liga-nacional.json
scoreboard-players/haiti_championnat-national.json
scoreboard-players/honduras_liga-nacional.json
scoreboard-players/jamaica_premier-league.json
scoreboard-players/mexico_primera-division.json
scoreboard-players/mexico_liga-de-ascenso.json
scoreboard-players/mexico_liga-mx-women.json
scoreboard-players/mexico_copa-mexico.json
scoreboard-players/mexico_campeon-de-campeones.json
scoreboard-players/nicaragua_primera-division.json
scoreboard-players/panama_lpf.json
scoreboard-players/trinidad-and-tobago_pro-league.json
scoreboard-player

scoreboard-players/czech-republic_zlinsky-kp.json
scoreboard-players/czech-republic_moravskoslezsky-kp.json
scoreboard-players/czech-republic_mol-cup.json
scoreboard-players/czech-republic_super-cup.json
scoreboard-players/czech-republic_youth-league.json
scoreboard-players/czech-republic_u19-league.json
scoreboard-players/czech-republic_tipsport-liga.json
scoreboard-players/czech-republic_first-league-women.json
scoreboard-players/czech-republic_czech-cup-women.json
scoreboard-players/denmark_superliga.json
scoreboard-players/denmark_1st-division.json
scoreboard-players/denmark_2nd-division-group-1.json
scoreboard-players/denmark_2nd-division-group-2.json
scoreboard-players/denmark_2nd-division-group-3.json
scoreboard-players/denmark_2nd-division-promotion-group.json
scoreboard-players/denmark_2nd-division-relegation-group.json
scoreboard-players/denmark_2nd-division-east.json
scoreboard-players/denmark_2nd-division-west.json
scoreboard-players/denmark_2nd-division-play-offs.json
scor

scoreboard-players/ireland_fai-cup.json
scoreboard-players/ireland_league-cup.json
scoreboard-players/ireland_super-cup.json
scoreboard-players/israel_ligat-ha-al.json
scoreboard-players/israel_leumit-league.json
scoreboard-players/israel_state-cup.json
scoreboard-players/israel_toto-cup.json
scoreboard-players/israel_super-cup.json
scoreboard-players/italy_serie-a.json
scoreboard-players/italy_serie-b.json
scoreboard-players/italy_serie-c-play-out.json
scoreboard-players/italy_serie-c-group-a.json
scoreboard-players/italy_serie-c-group-b.json
scoreboard-players/italy_serie-c-group-c.json
scoreboard-players/italy_serie-c-promotion-play-offs.json
scoreboard-players/italy_serie-d-group-a.json
scoreboard-players/italy_serie-d-group-b.json
scoreboard-players/italy_serie-d-group-c.json
scoreboard-players/italy_serie-d-group-d.json
scoreboard-players/italy_serie-d-group-e.json
scoreboard-players/italy_serie-d-group-f.json
scoreboard-players/italy_serie-d-group-g.json
scoreboard-players/italy

scoreboard-players/slovakia_divison-c-bratislava.json
scoreboard-players/slovakia_slovak-cup.json
scoreboard-players/slovakia_super-cup.json
scoreboard-players/slovenia_prva-liga.json
scoreboard-players/slovenia_2-snl.json
scoreboard-players/slovenia_slovenian-cup.json
scoreboard-players/slovenia_super-cup.json
scoreboard-players/spain_laliga.json
scoreboard-players/spain_laliga2.json
scoreboard-players/spain_segunda-division-b-group-1.json
scoreboard-players/spain_segunda-division-b-group-2.json
scoreboard-players/spain_segunda-division-b-group-3.json
scoreboard-players/spain_segunda-division-b-group-4.json
scoreboard-players/spain_segunda-division-b-winners-play-offs.json
scoreboard-players/spain_segunda-division-b-losers-play-offs.json
scoreboard-players/spain_segunda-division-b-play-offs.json
scoreboard-players/spain_segunda-division-b-play-out.json
scoreboard-players/spain_tercera-division-group-1.json
scoreboard-players/spain_tercera-division-group-2.json
scoreboard-players/spain

scoreboard-players/south-korea_league-cup.json
scoreboard-players/south-korea_korean-cup.json
scoreboard-players/south-korea_wk-league-women.json
scoreboard-players/sri-lanka_champions-league.json
scoreboard-players/syria_premier-league.json
scoreboard-players/syria_syria-cup.json
scoreboard-players/tajikistan_vysshaya-liga.json
scoreboard-players/thailand_thai-premier-league.json
scoreboard-players/thailand_thai-fa-cup.json
scoreboard-players/thailand_champions-cup.json
scoreboard-players/turkmenistan_yokary-liga.json
scoreboard-players/united-arab-emirates_uae-league.json
scoreboard-players/united-arab-emirates_arabian-gulf-cup.json
scoreboard-players/united-arab-emirates_super-cup.json
scoreboard-players/united-arab-emirates_presidents-cup.json
scoreboard-players/united-arab-emirates_fa-cup.json
scoreboard-players/uzbekistan_super-league.json
scoreboard-players/uzbekistan_uzbekistan-cup.json
scoreboard-players/vietnam_v-league.json
scoreboard-players/yemen_division-1.json
scoreboard

scoreboard-players/europe_salzburg-airport-cup.json
scoreboard-players/europe_setanta-sports-cup.json
scoreboard-players/europe_tirol-cup.json
scoreboard-players/europe_uusi-lahti-cup.json
scoreboard-players/europe_uhren-cup.json
scoreboard-players/europe_joint-super-cup.json
scoreboard-players/europe_memorial-cup.json
scoreboard-players/europe_costa-del-sol-trophy.json
scoreboard-players/europe_euro-women.json
scoreboard-players/europe_champions-league-women.json
scoreboard-players/europe_euro-u19-women.json
scoreboard-players/europe_euro-u17-women.json
scoreboard-players/europe_premier-league-crimea.json
scoreboard-players/europe_czech-slovak-super-cup.json
scoreboard-players/europe_elite-league-u20.json
scoreboard-players/north-central-america_gold-cup.json
scoreboard-players/north-central-america_concacaf-champions-league.json
scoreboard-players/north-central-america_concacaf-nations-league.json
scoreboard-players/north-central-america_concacaf-cup.json
scoreboard-players/north-cen