In [31]:
import datetime
from datetime import date
import requests
import pprint
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from urllib.request import urlopen
import glob
import altair as alt
import time
import itertools
import numpy as np
import operator

# Let pandas display up to 100 columns so wide frames are not truncated in cell output.
pd.options.display.max_columns = 100



In [34]:
# Helper Functions

def Filter(string, substr):
    """Return the items of ``string`` that contain at least one entry of ``substr``.

    Parameters
    ----------
    string : iterable of str
        Candidate strings (e.g. URLs); their order is preserved in the result.
    substr : iterable of str
        Substrings to look for; an item is kept as soon as one of them matches.

    Returns
    -------
    list of str
        The matching items. Empty when ``substr`` is empty.
    """
    kept = []
    for candidate in string:
        for needle in substr:
            if needle in candidate:
                kept.append(candidate)
                break  # one hit is enough, same as any()
    return kept
    
def NOTFilter(string, substr):
    """Return the items of ``string`` that contain none of the entries of ``substr``.

    This is the complement of ``Filter``. The previous implementation used
    ``any(sub not in item ...)``, which kept an item whenever at least one
    substring was missing; with more than one substring that let through items
    that should have been excluded. Results for a single-element ``substr``
    (the only way the notebook calls it) are unchanged.

    Parameters
    ----------
    string : iterable of str
        Candidate strings; their order is preserved in the result.
    substr : iterable of str
        Substrings that disqualify an item.

    Returns
    -------
    list of str
        Items containing none of the substrings. With an empty ``substr``
        nothing is excluded, so every item is returned.
    """
    return [item for item in string if not any(sub in item for sub in substr)]

In [37]:
# England: 9 | Italy: 11 | Spain: 12 | France: 13 | Germany: 20
def _season_start_year(season_url):
    """Return the first year of a ``YYYY-YYYY-`` season label in the URL, or None if there is none."""
    match = re.search(r"/(\d{4})-\d{4}-", season_url)
    return int(match.group(1)) if match else None


def fbref_league_history(league_id=(9, 11, 12, 13, 20), first_season=2010):
    """Collect the FBRef season-page URLs for one or more competitions.

    For each competition id the ``/history`` page is downloaded and the link
    inside every ``class="left"`` cell tagged ``data-stat="season"`` is
    collected. Seasons whose label starts before ``first_season`` are dropped;
    URLs without a ``YYYY-YYYY`` label (the current-season page) are kept.

    Parameters
    ----------
    league_id : iterable of int
        FBRef competition ids. Defaults to England (9), Italy (11),
        Spain (12), France (13) and Germany (20).
    first_season : int
        Starting year of the earliest season to keep (2010 keeps 2010-2011
        and later).

    Returns
    -------
    list of str
        Absolute season URLs, grouped by competition in page order.

    Raises
    ------
    requests.HTTPError
        If a history page responds with an error status (for example when
        the site rate-limits the scraper).
    """
    season_urls = []
    for comp_id in league_id:
        comp_history_url = "https://fbref.com/en/comps/" + str(comp_id) + "/history"

        # A timeout stops a stalled connection from hanging the notebook, and
        # the status check stops an error page from being parsed as "no seasons".
        response = requests.get(comp_history_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        for cell in soup.find_all(class_="left"):
            if cell.get("data-stat") != "season":
                continue
            link = cell.find("a", href=True)
            if link is None:
                # Season cell without a hyperlink: nothing to follow.
                continue
            season_urls.append("https://fbref.com" + link["href"])

        time.sleep(0.1)

    # Single pass over the URLs; comparing the parsed start year also removes
    # seasons older than the previous hard-coded 1950 lower bound.
    kept = []
    for url in season_urls:
        start_year = _season_start_year(url)
        if start_year is None or start_year >= first_season:
            kept.append(url)
    return kept


In [38]:
# Collect season-page URLs for the Big 5 leagues
# (England 9, Italy 11, Spain 12, France 13, Germany 20).
# `first_season` is not passed, so the function's default applies.
history = fbref_league_history(league_id = [9,11,12,13,20])
history

['https://fbref.com/en/comps/9/Premier-League-Stats',
 'https://fbref.com/en/comps/9/10728/2020-2021-Premier-League-Stats',
 'https://fbref.com/en/comps/9/3232/2019-2020-Premier-League-Stats',
 'https://fbref.com/en/comps/9/1889/2018-2019-Premier-League-Stats',
 'https://fbref.com/en/comps/9/1631/2017-2018-Premier-League-Stats',
 'https://fbref.com/en/comps/9/1526/2016-2017-Premier-League-Stats',
 'https://fbref.com/en/comps/9/1467/2015-2016-Premier-League-Stats',
 'https://fbref.com/en/comps/9/733/2014-2015-Premier-League-Stats',
 'https://fbref.com/en/comps/9/669/2013-2014-Premier-League-Stats',
 'https://fbref.com/en/comps/9/602/2012-2013-Premier-League-Stats',
 'https://fbref.com/en/comps/9/534/2011-2012-Premier-League-Stats',
 'https://fbref.com/en/comps/9/467/2010-2011-Premier-League-Stats',
 'https://fbref.com/en/comps/11/Serie-A-Stats',
 'https://fbref.com/en/comps/11/10730/2020-2021-Serie-A-Stats',
 'https://fbref.com/en/comps/11/3260/2019-2020-Serie-A-Stats',
 'https://fbref.

In [39]:
def fbref_team_url_history(league_history):
    """Collect the squad-page URLs linked from each league-season page.

    Every URL in ``league_history`` is downloaded, and the links inside the
    first ``<table>`` of the page whose address contains ``/en/squads/`` are
    returned as absolute URLs.

    Parameters
    ----------
    league_history : iterable of str
        League-season page URLs, e.g. the output of ``fbref_league_history``.

    Returns
    -------
    list of str
        Squad URLs in page order, one entry per link found (a squad linked
        more than once in the table appears more than once).

    Raises
    ------
    requests.HTTPError
        If a page responds with an error status.
    """
    team_season_url = []
    for league_season_url in league_history:
        response = requests.get(league_season_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        table = soup.find("table")
        if table is None:
            # Previously this raised AttributeError on None; report and move on
            # so one odd page does not discard the URLs already collected.
            print("No table found, skipping: " + league_season_url)
            continue

        # href=True skips anchors without a target, which used to raise KeyError.
        for anchor in table.find_all("a", href=True):
            team_url = "https://fbref.com" + anchor["href"]
            if "/en/squads/" in team_url:
                team_season_url.append(team_url)

    return team_season_url



In [47]:
# Collect squad-page URLs for every league season held in `history`
# (the full multi-league list, not a single Premier League season),
# then show how many were found.
team_season_url = fbref_team_url_history(history)
len(team_season_url)

1176

In [48]:
def fbref_player_url(team_season_url):
    """Collect player profile URLs from squad pages.

    For each squad page the table body inside ``div#all_stats_standard`` is
    used when that container is present and holds a ``<tbody>``; otherwise
    the first ``<tbody>`` on the page is used. Links containing
    ``/en/players/`` and not containing ``matchlogs`` are returned as
    absolute URLs.

    Parameters
    ----------
    team_season_url : iterable of str
        Squad page URLs, e.g. the output of ``fbref_team_url_history``.

    Returns
    -------
    list of str
        Player URLs in page order. Duplicates are kept, so a player listed on
        several squad pages appears once per page.

    Raises
    ------
    requests.HTTPError
        If a page responds with an error status.
    """
    player_url = []
    for turl in team_season_url:
        response = requests.get(turl, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # The original looked this container up but threw the result away, so
        # the search was never actually scoped to it.
        stats_div = soup.find("div", {"id": "all_stats_standard"})
        tbody = stats_div.find("tbody") if stats_div is not None else None
        if tbody is None:
            tbody = soup.find("tbody")

        if tbody is None:
            # Used to raise AttributeError on None; report and keep going.
            print("No table body found, skipping: " + turl)
        else:
            for anchor in tbody.find_all("a", href=True):
                href = anchor["href"]
                if "/en/players/" in href and "matchlogs" not in href:
                    player_url.append("https://fbref.com" + href)

        time.sleep(0.01)
    return player_url

In [49]:
# Collect player profile URLs from every squad page in `team_season_url`.
# The bare `player_url` on the last line displays the whole list.
player_url = fbref_player_url(team_season_url)
player_url

['https://fbref.com/en/players/33887998/Edouard-Mendy',
 'https://fbref.com/en/players/18b896d6/Antonio-Rudiger',
 'https://fbref.com/en/players/45db685d/Jorginho',
 'https://fbref.com/en/players/1cb49278/Andreas-Christensen',
 'https://fbref.com/en/players/79c0821a/Mateo-Kovacic',
 'https://fbref.com/en/players/53cad200/Cesar-Azpilicueta',
 'https://fbref.com/en/players/5eae500a/Romelu-Lukaku',
 'https://fbref.com/en/players/fed7cb61/Kai-Havertz',
 'https://fbref.com/en/players/86e7deaf/Thiago-Silva',
 'https://fbref.com/en/players/f4290206/Marcos-Alonso',
 'https://fbref.com/en/players/1265a93a/Reece-James',
 'https://fbref.com/en/players/5515376c/Trevoh-Chalobah',
 'https://fbref.com/en/players/9674002f/Mason-Mount',
 'https://fbref.com/en/players/d2424d1b/Ben-Chilwell',
 'https://fbref.com/en/players/b9fbae28/NGolo-Kante',
 'https://fbref.com/en/players/15f3ec41/Callum-Hudson-Odoi',
 'https://fbref.com/en/players/49fe9070/Timo-Werner',
 'https://fbref.com/en/players/e97fd090/Ruben-

In [50]:
# Column order of the frame returned by fbref_player_info.
_PLAYER_INFO_COLUMNS = [
    "FBRefId",
    "PlayerName",
    "PlayerFullName",
    "Nationality",
    "Photo",
    "Birth",
    "Height",
    "Weight",
    "Position",
    "Foot",
    "FootAbility",
    "InternationalReputation",
    "PlayerUrl",
    "Twitter",
    "Instagram",
]


def _meta_span_text(meta, itemprop):
    """Return the text of ``<span itemprop=...>`` inside ``meta``, or None if it is missing."""
    if meta is None:
        return None
    span = meta.find("span", {"itemprop": itemprop})
    return None if span is None else span.text


def _parse_position(paragraph_html):
    """Extract the position label from the HTML of the paragraph mentioning "Position"; NaN if malformed."""
    try:
        after_strong = paragraph_html.split("<strong>")[1]
    except IndexError:
        return np.nan
    cleaned = (
        after_strong
        .replace("Position:</strong>", "")
        .replace("\n", "")
        .replace("<p>", "")
        .replace("</p>", "")
        .replace("\xa0", "")
        .replace("▪", "")
    )
    return cleaned.split("<span")[0].strip()


def _parse_foot(paragraph_html):
    """Return ``(foot, foot_ability)`` from the paragraph mentioning "Footed"; NaNs if malformed."""
    try:
        raw = paragraph_html.split("<strong>Footed:</strong>")[1]
        raw = raw.split("<span")[0].strip().replace("</p>", "").upper()
        parts = raw.split("% ")
        if len(parts) > 1:
            # Form "<percent>% <foot>": the percentage is the ability.
            return parts[1], int(parts[0])
        # No percentage given: treated as fully one-footed.
        return parts[0], 100
    except (IndexError, ValueError):
        return np.nan, np.nan


def fbref_player_info(player_url):
    """Scrape profile details for each FBRef player URL.

    One HTTP request is made per URL. Every field is best-effort: a value
    that cannot be found on the page is stored as NaN instead of aborting
    the run.

    Changes from the previous version:
    - progress is printed with ``print`` because ``sys`` was never imported,
      which raised NameError on a fresh kernel;
    - a missing birthplace, ``#meta`` block, ``<h1>`` or paragraph list yields
      NaN values instead of an exception;
    - an empty ``player_url`` returns an empty frame instead of failing in
      ``pd.concat``;
    - the result has a unique 0..n-1 index rather than an all-zero one.

    Parameters
    ----------
    player_url : iterable of str
        Player profile URLs of the form
        ``https://fbref.com/en/players/<id>/<name>``.

    Returns
    -------
    pandas.DataFrame
        One row per input URL (duplicates included) with the columns
        FBRefId, PlayerName, PlayerFullName, Nationality, Photo, Birth,
        Height, Weight, Position, Foot, FootAbility,
        InternationalReputation, PlayerUrl, Twitter, Instagram.

    Raises
    ------
    requests.HTTPError
        If a player page responds with an error status.
    """
    name_blockers = ["Position", "Club", "Born", "National Team", "Citizenship"]
    records = []

    for completed, url in enumerate(player_url):
        # The id is the path segment right after /en/players/.
        player_id = url.replace("https://fbref.com/en/players/", "").split("/")[0]

        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        meta = soup.find("div", {"id": "meta"})

        # Player name
        try:
            player_name = soup.find("h1").find("span").get_text()
        except AttributeError:
            player_name = np.nan

        # Nationality: last comma-separated part of the birthplace text.
        birthplace = _meta_span_text(meta, "birthPlace")
        if birthplace is None:
            nationality = np.nan
        else:
            nationality = birthplace.replace("\n", "").strip().split(", ")[-1]

        # Photo
        try:
            photo = soup.find("div", {"class": "media-item"}).find("img").attrs["src"]
        except (AttributeError, KeyError):
            photo = np.nan

        # Birth, height, weight
        birth_text = _meta_span_text(meta, "birthDate")
        birth = np.nan if birth_text is None else birth_text.replace("\n", "").strip()

        height_text = _meta_span_text(meta, "height")
        height = np.nan if height_text is None else height_text.replace("cm", "")

        weight_text = _meta_span_text(meta, "weight")
        weight = np.nan if weight_text is None else weight_text.replace("kg", "")

        detail = meta.find_all("p") if meta is not None else []

        # Full name: the first paragraph, unless it is already one of the
        # labelled rows; in that case fall back to the display name.
        if detail and not any(label in detail[0].text for label in name_blockers):
            player_full_name = detail[0].get_text()
        else:
            player_full_name = player_name

        # Position and preferred foot come from the paragraphs' raw HTML.
        paragraphs_html = [str(p) for p in detail]
        position_html = next((p for p in paragraphs_html if "Position" in p), None)
        footed_html = next((p for p in paragraphs_html if "Footed" in p), None)

        position = np.nan if position_html is None else _parse_position(position_html)
        if footed_html is None:
            foot, foot_ability = np.nan, np.nan
        else:
            foot, foot_ability = _parse_foot(footed_html)

        # International reputation: link texts of the "bling" list joined by "||".
        bling = soup.find("ul", {"id": "bling"})
        if bling is None:
            ir = np.nan
        else:
            ir = "||".join(a.text.strip() for a in bling.find_all("a"))

        # Social media: first matching link of each kind inside the meta block.
        meta_links = [a["href"] for a in meta.find_all("a", href=True)] if meta is not None else []
        tw = next((href for href in meta_links if "twitter" in href), np.nan)
        ins = next((href for href in meta_links if "instagram" in href), np.nan)

        records.append({
            "FBRefId": player_id,
            "PlayerName": player_name,
            "PlayerFullName": player_full_name,
            "Nationality": nationality,
            "Photo": photo,
            "Birth": birth,
            "Height": height,
            "Weight": weight,
            "Position": position,
            "Foot": foot,
            "FootAbility": foot_ability,
            "InternationalReputation": ir,
            "PlayerUrl": url,
            "Twitter": tw,
            "Instagram": ins,
        })

        # Progress message, rewritten in place on a single line.
        print("\r{0} players have just scraped from FBRef!".format(completed + 1), end="", flush=True)

        time.sleep(0.01)

    # Build the frame once; `columns` fixes the order and keeps the schema
    # even when no URL was given.
    return pd.DataFrame(records, columns=_PLAYER_INFO_COLUMNS)



In [52]:
# Scrape profile details for every URL in `player_url`.
# This makes one HTTP request per URL, so it takes a long time on the full list.
player_info = fbref_player_info(player_url)
player_info

2397 players have just scraped from FBRef!

KeyboardInterrupt: 

In [None]:
# Saving Player Data Dataframe
# The original referenced `frame`, which is never defined in this notebook and
# fails on Restart & Run All; the scraped player table is `player_info`.
# NOTE(review): the destination is an absolute, machine-specific path; consider
# moving it to a configurable output directory.
player_info.to_csv('/Volumes/GoogleDrive/.shortcut-targets-by-id/1KUGn_35OjAoOP2puz6yG-2g_8LBxvDG_/SIADS 697 - Capstone/Dataframes/player_data_df.csv', index=False)

# pd.DataFrame(df_39_columns_1).to_csv('/Users/vruiz.CDS/Downloads/Dataframes/consolidated_df.csv', index = False)