In [1]:
import requests
from pathlib import Path
import os

import shelve

from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Scores-and-fixtures pages on fbref.com, keyed by league and then by season.
# The '2020-2021' entries are the only URLs without a numeric season id in the
# path. NOTE(review): presumably they resolve to whichever season is current
# on the site -- confirm before relying on the label.
fixtures_url = {
    "EPL": {
        "2020-2021": "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
        "2019-2020": "https://fbref.com/en/comps/9/3232/schedule/2019-2020-Premier-League-Scores-and-Fixtures",
        "2018-2019": "https://fbref.com/en/comps/9/1889/schedule/2018-2019-Premier-League-Scores-and-Fixtures",
        "2017-2018": "https://fbref.com/en/comps/9/1631/schedule/2017-2018-Premier-League-Scores-and-Fixtures",
        # Older seasons, currently disabled:
        # "2016-2017": "https://fbref.com/en/comps/9/1526/schedule/2016-2017-Premier-League-Scores-and-Fixtures",
        # "2015-2016": "https://fbref.com/en/comps/9/1467/schedule/2015-2016-Premier-League-Scores-and-Fixtures",
        # "2014-2015": "https://fbref.com/en/comps/9/733/schedule/2014-2015-Premier-League-Scores-and-Fixtures",
    },
    "Ligue1": {
        "2020-2021": "https://fbref.com/en/comps/13/schedule/Ligue-1-Scores-and-Fixtures",
        "2019-2020": "https://fbref.com/en/comps/13/3243/schedule/2019-2020-Ligue-1-Scores-and-Fixtures",
        "2018-2019": "https://fbref.com/en/comps/13/2104/schedule/2018-2019-Ligue-1-Scores-and-Fixtures",
        "2017-2018": "https://fbref.com/en/comps/13/1632/schedule/2017-2018-Ligue-1-Scores-and-Fixtures",
    },
    "Bundesliga": {
        "2020-2021": "https://fbref.com/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures",
        "2019-2020": "https://fbref.com/en/comps/20/3248/schedule/2019-2020-Bundesliga-Scores-and-Fixtures",
        "2018-2019": "https://fbref.com/en/comps/20/2109/schedule/2018-2019-Bundesliga-Scores-and-Fixtures",
        "2017-2018": "https://fbref.com/en/comps/20/1634/schedule/2017-2018-Bundesliga-Scores-and-Fixtures",
    },
    "SerieA": {
        "2020-2021": "https://fbref.com/en/comps/11/schedule/Serie-A-Scores-and-Fixtures",
        "2019-2020": "https://fbref.com/en/comps/11/3260/schedule/2019-2020-Serie-A-Scores-and-Fixtures",
        "2018-2019": "https://fbref.com/en/comps/11/1896/schedule/2018-2019-Serie-A-Scores-and-Fixtures",
        "2017-2018": "https://fbref.com/en/comps/11/1640/schedule/2017-2018-Serie-A-Scores-and-Fixtures",
    },
    "LaLiga": {
        "2020-2021": "https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures",
        "2019-2020": "https://fbref.com/en/comps/12/3239/schedule/2019-2020-La-Liga-Scores-and-Fixtures",
        "2018-2019": "https://fbref.com/en/comps/12/1886/schedule/2018-2019-La-Liga-Scores-and-Fixtures",
        "2017-2018": "https://fbref.com/en/comps/12/1652/schedule/2017-2018-La-Liga-Scores-and-Fixtures",
    },
}

In [3]:


# def shelve_it(file_name):
#     d = shelve.open(file_name)

#     def decorator(func):
#         def new_func(param):
#             if param not in d:
#                 d[param] = func(param)
#             else:
#                 print('Found Cached!')
#             return d[param]

#         return new_func

#     return decorator

In [4]:

# @shelve_it('matches.shelve')

In [5]:
from diskcache import Cache

# On-disk cache so that re-running the notebook does not download every page
# again. The directory name is left unchanged so an existing cache keeps
# being used.
cache = Cache("matches.shelve")

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


@cache.memoize()
def get_page(link):
    """Download ``link`` and return the ``requests.Response``, memoized on disk.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    requests.Response
        The successful response for ``link``; later calls with the same URL
        are served from the ``diskcache`` store without touching the network.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``REQUEST_TIMEOUT`` is exceeded.
    """
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    # Fail before returning: only returned values are memoized, so raising
    # here keeps error pages (rate limiting, server errors) out of the cache.
    response.raise_for_status()
    return response

In [6]:
def _is_schedule_div_id(value):
    """Return True when ``value`` is an element id starting with ``div_sched``."""
    return bool(value) and value.startswith("div_sched")


def _parse_schedule_row(tr, league, season):
    """Convert one ``<tr>`` of the schedule table into a list of cell texts.

    The list starts with ``league`` and ``season``, followed by the stripped
    text of every ``<th>`` cell and then every ``<td>`` cell. A cell whose
    text is exactly ``'Match Report'`` and that contains a link is replaced by
    the absolute fbref URL of that link.

    Returns ``None`` when the row has no non-empty cell, so callers can skip it.
    """
    row = [league, season]
    for cell in tr.find_all('th'):
        row.append(cell.text.strip())
    for cell in tr.find_all('td'):
        text = cell.text.strip()
        if text == 'Match Report':
            anchor = cell.find('a')
            if anchor is not None:
                # Keep the link to the report instead of the literal label.
                text = 'https://fbref.com' + anchor['href'].strip()
        row.append(text)

    # Everything after the two leading labels came from the page itself.
    if all(text == '' for text in row[2:]):
        return None
    return row


# Both dicts are keyed as [league][season]:
#   columns  -> list of column names ('League', 'Season' + table header texts)
#   datasets -> list of rows (lists of strings) in the same order as `columns`
columns = {}
datasets = {}

for league, seasons in fixtures_url.items():
    for season, link in seasons.items():
        print(league, season)
        page = get_page(link)
        soup = BeautifulSoup(page.content, 'html.parser')

        results = soup.find("div", id=_is_schedule_div_id)
        if results is None:
            # Typically means the response was not the expected schedule page.
            raise ValueError(
                f"No schedule table found for {league} {season} at {link}"
            )

        column = ['League', 'Season'] + [
            th.text.strip() for th in results.find('thead').find_all('th')
        ]

        data = []
        for tr in results.find('tbody').find_all('tr'):
            row = _parse_schedule_row(tr, league, season)
            if row is not None:
                data.append(row)

        columns.setdefault(league, {})[season] = column
        datasets.setdefault(league, {})[season] = data
        
def _to_nullable_int(series):
    """Convert a string Series to nullable ``Int64``.

    Commas are removed first (thousands separators); values that still cannot
    be parsed become ``<NA>``.
    """
    cleaned = series.str.replace(',', '', regex=False)
    return pd.to_numeric(cleaned, errors='coerce').astype('Int64')


def _build_matches_frame(rows, column_names):
    """Build the typed matches DataFrame for one league/season.

    Parameters
    ----------
    rows : list of list of str
        Scraped table rows.
    column_names : list of str
        Column names matching ``rows``; must contain 'Date', 'Score', 'Notes',
        'Home', 'Away', 'Attendance' and 'Wk'. 'xG' may appear twice.

    Returns
    -------
    pandas.DataFrame
        Frame with 'Date' parsed as datetime, 'Score' split into the nullable
        integer columns 'HomeScore'/'AwayScore', 'Home'/'Away' renamed to
        'HomeTeam'/'AwayTeam', and 'Score'/'Notes' removed.
    """
    df = pd.DataFrame(rows, columns=column_names)
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

    # Scores are written as "<home>–<away>" with an en dash. Extracting two
    # fixed groups always produces two columns, even when no row has a score
    # yet (a plain split would then yield a single column and the assignment
    # below would fail).
    scores = df['Score'].str.extract(r'(\d+)\s*–\s*(\d+)')
    df['HomeScore'] = pd.to_numeric(scores[0], errors='coerce').astype('Int64')
    df['AwayScore'] = pd.to_numeric(scores[1], errors='coerce').astype('Int64')

    # `columns=` instead of a positional axis: pandas >= 2.0 no longer accepts
    # the axis as a positional argument.
    df = df.drop(columns=['Score', 'Notes'])
    df = df.rename(columns={'Home': 'HomeTeam', 'Away': 'AwayTeam'})
    df['Attendance'] = _to_nullable_int(df['Attendance'])

    if 'xG' in df.columns:
        # The table carries two columns both labelled 'xG'. Renaming the first
        # match makes the second lookup land on the remaining one.
        cols = list(df.columns)
        cols[cols.index('xG')] = 'xG_Home'
        cols[cols.index('xG')] = 'xG_Away'
        df.columns = cols
        df['xG_Home'] = pd.to_numeric(df['xG_Home'], errors='coerce').astype('Float64')
        df['xG_Away'] = pd.to_numeric(df['xG_Away'], errors='coerce').astype('Float64')

    df['Wk'] = _to_nullable_int(df['Wk'])
    return df


# Write one CSV per league/season to dfs/<league>/matches/<season>.csv
for league, seasons in datasets.items():
    for season, rows in seasons.items():
        df = _build_matches_frame(rows, columns[league][season])

        path = Path(f'dfs/{league}/matches')
        path.mkdir(parents=True, exist_ok=True)

        df.to_csv(path / f'{season}.csv', index=False)
        

EPL 2020-2021
EPL 2019-2020
EPL 2018-2019
EPL 2017-2018
Ligue1 2020-2021
Ligue1 2019-2020
Ligue1 2018-2019
Ligue1 2017-2018
Bundesliga 2020-2021
Bundesliga 2019-2020
Bundesliga 2018-2019
Bundesliga 2017-2018
SerieA 2020-2021
SerieA 2019-2020
SerieA 2018-2019
SerieA 2017-2018
LaLiga 2020-2021
LaLiga 2019-2020
LaLiga 2018-2019
LaLiga 2017-2018
