In [22]:
import os
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Base directory under which the generated CSV files are written (in a
# "Results" sub-folder). Defaults to the original machine-specific location;
# set the MLS_ROOT environment variable to redirect output on another machine.
ROOT = os.environ.get('MLS_ROOT', '/home/robert/.config/JetBrains/DataSpell2021.3/projects/MLS')
# Season that is scraped from the un-dated "current season" schedule URL.
CURRENT_YEAR = 2022

In [23]:
def split_score(score):
    """Split a score string into its home and away goal parts.

    Two layouts are handled:
      * plain:      '2-1'          -> ('2', '1')
      * shoot-out:  '(4) 1-1 (3)'  -> ('1', '1')  (parenthesised numbers are discarded)

    The goal counts are returned as strings, exactly as they appear in
    `score`; the separator must be an ASCII hyphen.
    """
    parts = score.split('-')
    home_part, away_part = parts[0], parts[1]
    if '(' in score:
        # Shoot-out notation: the goals sit between the two parenthesised numbers.
        return home_part.split(') ')[1], away_part.split(' (')[0]
    return home_part, away_part

In [24]:
def get_winner(home_team, away_team, score):
    """Return the winning team of a match, or NaN when it ended level.

    Args:
        home_team: Value returned when the home side wins.
        away_team: Value returned when the away side wins.
        score: Score string using an ASCII hyphen, either plain ('2-1') or
            with shoot-out numbers in parentheses ('(4) 1-1 (3)').

    Returns:
        `home_team` or `away_team` for a decided match. If the goals are
        level and `score` carries no parenthesised numbers, `np.nan`. If the
        goals are level and parenthesised numbers are present, the side with
        the larger number wins (the away side is returned if they are equal).

    Raises:
        ValueError: If a goal count or parenthesised number is not an integer.
    """
    home_raw, away_raw = split_score(score)
    # Compare numerically: as strings '10' sorts before '9', which would hand
    # the win to the wrong side for double-digit scores.
    home_goals, away_goals = int(home_raw), int(away_raw)
    if home_goals > away_goals:
        return home_team
    if away_goals > home_goals:
        return away_team

    # Level on goals from here on.
    if '(' not in score:
        return np.nan

    # Level with parenthesised numbers: read one number from each side of the hyphen.
    halves = score.split('-')
    home_pen = int(re.search(r'\(([^)]+)', halves[0]).group(1))
    away_pen = int(re.search(r'\(([^)]+)', halves[1]).group(1))
    return home_team if home_pen > away_pen else away_team

In [25]:
def _fetch_schedule_table(url, table_id):
    """Download `url` and parse the HTML table whose id is `table_id` into a DataFrame."""
    page = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', {'id': table_id})
    if table is None:
        raise ValueError(f'No table with id {table_id!r} found at {url}')
    # Passing literal HTML to read_html is deprecated; wrap it in a file-like object.
    return pd.read_html(StringIO(str(table)))[0]


def generate_games_played(code, year):
    """Scrape one season's schedule from fbref.com, save it as CSV and return it.

    Args:
        code: Season code inserted into the fbref URL for past seasons. It is
            ignored when `year == CURRENT_YEAR`.
        year: Season year. Selects the URL and is stored in the 'Season' column.

    Returns:
        DataFrame with the original row labels of the scraped table and the
        columns Round, Date, Home, (xGH, xGA when the page provides xG), Away,
        Venue, Attendance, GHome, GAway, Winner, Season. Rows whose Score is
        not a string and repeated header rows are removed. The same frame is
        written to f'{ROOT}/Results/{year}.csv'.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the expected schedule table is not in the page.
        KeyError: If the table lacks the required columns.
    """
    if year == CURRENT_YEAR:
        url = 'https://fbref.com/en/comps/22/schedule/Major-League-Soccer-Scores-and-Fixtures'
        # NOTE(review): this table id is hard-coded and presumably specific to
        # the season in CURRENT_YEAR - confirm when CURRENT_YEAR changes.
        df = _fetch_schedule_table(url, 'sched_11499_1')
        if 'Round' not in df.columns:
            df['Round'] = 'Regular Season'
        df = df[df.Score != 'Score']
    else:
        url = f'https://fbref.com/en/comps/22/{code}/schedule/{year}-Major-League-Soccer-Scores-and-Fixtures'
        df = _fetch_schedule_table(url, 'sched_all')

    # Prefer the layout with expected-goals columns; fall back when they are absent.
    # .copy() gives an independent frame so the assignments below are not made on a slice.
    try:
        columns = ['Round', 'Date', 'Home', 'xG', 'Score', 'xG.1', 'Away', 'Venue', 'Attendance']
        df = df[columns].copy()
    except KeyError:
        columns = ['Round', 'Date', 'Home', 'Score', 'Away', 'Venue', 'Attendance']
        df = df[columns].copy()

    # Keep rows that have a textual score and are not repeated header rows.
    has_score = df['Score'].map(lambda value: isinstance(value, str)).astype(bool)
    df = df[has_score & (df['Round'] != 'Round')].copy()

    # The page uses an en dash between the goals; the parsers expect a hyphen.
    scores = [raw.replace('–', '-') for raw in df['Score']]
    goals = [split_score(text) for text in scores]
    winners = [get_winner(home, away, text)
               for home, away, text in zip(df['Home'], df['Away'], scores)]

    # Build each derived column in one step with its final dtype.
    df['GHome'] = pd.Series([int(home) for home, _ in goals], index=df.index, dtype='int64')
    df['GAway'] = pd.Series([int(away) for _, away in goals], index=df.index, dtype='int64')
    df['Winner'] = pd.Series(winners, index=df.index, dtype='object')
    df['Season'] = pd.Series(int(year), index=df.index, dtype='int64')

    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    # Assign the result back: a chained in-place fillna may not modify the frame.
    df['Attendance'] = df['Attendance'].fillna(0).astype('int64')

    df = df.rename(columns={'xG': 'xGH', 'xG.1': 'xGA'}).drop(columns=['Score'])
    df.to_csv(f'{ROOT}/Results/{year}.csv')
    return df

In [26]:
# Season year -> code passed to generate_games_played, which inserts it into
# the fbref schedule URL for past seasons. Listed newest season first.
url_codes = {
    2022: 0,  # code is not used when the year equals CURRENT_YEAR
    2021: 11006,
    2020: 10090,
    2019: 2798,
    2018: 1759,
    2017: 1558,
    2016: 1503,
    2015: 1369,
    2014: 708,
    2013: 643,
    2012: 577,
    2011: 509,
    2010: 442,
    2009: 374,
    2008: 316,
    2007: 260,
    2006: 211,
    2005: 168,
    2004: 133,
    2003: 100,
    2002: 75,
    2001: 56,
    2000: 44,
    1999: 37,
    1998: 34,
    1997: 32,
    1996: 30,
}

In [27]:
# Scrape every season listed in url_codes and combine them into one table.
season_frames = []
for year in url_codes:
    df = generate_games_played(url_codes[year], year)
    df = df[['Season', 'Round', 'Venue', 'Date', 'Home', 'GHome', 'GAway', 'Away', 'Winner']]
    season_frames.append(df)

# Concatenate and sort once, instead of growing and re-sorting the frame on
# every iteration. A stable sort keeps same-day games in scrape order.
if season_frames:
    AllYears = pd.concat(season_frames, ignore_index=True).sort_values(by=['Date'], kind='mergesort')
else:
    AllYears = pd.DataFrame()
AllYears.to_csv(f'{ROOT}/Results/AllYears.csv')