In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import lxml

# Every tournament edition to scrape: one every four years, 1960 through 2020 inclusive.
years = list(range(1960, 2021, 4))

def getMatches(year, timeout=30):
    """Scrape the match boxes from the Wikipedia page of one UEFA Euro edition.

    Parameters
    ----------
    year : int or str
        Tournament edition; interpolated into
        ``https://en.wikipedia.org/wiki/UEFA_Euro_{year}``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per ``div.footballbox`` element found on the page, with the
        columns ``Home``, ``Score`` and ``Away`` (the raw text of the
        ``th.fhome`` / ``th.fscore`` / ``th.faway`` cells, not stripped) and
        ``Year`` (the ``year`` argument repeated on every row).

    Raises
    ------
    requests.HTTPError
        If the page request comes back with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    AttributeError
        If a match box is missing one of the three expected cells.
    """
    website = f"https://en.wikipedia.org/wiki/UEFA_Euro_{year}"
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(website, timeout=timeout)
    # Fail loudly on an error page instead of parsing it and silently
    # returning an empty frame (which would then be written to CSV).
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    homeTeams = []
    score = []
    awayTeams = []

    for match in soup.find_all("div", class_="footballbox"):
        homeTeams.append(match.find("th", class_="fhome").text)
        score.append(match.find("th", class_="fscore").text)
        awayTeams.append(match.find("th", class_="faway").text)

    # The scalar `year` is broadcast by pandas to every row.
    matches = pd.DataFrame(
        {"Home": homeTeams, "Score": score, "Away": awayTeams, "Year": year}
    )
    return matches

# Historical data: scrape each past edition, stack the per-year frames
# into a single table and persist it.
euro = list(map(getMatches, years))
df_euro = pd.concat(euro, ignore_index=True)
df_euro.to_csv("euro_historical_data.csv", index=False)

# Euro 2024: the same scraper pointed at the 2024 page, saved separately.
df_euro_2024 = getMatches(2024)
df_euro_2024.to_csv("euro_2024_fixtures.csv", index=False)

The number of matches in 1960 is: 4
The number of matches in 1964 is: 4
The number of matches in 1968 is: 5
The number of matches in 1972 is: 4
The number of matches in 1976 is: 4
The number of matches in 1980 is: 14
The number of matches in 1984 is: 15
The number of matches in 1988 is: 15
The number of matches in 1992 is: 15
The number of matches in 1996 is: 31
The number of matches in 2000 is: 31
The number of matches in 2004 is: 31
The number of matches in 2008 is: 31
The number of matches in 2012 is: 31
The number of matches in 2016 is: 51
The number of matches in 2020 is: 51
<class 'int'>


In [4]:
# Validate that my script extracted all the matches from Wikipedia
# Compare the sum of my data to Wikipedia data
def totalMatchesInYear(year, timeout=30):
    """Read a match count from the infobox of a UEFA Euro Wikipedia page.

    The value is taken from a fixed position in the list of
    ``td.infobox-data`` cells: position 8 for editions from 1960 up to
    (but not including) 1984, position 6 for every other year.

    Parameters
    ----------
    year : int
        Tournament edition; interpolated into
        ``https://en.wikipedia.org/wiki/UEFA_Euro_{year}``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    int
        The text of the selected infobox cell converted with ``int``.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    IndexError
        If the page has fewer infobox cells than the selected position.
    ValueError
        If the selected cell's text is not an integer.
    """
    # NOTE(review): these positions are presumably where the infobox's
    # "matches played" value sits on the current page layout - they will
    # break if Wikipedia adds or removes an infobox row; confirm on change.
    EARLY_FORMAT_INDEX = 8  # editions 1960 <= year < 1984
    LATER_FORMAT_INDEX = 6  # every other edition

    website = f"https://en.wikipedia.org/wiki/UEFA_Euro_{year}"
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(website, timeout=timeout)
    # An error page has no usable infobox; surface the HTTP failure itself
    # rather than an unrelated IndexError/ValueError further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    matchesPlayed = soup.find_all("td", class_="infobox-data")

    index = EARLY_FORMAT_INDEX if 1960 <= year < 1984 else LATER_FORMAT_INDEX
    return int(matchesPlayed[index].text)

# Running totals over all editions (scraped rows vs. Wikipedia infobox).
myData = 0
wikiData = 0
# (year, scraped count, Wikipedia count) for every edition that disagrees.
# Comparing only the grand totals would let an over-count in one year
# cancel an under-count in another and still report success.
mismatchedYears = []

for year in years:
    myMatches = df_euro[df_euro['Year'] == year].shape[0]
    myData += myMatches
    wikiMatches = totalMatchesInYear(year)
    wikiData += wikiMatches
    if myMatches != wikiMatches:
        mismatchedYears.append((year, myMatches, wikiMatches))

print(f"The sum of my data is {myData}")
print(f"The sum of Wikipedia data is {wikiData}")

# No per-year mismatch implies the totals are equal as well.
if not mismatchedYears:
    print("Hence, my script extracted all the matches from wikipedia correctly!")
else:
    for badYear, scrapedCount, wikiCount in mismatchedYears:
        print(f"Mismatch in {badYear}: scraped {scrapedCount}, Wikipedia reports {wikiCount}")
    print("My script did not extract all the matches from wikipedia correctly!")

The sum of my data is 337
The sum of Wikipedia data is 337
Hence, my script extracted all the matches from wikipedia correctly!
