### Imports

In [1]:
import os
import pickle
import time
from string import ascii_uppercase as alphabet

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping Historical and Fixture Data

In [2]:
# show every column when a DataFrame is rendered (None = no column limit)
pd.options.display.max_columns = None

In [3]:
def get_matches(year):
    """Scrape the matches listed on one Africa Cup of Nations Wikipedia page.

    Parameters
    ----------
    year : int or str
        Tournament edition; interpolated into the article URL
        ``https://en.wikipedia.org/wiki/{year}_Africa_Cup_of_Nations`` and
        stored unchanged in the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.footballbox`` element, with the columns ``home``,
        ``away``, ``score`` and ``year``. The columns are present even when
        no match box is found.

    Raises
    ------
    requests.HTTPError
        If the page request returns an HTTP error status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    request_url = f"https://en.wikipedia.org/wiki/{year}_Africa_Cup_of_Nations"
    # a timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(url=request_url, timeout=30)
    # fail loudly on 4xx/5xx instead of parsing an error page into an empty table
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    data = []
    for box in soup.find_all("div", {"class": "footballbox"}):
        home = box.find("th", {"class": "fhome"})
        away = box.find("th", {"class": "faway"})
        score = box.find("th", {"class": "fscore"})
        if home is None or away is None or score is None:
            # one malformed box should not abort the whole scrape
            continue

        data.append(
            {
                # strip() also removes the non-breaking spaces around team names
                "home": home.get_text().strip(),
                "away": away.get_text().strip(),
                # keep only the first token of the score cell, dropping any
                # space-separated suffix that follows it
                "score": score.get_text().strip().split(" ")[0],
                "year": year,
            }
        )

    return pd.DataFrame(data, columns=["home", "away", "score", "year"])

In [4]:
# tournament editions to scrape (1957 through 2021)
years = [
    1957, 1959, 1962, 1963, 1965, 1968, 1970, 1972, 1974, 1976, 1978,
    1980, 1982, 1984, 1986, 1988, 1990, 1992, 1994, 1996, 1998, 2000,
    2002, 2004, 2006, 2008, 2010, 2012, 2013, 2015, 2017, 2019, 2021,
]

begin_loop = time.time()
# one DataFrame per edition, in the same order as `years`
match_dict = list(map(get_matches, years))
# stack the per-year frames into a single table with a fresh 0..n-1 index
df = pd.concat(match_dict, ignore_index=True)

print(f"Scraping completed! - Total run time: {round((time.time() - begin_loop), 2)}seconds")

Scraping completed! - Total run time: 11.07seconds


##### Exporting to CSV

In [5]:
# make sure the output folder exists so the notebook runs on a fresh checkout
os.makedirs("data", exist_ok=True)

# historical data
df.to_csv("data/afcon_historical_data.csv", index=False)

# fixture for 2023 -- the year is passed as an int, like the historical rows,
# so the `year` column has the same dtype in both frames
fixture = get_matches(2023)
fixture.to_csv("data/afcon_fixture_data.csv", index=False)

In [6]:
# preview the combined historical matches (long frames are truncated when rendered)
df

Unnamed: 0,home,away,score,year
0,Sudan,Egypt,1–2,1957
1,Ethiopia,South Africa,2–0,1957
2,Egypt,Ethiopia,4–0,1957
3,United Arab Republic,Ethiopia,4–0,1959
4,Sudan,Ethiopia,1–0,1959
...,...,...,...,...
741,Senegal,Equatorial Guinea,3–1,2021
742,Burkina Faso,Senegal,1–3,2021
743,Cameroon,Egypt,0–0,2021
744,Burkina Faso,Cameroon,3–3,2021


In [7]:
# random preview of the 2023 fixture; the fixed seed keeps the rendered
# sample identical across re-runs
fixture.sample(10, random_state=42)

Unnamed: 0,home,away,score,year
43,Winner Group F,Runner-up Group E,R8,2023
41,Winner Group C,3rd Group A/B/F,R6,2023
34,Tanzania,DR Congo,v,2023
32,Morocco,DR Congo,v,2023
19,Burkina Faso,Mauritania,v,2023
30,Morocco,Tanzania,v,2023
5,Guinea-Bissau,Nigeria,v,2023
0,Ivory Coast,Guinea-Bissau,v,2023
7,Ghana,Cape Verde,v,2023
3,Ivory Coast,Nigeria,v,2023


### Scraping Groups Data

In [8]:
# parse every HTML table on the 2023 tournament page into a list of DataFrames
request_url = "https://en.wikipedia.org/wiki/2023_Africa_Cup_of_Nations"
res = pd.read_html(request_url)

In [9]:
# Spot-check the parsed tables at indices 9, 16 and 23.
# Only the last expression in a cell is rendered, so just res[23] appears below;
# the first two lines are evaluated but their values are discarded.
res[9]
res[16]
res[23]

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,Senegal,0,0,0,0,0,0,0,0,Advance to knockout stage
1,2,Cameroon,0,0,0,0,0,0,0,0,Advance to knockout stage
2,3,Guinea,0,0,0,0,0,0,0,0,Possible knockout stage based on ranking
3,4,Gambia,0,0,0,0,0,0,0,0,


In [10]:
# Groups A -> F: the first group table is res[9] and each following group is
# 7 tables further on (9, 16, 23, 30, 37, 44); 51 is the exclusive stop of the range.

groups = {}
for letter, i in zip(alphabet, range(9, 51, 7)): # A=9, B=16, ...
    raw_table = res[i]
    # Build a new frame instead of editing res[i] in place: the cell can then be
    # re-run safely, and the historical `df` from the scraping cell is not overwritten.
    groups[f"Group {letter}"] = (
        raw_table
        # the second column holds the team names; give it a clean label
        .rename(columns={raw_table.columns[1]: "Team"})
        # remove any parenthesised home/away team markers
        .assign(Team=lambda t: t["Team"].str.replace(r"\s*\(.*?\)", "", regex=True))
        .drop(columns=["Qualification"])
    )

In [11]:
# check one cleaned table: second column renamed to "Team", "Qualification" removed
groups["Group A"]

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Ivory Coast,0,0,0,0,0,0,0,0
1,2,Nigeria,0,0,0,0,0,0,0,0
2,3,Equatorial Guinea,0,0,0,0,0,0,0,0
3,4,Guinea-Bissau,0,0,0,0,0,0,0,0


In [12]:
# display the whole mapping of group name -> standings DataFrame
groups

{'Group A':    Pos               Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1        Ivory Coast    0  0  0  0   0   0   0    0
 1    2            Nigeria    0  0  0  0   0   0   0    0
 2    3  Equatorial Guinea    0  0  0  0   0   0   0    0
 3    4      Guinea-Bissau    0  0  0  0   0   0   0    0,
 'Group B':    Pos        Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1       Egypt    0  0  0  0   0   0   0    0
 1    2       Ghana    0  0  0  0   0   0   0    0
 2    3  Cape Verde    0  0  0  0   0   0   0    0
 3    4  Mozambique    0  0  0  0   0   0   0    0,
 'Group C':    Pos      Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1   Senegal    0  0  0  0   0   0   0    0
 1    2  Cameroon    0  0  0  0   0   0   0    0
 2    3    Guinea    0  0  0  0   0   0   0    0
 3    4    Gambia    0  0  0  0   0   0   0    0,
 'Group D':    Pos          Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1       Algeria    0  0  0  0   0   0   0    0
 1    2  Burkina Faso    0  0  0  0   0   0   0    0
 2  

##### Exporting to CSV

In [13]:
# label each group's rows with the group name in a new "Grp" column,
# then stack all groups into one frame and write it out
df = pd.concat(
    [table.assign(Grp=group_name) for group_name, table in groups.items()]
)
df.to_csv("data/afcon_groups_data.csv", index=False)

##### Dumping to binary

In [14]:
# serialise the dict of per-group DataFrames to a binary pickle file
with open("data/afcon_groups_dump", "wb") as dump_file:
    pickle.dump(groups, dump_file)