# Web Scraping with Python Pandas
---

In [23]:
import pickle
from pathlib import Path
from string import ascii_uppercase as alphabet

import pandas as pd

In [16]:
# Sanity check: show the uppercase letters that are used to label the groups below.
print(alphabet)

ABCDEFGHIJKLMNOPQRSTUVWXYZ


In [19]:
# Parse the HTML tables of the 2022 World Cup article into a list of DataFrames.
tables = pd.read_html('https://en.wikipedia.org/wiki/2022_FIFA_World_Cup')

# Positions of the group-standings tables inside `tables`. They depend on the
# page layout at scrape time, so they are named here to make them easy to
# adjust if the article changes.
FIRST_GROUP_TABLE = 9    # index of the Group A standings table
GROUP_TABLE_STRIDE = 7   # distance between consecutive groups' standings tables
N_GROUPS = 8             # Groups A through H

group_table_indices = range(
    FIRST_GROUP_TABLE,
    FIRST_GROUP_TABLE + GROUP_TABLE_STRIDE * N_GROUPS,
    GROUP_TABLE_STRIDE,
)

# Maps 'Group A' ... 'Group H' to that group's standings table.
dict_table = {}
for letter, i in zip(alphabet, group_table_indices):
    # Build a new frame instead of mutating tables[i] in place, so this loop can
    # be re-run without re-downloading the page (popping 'Qualification' in
    # place would raise KeyError on the second run).
    df = (
        tables[i]
        .rename(columns={tables[i].columns[1]: 'Team'})  # second column holds the team name
        .drop(columns='Qualification')
    )
    dict_table[f'Group {letter}'] = df

In [20]:
# Display the Group A table (the cell's last expression is rendered as its output).
dict_table['Group A']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Netherlands,3,2,1,0,5,1,+4,7
1,2,Senegal,3,2,0,1,5,4,+1,6
2,3,Ecuador,3,1,1,1,4,3,+1,4
3,4,Qatar (H),3,0,0,3,1,7,−6,0


In [21]:
# Display the Group B table (the cell's last expression is rendered as its output).
dict_table['Group B']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,England,3,2,1,0,9,2,+7,7
1,2,United States,3,1,2,0,2,1,+1,5
2,3,Iran,3,1,0,2,4,7,−3,3
3,4,Wales,3,0,1,2,1,6,−5,1


In [22]:
# Display the Group C table (the cell's last expression is rendered as its output).
dict_table['Group C']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Argentina,3,2,0,1,5,2,+3,6
1,2,Poland,3,1,1,1,2,2,0,4
2,3,Mexico,3,1,1,1,2,3,−1,4
3,4,Saudi Arabia,3,1,0,2,3,5,−2,3


In [24]:
# Persist the dictionary of group tables to disk.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute
# arbitrary code embedded in the file.
output_path = Path('data/dict_table')
# Create the target folder first: opening the file for writing raises
# FileNotFoundError when 'data/' does not exist yet (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as output:
    pickle.dump(dict_table, output)

### Scrape the FIFA World Cup Data from Wikipedia

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Tournament years to scrape: every fourth year from 1930 through 2018, with
# 1942 and 1946 left out (1930-1938, then 1950-2018).
years = [*range(1930, 1939, 4), *range(1950, 2019, 4)]

def get_matches(year, timeout=30):
    """Scrape the match boxes from one World Cup's Wikipedia article.

    Parameters
    ----------
    year : int
        Tournament year; interpolated into the article URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per ``div.footballbox`` element found on the page, with the
        columns ``year``, ``home``, ``score`` and ``away``. The text values are
        taken verbatim from the page (no stripping or parsing is done here).
        Boxes that lack a home, score or away cell are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'
    # Without a timeout a stalled connection would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning an empty frame.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    rows = []
    for match in soup.find_all('div', class_='footballbox'):
        cells = [match.find('th', class_=css_class)
                 for css_class in ('fhome', 'fscore', 'faway')]
        if any(cell is None for cell in cells):
            # find() returns None for a missing cell; skip the incomplete box
            # rather than crash with AttributeError on .get_text().
            continue
        rows.append([cell.get_text() for cell in cells])

    # Passing the column names explicitly keeps the schema stable even when no
    # match boxes were found.
    df_football = pd.DataFrame(rows, columns=['home', 'score', 'away'])
    df_football.insert(0, 'year', year)
    return df_football

# Scrape every tournament in `years` (one HTTP request each), then stack the
# per-year frames into a single table with a fresh 0..n-1 index.
fifa = list(map(get_matches, years))
df_fifa = pd.concat(fifa, ignore_index=True)
df_fifa

Unnamed: 0,year,home,score,away
0,1930,France,4–1,Mexico
1,1930,Argentina,1–0,France
2,1930,Chile,3–0,Mexico
3,1930,Chile,1–0,France
4,1930,Argentina,6–3,Mexico
...,...,...,...,...
896,2018,Russia,2–2 (a.e.t.),Croatia
897,2018,France,1–0,Belgium
898,2018,Croatia,2–1 (a.e.t.),England
899,2018,Belgium,2–0,England
