# Web Scraping for DataViz Exam Project - Football Transfers

In [1]:
# imports
import bs4 as bs
import requests
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle
import sys
import pyarrow as pa

In [2]:
def create_soup(url):
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/605.1.15 (KHTML, like Gecko) "
                         "Version/15.4 Safari/605.1.15"}
    page = requests.get(url, headers=headers)
    soup = bs.BeautifulSoup(page.content,'html.parser')
    return soup

In [3]:
url = 'https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2018&s_w=&leihe=1&intern=0&intern=1'
soup = create_soup(url)
main_league = 'Premier League'
main_season = '18/19'

In [121]:
# initializing the league_dict and club_info for later use
league_dict = {}
club_info = []

### Getting a list of tuples for all clubs

In [None]:
# finding club id and club code for each club 
boxes = soup.find('div', class_='large-8 columns').find_all('div', class_='box')

for box in tqdm(boxes[3:]):
    club_name = box.find('img').get('alt')
    print(f'club_name: {club_name}')
    tables = box.find_all('div', class_='responsive-table')
    ins = tables[0].find('tbody').find_all('tr')
    outs = tables[1].find('tbody').find_all('tr')

    for row in ins:
        fee = row.find_all('td', class_='rechts')[1].string
        if isinstance(fee, str):
            fee_first = fee[0]
            if fee_first == "€":
                club_section = row.find('td', class_='no-border-links verein-flagge-transfer-cell').find('a')
                club_url = club_section.get('href')
                link_list = club_url.split('/')
                club_code = link_list[1]
                club_id = link_list[4]
                club_info.append((club_code, club_id))
    
    for row in outs:
        fee = row.find_all('td', class_='rechts')[1].string
        if isinstance(fee, str):
            fee_first = fee[0]
            if fee_first == "€":
                club_section = row.find('td', class_='no-border-links verein-flagge-transfer-cell').find('a')
                club_url = club_section.get('href')
                link_list = club_url.split('/')
                club_code = link_list[1]
                club_id = link_list[4]
                club_info.append((club_code, club_id))

In [53]:
# saving the club_info_set as a set in a pickle file
# with open('my_set.pkl', 'wb') as file:
#     pickle.dump(club_info_set, file)

# loading the pickle file containing the set of tuples
with open('club_tuples.pkl', 'rb') as file:
    loaded_set = pickle.load(file)

657


### Extracting league info for each club to create the league dictionary

In [57]:
# extracting league information for each team in the set of tuples
league_dict = {}
for club_info_tuple in tqdm(loaded_set):
    code, id = club_info_tuple
    if id not in league_dict:
        season_dict = {}
        league_page_url = 'https://www.transfermarkt.com/' + code + '/' + 'platzierungen' + '/' + 'verein' + '/' + id
        club_soup = create_soup(league_page_url)
        tables = club_soup.find_all('tbody')
        if len(tables) > 1:
            table = tables[1]
            table_entries = table.find_all('tr')

            for entry in table_entries[:6]:
                season = entry.find('td', class_='zentriert').string
                league = entry.find('td', class_='no-border-links hauptlink').find('a').get('href').split('/')[4]
                season_dict[season] = league
            league_dict[code] = season_dict

100%|██████████| 657/657 [27:36<00:00,  2.52s/it]  


In [4]:
# saving the set to a pickle file
# sys.setrecursionlimit(10**6)
# with open('league_dict_v2.pkl', 'wb') as file:
#     pickle.dump(league_dict, file)

# loading the set from the pickle file
# with open('league_dict_v2.pkl', 'rb') as file:
#     league_dict = pickle.load(file)

# converting the dictionary to a df
# league_df = pd.DataFrame(league_dict)

# the pkl file has now been deleted 
# converting the league df to a parquet file and loading it back in as a dictionary
# league_df.to_parquet('league_dict_v2.parquet')

# loading parquet file back into a df and then converting to a dictionary
df_loaded = pd.read_parquet('league_dict_v2.parquet')
league_dict = df_loaded.to_dict()

### Scraping the transfers based on the league dictionary

In [29]:
len(ins)

1

In [33]:

years = ["2018", "2019", "2020", "2021", "2022"]
seasons = ["18/19","19/20", "20/21", "21/22", "22/23"]
dfs = []
league_id = 'FR1'
league_code = 'ligue-1'

for year, main_season in zip(years, seasons):

    url = f'https://www.transfermarkt.com/{league_code}/transfers/wettbewerb/{league_id}/plus/?saison_id={year}&s_w=&leihe=3&intern=0&intern=1'
    soup = create_soup(url)

    players = []
    selling_clubs = []
    buying_clubs = []
    selling_leagues = []
    buying_leagues = []
    fees = []

    boxes = soup.find('div', class_='large-8 columns').find_all('div', class_='box')

    for box in tqdm(boxes[3:]):
        club_name = box.find('h2', class_='content-box-headline content-box-headline--inverted content-box-headline--logo').find_all('a')[1].get('href').split('/')[1]
        print(f'club_name: {club_name}')
        tables = box.find_all('div', class_='responsive-table')
        ins = tables[0].find('tbody').find_all('tr')
        outs = tables[1].find('tbody').find_all('tr')

        if len(ins) > 1:
            for row in ins:
                fee = row.find_all('td', class_='rechts')[1].string
                if isinstance(fee, str):
                    fee_first = fee[0]
                    fee_last = fee[-1]
                    if fee_first == "€":
                        fee_number = fee[1:-1]
                        if fee_last == "m":
                            fee_value = float(fee_number)*10**6
                        elif fee_last == 'k':
                            fee_value = float(fee_number)*10**3

                        player_name = row.find('span', class_='hide-for-small').string
                        club_section = row.find('td', class_='no-border-links verein-flagge-transfer-cell').find('a')
                        from_club_code = club_section.get('href').split('/')[1]
                        # print(from_club)
                        if from_club_code in league_dict:
                            if main_season in league_dict[from_club_code]:
                                selling_league = league_dict[from_club_code][main_season]
                            else:
                                selling_league = '-'
                        else:
                            selling_league = '-'
                        
                        players.append(player_name)
                        selling_clubs.append(from_club_code)
                        buying_clubs.append(club_name)
                        selling_leagues.append(selling_league)
                        buying_leagues.append(league_id)
                        fees.append(fee_value)
                    
        if len(outs) > 1:
            for row in outs:
                fee = row.find_all('td', class_='rechts')[1].string
                if isinstance(fee, str):
                    fee_first = fee[0]
                    fee_last = fee[-1]
                    if fee_first == "€":
                        fee_number = fee[1:-1]
                        if fee_last == "m":
                            fee_value = float(fee_number)*10**6
                        elif fee_last == 'k':
                            fee_value = float(fee_number)*10**3
                    
                        player_name = row.find('span', class_='hide-for-small').string
                        club_section = row.find('td', class_='no-border-links verein-flagge-transfer-cell').find('a')
                        to_club_code = club_section.get('href').split('/')[1]
                        # print(to_club)
                        if to_club_code in league_dict:
                            if main_season in league_dict[to_club_code]:
                                buying_league = league_dict[to_club_code][main_season]
                            else:
                                buying_league = '-'
                        else:
                            buying_league = '-'

                        players.append(player_name)
                        selling_clubs.append(club_name)
                        buying_clubs.append(to_club_code)
                        selling_leagues.append(league_id)
                        buying_leagues.append(buying_league)
                        fees.append(fee_value)
    
    # Define a larger DataFrame
    data = {'Season': [main_season]*len(players),
            'Player': players, 
            'From_Club': selling_clubs,
            'To_Club': buying_clubs,
            'From_League': selling_leagues,
            'To_League': buying_leagues,
            'Fee': fees}
    df = pd.DataFrame(data)
    dfs.append(df)

100%|██████████| 20/20 [00:00<00:00, 187.34it/s]


club_name: amiens-sc
club_name: sco-angers
club_name: fc-girondins-bordeaux
club_name: sm-caen
club_name: dijon-fco
club_name: ea-guingamp
club_name: losc-lille
club_name: olympique-lyon
club_name: olympique-marseille
club_name: as-monaco
club_name: montpellier-hsc
club_name: fc-nantes
club_name: ogc-nizza
club_name: nimes-olympique
club_name: fc-paris-saint-germain
club_name: stade-reims
club_name: fc-stade-rennes
club_name: as-saint-etienne
club_name: rc-strassburg-alsace
club_name: fc-toulouse


100%|██████████| 20/20 [00:00<00:00, 197.64it/s]


club_name: amiens-sc
club_name: sco-angers
club_name: fc-girondins-bordeaux
club_name: stade-brest-29
club_name: dijon-fco
club_name: losc-lille
club_name: olympique-lyon
club_name: olympique-marseille
club_name: fc-metz
club_name: nimes-olympique
club_name: as-monaco
club_name: montpellier-hsc
club_name: fc-nantes
club_name: ogc-nizza
club_name: fc-paris-saint-germain
club_name: stade-reims
club_name: fc-stade-rennes
club_name: as-saint-etienne
club_name: rc-strassburg-alsace
club_name: fc-toulouse


100%|██████████| 20/20 [00:00<00:00, 145.49it/s]


club_name: fc-paris-saint-germain
club_name: olympique-marseille
club_name: fc-stade-rennes
club_name: losc-lille
club_name: ogc-nizza
club_name: stade-reims
club_name: olympique-lyon
club_name: montpellier-hsc
club_name: rc-strassburg-alsace
club_name: as-monaco
club_name: sco-angers
club_name: fc-girondins-bordeaux
club_name: fc-nantes
club_name: stade-brest-29
club_name: fc-metz
club_name: dijon-fco
club_name: as-saint-etienne
club_name: nimes-olympique
club_name: fc-lorient
club_name: rc-lens


100%|██████████| 20/20 [00:00<00:00, 214.22it/s]


club_name: losc-lille
club_name: fc-paris-saint-germain
club_name: as-monaco
club_name: olympique-lyon
club_name: olympique-marseille
club_name: fc-stade-rennes
club_name: rc-lens
club_name: montpellier-hsc
club_name: ogc-nizza
club_name: fc-metz
club_name: as-saint-etienne
club_name: fc-girondins-bordeaux
club_name: sco-angers
club_name: stade-reims
club_name: rc-strassburg-alsace
club_name: fc-lorient
club_name: stade-brest-29
club_name: fc-nantes
club_name: es-troyes-ac
club_name: clermont-foot-63


100%|██████████| 20/20 [00:00<00:00, 152.80it/s]

club_name: fc-paris-saint-germain
club_name: olympique-marseille
club_name: as-monaco
club_name: fc-stade-rennes
club_name: ogc-nizza
club_name: rc-strassburg-alsace
club_name: rc-lens
club_name: olympique-lyon
club_name: fc-nantes
club_name: losc-lille
club_name: stade-brest-29
club_name: stade-reims
club_name: montpellier-hsc
club_name: sco-angers
club_name: es-troyes-ac
club_name: fc-lorient
club_name: clermont-foot-63
club_name: fc-toulouse
club_name: ac-ajaccio
club_name: aj-auxerre





In [34]:
main_df = pd.concat(dfs)

# Display the DataFrame using df.head()
main_df.iloc[0:50]

Unnamed: 0,Season,Player,From_Club,To_Club,From_League,To_League,Fee
0,18/19,Saman Ghoddos,ostersunds-fk,amiens-sc,SE1,FR1,4000000.0
1,18/19,Juan Otero,club-estudiantes-de-la-plata,amiens-sc,AR1N,FR1,2140000.0
2,18/19,Rafal Kurzawa,gornik-zabrze,amiens-sc,PL1,FR1,850000.0
3,18/19,Tanguy Ndombélé,amiens-sc,olympique-lyon,FR1,FR1,8000000.0
4,18/19,Guessouma Fofana,amiens-sc,ea-guingamp,FR1,FR1,1000000.0
5,18/19,Harrison Manzala,amiens-sc,sco-angers,FR1,FR1,1000000.0
6,18/19,Jeff Reine-Adélaïde,fc-arsenal-u23,sco-angers,,FR1,1600000.0
7,18/19,Vincent Pajot,as-saint-etienne,sco-angers,FR1,FR1,1000000.0
8,18/19,Harrison Manzala,amiens-sc,sco-angers,FR1,FR1,1000000.0
9,18/19,Dorian Bertrand,so-cholet,sco-angers,FR3,FR1,800000.0


In [35]:
# saving dataframe as csv file
main_df.to_csv('L1_transfers_v2.csv', index=False)

In [36]:
# combining csv files
file_names = ['PL_transfers_v2.csv', 'LL_transfers_v2.csv', 'BL_transfers_v2.csv', 'SA_transfers_v2.csv', 'L1_transfers_v2.csv']
dfs = []

for file_name in file_names:
    df = pd.read_csv(file_name)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('all_transfers_v2.csv', index=False)

In [37]:
# removing duplicate rows from the final data
df = pd.read_csv('all_transfers_v2.csv')
df.drop_duplicates(inplace=True)
df.to_csv('all_transfers_no_duplicates.csv', index=False)

In [None]:
# dictionary for looking up league IDs
league_id_dict = {'L1': 'Bundesliga', 'GB1': 'Premier League', 'FR1': 'Ligue 1', 'IT1': 'Serie A', 'ES1': 'La Liga'}