In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import os
import re



In [2]:
# Browser-like User-Agent header passed along with the page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    ),
}

# A cell whose text starts with one of these characters is treated as a
# monetary value by the table-parsing code.
currency_symbols = [
    "€",
    "$",
    "£",
]

def transfer_markt_soup(season_year=2020, timeout=30):
    """
    Download the Transfermarkt Serie A overview page for one season and parse it.

    :param season_year: value sent as the ``saison_id`` query parameter
    :param timeout: seconds to wait for the server before giving up
    :return: BeautifulSoup document built from the response body
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    url = "https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/"
    payload = {'saison_id': season_year}

    # Without an explicit timeout, requests may wait indefinitely on a stalled
    # connection and block the whole notebook run.
    page_req = requests.get(url, headers=headers, params=payload, timeout=timeout)
    print(page_req.url)

    # Stop here on an error response instead of parsing an error page and
    # failing later with an unrelated AttributeError during table lookup.
    page_req.raise_for_status()

    return BeautifulSoup(page_req.content, "html.parser")

In [3]:
# Season to scrape; the same value is later stored in the 'Season' column
# and used in the output file name.
season_year = 2000
page_soup = transfer_markt_soup(season_year=season_year)

https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2000


## Club Main Data: Foreigners, Market Value, and Squad Size

In [4]:
# Locate the club overview table. BeautifulSoup's find() returns None when
# nothing matches, so check each step and fail with a message that names the
# missing element instead of an AttributeError on None.
table_div = page_soup.find(id='yw1')
if table_div is None:
    raise LookupError("No element with id 'yw1' found in the downloaded page")

table_main = table_div.find('table', class_="items")
if table_main is None:
    raise LookupError("No <table class='items'> found inside element 'yw1'")

In [5]:
# Text of every <th> in the club table, in document order.
table_head = [header_cell.text for header_cell in table_main.find_all("th")]

In [6]:
# Display the scraped column titles of the club table.
table_head

['Club',
 'name',
 'Squad',
 'ø age',
 'Foreigners',
 'ø market value',
 'Total market value']

In [7]:
# Factors that bring monetary suffixes onto one scale (millions).
# NOTE(review): "bn"/"m"/"k"/"Th." are assumed to be the abbreviations
# Transfermarkt uses for billions/millions/thousands — confirm on a live page.
_VALUE_SUFFIX_IN_MILLIONS = {"bn": 1000.0, "m": 1.0, "k": 0.001, "Th.": 0.001}


def _parse_club_cell(raw_text):
    """
    Convert the text of one club-table cell into a float or a cleaned string.

    - Plain numbers become floats.
    - Text starting with a symbol from ``currency_symbols`` and ending with a
      recognised unit suffix becomes a float expressed in millions
      ("5.50m" -> 5.5, as before; "1.02bn" -> 1020.0; "500k" -> 0.5).
    - Anything else (club names, "-", empty cells, monetary text with an
      unrecognised suffix) is returned as whitespace-stripped text.

    :param raw_text: text content of a <td> element
    :return: float when the cell is numeric, otherwise the stripped string
    """
    text = raw_text.strip()
    try:
        return float(text)
    except ValueError:
        pass

    if text and text[0] in currency_symbols:
        amount = text[1:].strip()
        for suffix, factor in _VALUE_SUFFIX_IN_MILLIONS.items():
            if amount.endswith(suffix):
                try:
                    return float(amount[:-len(suffix)]) * factor
                except ValueError:
                    # Suffix matched but the rest is not a number: keep the text.
                    break
    return text


# One list per table row, one parsed value per <td>.
table_body = []
rows = table_main.find("tbody").find_all("tr")
for row in rows:
    table_body.append([_parse_club_cell(cell.get_text()) for cell in row.find_all("td")])

In [8]:
# Build the club DataFrame from the parsed rows and remove the 'Club' column;
# the 'name' column is kept.
main_df = pd.DataFrame(table_body, columns=table_head).drop(columns='Club')
main_df

Unnamed: 0,name,Squad,ø age,Foreigners,ø market value,Total market value
0,SS Lazio,38.0,25.8,14.0,-,-
1,Udinese Calcio,43.0,23.6,22.0,-,-
2,AS Roma,37.0,24.5,14.0,-,-
3,AC Fiorentina,36.0,25.3,9.0,-,-
4,Brescia Calcio,33.0,27.4,11.0,-,-
5,Juventus FC,43.0,24.1,16.0,-,-
6,Atalanta BC,39.0,24.2,5.0,-,-
7,AC Parma,38.0,25.0,16.0,-,-
8,AC Perugia,42.0,25.7,12.0,-,-
9,Hellas Verona,38.0,24.3,6.0,-,-


## Club Ranking General Data

In [9]:
# Locate the standings table. BeautifulSoup's find() returns None when nothing
# matches, so check each step and fail with a message that names the missing
# element instead of an AttributeError on None.
table_div = page_soup.find(id='yw4')
if table_div is None:
    raise LookupError("No element with id 'yw4' found in the downloaded page")

table_main = table_div.find('table', class_="items")
if table_main is None:
    raise LookupError("No <table class='items'> found inside element 'yw4'")

In [10]:
# Text of every <th> in the standings table. A header consisting only of a
# non-breaking space is given the label 'MP'.
table_head = [
    'MP' if header_cell.text == '\xa0' else header_cell.text
    for header_cell in table_main.find_all("th")
]

In [11]:
# Display the scraped column titles of the standings table.
table_head

['#', 'Club', 'MP', '+/-', 'Pts']

In [12]:
# Whole-cell integer, with an optional sign so that negative goal differences
# such as "-4" are parsed as ints instead of being left as strings.
_INTEGER_TEXT = re.compile(r"[+-]?[0-9]+")

# One list per standings row. Cells without text are skipped, so each row only
# holds its non-empty values.
table_body = []
rows = table_main.find("tbody").find_all("tr")
for row in rows:
    row_data = []
    for cell in row.find_all("td"):
        cell_text = cell.get_text().strip()
        if cell_text == "":
            continue
        if _INTEGER_TEXT.fullmatch(cell_text):
            row_data.append(int(cell_text))
        else:
            row_data.append(cell_text)
    table_body.append(row_data)

In [13]:
# Standings rows as a DataFrame, labelled with the scraped column titles.
df2 = pd.DataFrame(table_body, columns=table_head)
df2

Unnamed: 0,#,Club,MP,+/-,Pts
0,1,AS Roma,34,35,75
1,2,Juventus,34,34,73
2,3,Lazio,34,29,69
3,4,AC Parma,34,20,56
4,5,Inter,34,0,51
5,6,Milan,34,10,49
6,7,Atalanta BC,34,4,44
7,8,Brescia,34,2,44
8,9,Fiorentina,34,1,43
9,10,Bologna,34,-4,43


## Merging the two datasets

In [14]:
# Fuzzy-match team names in preparation for the merge
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    Attach to each row of the left table the fuzzy-matched keys of the right table.

    The input frames are not modified; the result is a copy of ``df_1`` with an
    extra ``matches`` column.

    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score (as returned by ``process.extract``) a candidate needs to count as a match
    :param limit: the number of candidates requested per row, sorted high to low
    :return: copy of ``df_1`` with a ``matches`` column holding the accepted
        candidates joined by ``', '`` (empty string when none qualifies).
        When more than one candidate reaches the threshold, the joined text is
        no longer a single key of ``df_2``.
    """
    candidates = df_2[key2].tolist()

    def _accepted_matches(value):
        # process.extract yields (choice, score) pairs, best first.
        scored = process.extract(value, candidates, limit=limit)
        return ', '.join(hit[0] for hit in scored if hit[1] >= threshold)

    # Work on a copy so the caller's DataFrame does not gain a 'matches'
    # column as a side effect.
    result = df_1.copy()
    result['matches'] = result[key1].apply(_accepted_matches)

    return result

In [15]:
# Right side of the join is the standings table as scraped.
right_df2 = df2
# Left side: club data plus the fuzzy-matched standings name for each club.
left_main_df = fuzzy_merge(
    df_1=main_df,
    df_2=df2,
    key1='name',
    key2='Club',
    threshold=80,
)

In [16]:
# The join below is an inner merge on exact equality, so a club whose
# 'matches' text equals no standings 'Club' disappears from the result.
# Report those clubs instead of dropping them silently.
unmatched_names = left_main_df.loc[
    ~left_main_df['matches'].isin(right_df2['Club']), 'name'
].tolist()
if unmatched_names:
    print(f"Dropped {len(unmatched_names)} club(s) without an exact standings match: {unmatched_names}")

# Join club data with standings, tidy the column names, and tag the season.
df3 = (
    left_main_df
    .merge(right_df2, left_on='matches', right_on='Club')
    .drop(columns=['matches', 'Club'])
    .rename(columns={"+/-": "GD", "#": "Ranking", "ø market value": "Average market value", "ø age": "Average age"})
    .astype({'GD': 'int64'})
    .assign(Season=season_year)
)
df3

Unnamed: 0,name,Squad,Average age,Foreigners,Average market value,Total market value,Ranking,MP,GD,Pts,Season
0,SS Lazio,38.0,25.8,14.0,-,-,3,34,29,69,2000
1,Udinese Calcio,43.0,23.6,22.0,-,-,12,34,-10,38,2000
2,AS Roma,37.0,24.5,14.0,-,-,1,34,35,75,2000
3,Brescia Calcio,33.0,27.4,11.0,-,-,8,34,2,44,2000
4,Juventus FC,43.0,24.1,16.0,-,-,2,34,34,73,2000
5,Atalanta BC,39.0,24.2,5.0,-,-,7,34,4,44,2000
6,AC Parma,38.0,25.0,16.0,-,-,4,34,20,56,2000
7,AC Perugia,42.0,25.7,12.0,-,-,11,34,-4,42,2000
8,Hellas Verona,38.0,24.3,6.0,-,-,15,34,-19,37,2000
9,US Lecce,40.0,25.2,11.0,-,-,13,34,-14,37,2000


In [17]:
# Make sure the output folder exists, then write the season table as CSV.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)
csv_path = f"data/SerieA_{season_year}.csv"
df3.to_csv(csv_path)