In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import os
import re

In [2]:
# HTTP headers passed to requests.get() in transfer_markt_soup(); the
# User-Agent is a desktop Chrome string.
# NOTE(review): presumably needed because the site rejects the default
# python-requests agent — confirm.
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

# A table cell whose first character is one of these is treated as a money
# value by the club-table parsing cell.
currency_symbols = ["€", "$", "£"]

def transfer_markt_soup(season_year=2020):
    """Download and parse the Transfermarkt Serie A overview page for a season.

    :param season_year: value sent as the ``saison_id`` query parameter
    :return: ``BeautifulSoup`` document built from the response body
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the server does not respond in time
    """
    url = "https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/"
    payload = {'saison_id': season_year}

    # requests never times out by default, so an unresponsive server would
    # block the notebook indefinitely without an explicit timeout.
    page_req = requests.get(url, headers=headers, params=payload, timeout=30)
    print(page_req.url)

    # Fail here with a clear HTTP error instead of parsing an error page and
    # hitting an AttributeError on a missing table further down.
    page_req.raise_for_status()

    return BeautifulSoup(page_req.content, "html.parser")

In [3]:
# Season to scrape. The same value is later written to the 'Season' column
# and used in the name of the exported CSV file.
season_year = 2010
# Parsed overview page; both tables below are read from this document.
page_soup = transfer_markt_soup(season_year)

https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2010


## Club Main Data: Foreigners, Market Value, and Squad Size

In [4]:
# Element with id 'yw1' and the table of class 'items' inside it; the header
# and body cells of the club overview are read from this table.
table_div = page_soup.find(id='yw1')
table_main = table_div.find('table', class_="items")

In [5]:
# Column labels: the text of every <th> in the club table, in document order.
table_head = [header_cell.text for header_cell in table_main.find_all("th")]

In [6]:
table_head

['Club',
 'name',
 'Squad',
 'ø age',
 'Foreigners',
 'ø market value',
 'Total market value']

In [7]:
# Money cells look like "<symbol><number><unit>", e.g. "€346.95m". The unit
# has to be interpreted rather than chopped off: blindly dropping the last
# character would store "€900k" as 900.0 next to "€346.95m" as 346.95.
# NOTE(review): "Th." is accepted as a thousands marker alongside "k" —
# confirm against the notation the live page uses.
money_pattern = re.compile(r"^([0-9]+(?:\.[0-9]+)?)\s*(k|th\.|m|bn)$", re.IGNORECASE)


def _money_in_millions(text):
    """Convert '<number><unit>' (currency symbol already removed) to millions.

    :param text: e.g. "346.95m", "900k" or "1.20bn"
    :return: the amount as a float expressed in millions, or ``None`` when
        ``text`` is not in a recognised format
    """
    found = money_pattern.match(text)
    if found is None:
        return None
    amount = float(found.group(1))
    unit = found.group(2).lower()
    if unit in ("k", "th."):
        return amount / 1000
    if unit == "bn":
        return amount * 1000
    return amount  # already in millions


# One list per table row. Each cell becomes a float when it is numeric or a
# recognised money value (in millions), otherwise its stripped text
# ("" for cells without text).
table_body = []
rows = table_main.find("tbody").find_all("tr")
for row in rows:
    row_data = []
    for cell in row.find_all("td"):
        cell_text = cell.get_text().strip()
        try:
            cell_value_final = float(cell_text)
        except ValueError:
            # Default to the text itself; only override it for money values
            # that can actually be parsed, so odd formats stay visible in the
            # frame instead of aborting the whole scrape.
            cell_value_final = cell_text
            if cell_text.startswith(tuple(currency_symbols)):
                money = _money_in_millions(cell_text[1:].strip())
                if money is not None:
                    cell_value_final = money
        row_data.append(cell_value_final)
    table_body.append(row_data)

In [8]:
# Making pandas dataframe to clean up data 
main_df = pd.DataFrame(data=table_body, columns=table_head)
main_df = main_df.drop('Club',axis=1)
main_df

Unnamed: 0,name,Squad,ø age,Foreigners,ø market value,Total market value
0,FC Internazionale,44.0,25.6,27.0,7.89,346.95
1,AC Milan,45.0,27.2,22.0,6.4,288.08
2,Juventus FC,51.0,25.5,15.0,5.08,258.93
3,ACF Fiorentina,42.0,24.4,22.0,4.18,175.48
4,AS Roma,43.0,26.2,22.0,4.02,172.98
5,SSC Napoli,31.0,27.8,10.0,5.5,170.43
6,Genoa CFC,45.0,24.8,22.0,3.74,168.18
7,Udinese Calcio,40.0,24.1,23.0,3.43,137.05
8,UC Sampdoria,46.0,24.0,14.0,2.72,125.05
9,US Palermo,45.0,23.7,16.0,2.73,123.03


## Club Ranking General Data

In [9]:
# Element with id 'yw4' and the table of class 'items' inside it; the ranking
# data is read from this table. Both names are rebound here, replacing the
# club-overview table used above.
table_div = page_soup.find(id='yw4')
table_main = table_div.find('table', class_="items")

In [10]:
# Column labels of the ranking table. A header that consists solely of a
# non-breaking space is given the label 'MP'.
table_head = [
    'MP' if header_cell.text == '\xa0' else header_cell.text
    for header_cell in table_main.find_all("th")
]

In [11]:
table_head

['#', 'Club', 'MP', '+/-', 'Pts']

In [12]:
# One list per ranking row. Integer cells (including signed ones such as the
# goal difference "-5") become ints, so numeric columns are not left as a mix
# of ints and strings; every other non-empty cell is kept as stripped text.
table_body = []
rows = table_main.find("tbody").find_all("tr")
for row in rows:
    row_data = []
    for cell in row.find_all("td"):
        cell_text = cell.get_text().strip()
        if cell_text == "":
            # Cells without text are dropped so that the remaining values
            # line up with the header labels.
            continue
        if re.fullmatch(r"[+-]?[0-9]+", cell_text):
            row_data.append(int(cell_text))
        else:
            row_data.append(cell_text)
    table_body.append(row_data)

In [13]:
# Ranking table as a DataFrame; table_body and table_head here are the ones
# rebuilt from the 'yw4' table, not the club-overview versions.
df2 = pd.DataFrame(data=table_body, columns=table_head)
df2

Unnamed: 0,#,Club,MP,+/-,Pts
0,1,AC Milan,38,41,82
1,2,Inter,38,27,76
2,3,SSC Napoli,38,20,70
3,4,Udinese Calcio,38,22,66
4,5,Lazio,38,16,66
5,6,AS Roma,38,7,63
6,7,Juventus,38,10,58
7,8,US Palermo,38,-5,56
8,9,Fiorentina,38,5,51
9,10,Genoa,38,-2,51


## Merging the two datasets

In [14]:
# Fuzzy Matching team names in preparation of the merge
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    Attach to each row of the left table the best-matching key(s) of the right table.

    No join is performed here: the result is meant to be merged with ``df_2``
    afterwards on the added ``matches`` column.

    :param df_1: the left table to join (not modified)
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score, as returned by ``process.extract``, a candidate needs to be kept
    :param limit: the number of candidates requested per key, sorted high to low
    :return: copy of ``df_1`` with an extra ``matches`` column holding the kept
        candidates joined by ``', '`` (an empty string when none reaches the
        threshold). A row with more than one kept candidate therefore does not
        equal any single value of ``key2``.
    """
    candidates = df_2[key2].tolist()

    def _kept_matches(value):
        # process.extract yields (candidate, score) pairs for a list of choices
        scored = process.extract(value, candidates, limit=limit)
        return ', '.join([pair[0] for pair in scored if pair[1] >= threshold])

    # Work on a copy so the caller's frame does not silently gain a column.
    result = df_1.copy()
    result['matches'] = df_1[key1].apply(_kept_matches)

    return result

In [15]:
# Club overview plus a 'matches' column: for each 'name', the ranking-table
# 'Club' value(s) whose score is at least 80.
left_main_df = fuzzy_merge(main_df, df2, 'name', 'Club', threshold=80)
# Plain alias: right_df2 refers to the same DataFrame object as df2 (no copy).
right_df2 = df2

In [16]:
# Join the club overview with the ranking table on the fuzzy-matched name,
# drop the helper/duplicate key columns, give the columns readable names and
# tag every row with the scraped season. merge() uses its default join type,
# so only rows whose 'matches' value equals a 'Club' value are kept.
df3 = (
    left_main_df
    .merge(right_df2, left_on='matches', right_on='Club')
    .drop(columns=['matches', 'Club'])
    .rename(columns={
        "+/-": "GD",
        "#": "Ranking",
        "ø market value": "Average market value",
        "ø age": "Average age",
    })
    .astype({'GD': 'int64'})
    .assign(Season=season_year)
)
df3

Unnamed: 0,name,Squad,Average age,Foreigners,Average market value,Total market value,Ranking,MP,GD,Pts,Season
0,FC Internazionale,44.0,25.6,27.0,7.89,346.95,2,38,27,76,2010
1,AC Milan,45.0,27.2,22.0,6.4,288.08,1,38,41,82,2010
2,Juventus FC,51.0,25.5,15.0,5.08,258.93,7,38,10,58,2010
3,ACF Fiorentina,42.0,24.4,22.0,4.18,175.48,9,38,5,51,2010
4,AS Roma,43.0,26.2,22.0,4.02,172.98,6,38,7,63,2010
5,SSC Napoli,31.0,27.8,10.0,5.5,170.43,3,38,20,70,2010
6,Genoa CFC,45.0,24.8,22.0,3.74,168.18,10,38,-2,51,2010
7,Udinese Calcio,40.0,24.1,23.0,3.43,137.05,4,38,22,66,2010
8,UC Sampdoria,46.0,24.0,14.0,2.72,125.05,18,38,-16,36,2010
9,US Palermo,45.0,23.7,16.0,2.73,123.03,8,38,-5,56,2010


In [17]:
# Make sure the output folder exists (no error if it already does), then
# write the merged season table to data/SerieA_<season_year>.csv.
# to_csv is called without index=False, so the row index is written as the
# first, unnamed column.
os.makedirs("data", exist_ok=True)  
df3.to_csv(f"data/SerieA_{season_year}.csv")  