# Data Acquisition of Achievement Data from Transfermarkt

This notebook demonstrates how achievement data for a player's career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using Requests and BeautifulSoup. It is used to retrieve all achievement data for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi Achievement Data](https://www.transfermarkt.at/lionel-messi/erfolge/spieler/28003)
- [Cristiano Ronaldo Achievement Data](https://www.transfermarkt.at/cristiano-ronaldo/erfolge/spieler/8198)

In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [2]:
# HTTP headers sent with every request to the web server.
# A browser-style User-Agent is supplied instead of the library default
# (presumably so the site serves the regular HTML page - verify if requests start failing).
# The platform token must be closed with ")" to form a well-formed User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

Each scraped achievement is stored as a record with the following structure:

```
{
    year,
    title,
    team
}
```

In [3]:
def getTitleData(table_row):
    """Return a structured representation of one row of the achievements table.

    Parameters
    ----------
    table_row : object
        A parsed ``<tr>`` element (anything offering ``find_all("td")`` whose
        cells expose ``.text`` and ``.find("img")``).

    Returns
    -------
    dict
        Always contains the keys ``"year"`` and ``"team"``:

        * row with one cell    -> ``year`` is the stripped cell text, ``team`` is ``None``
        * row with three cells -> ``year`` is the stripped text of the first cell,
          ``team`` is the ``alt`` text of the first image found in the remaining
          cells, or ``"No teamname available"`` when there is none
        * any other cell count -> a message is printed and both values are ``None``
    """
    # Pre-populate every key so callers can always read "year" and "team",
    # even for rows that turn out to be malformed.
    title_data = {"year": None, "team": None}

    tds = table_row.find_all("td")
    if len(tds) == 1:
        title_data["year"] = tds[0].text.strip()
        return title_data
    if len(tds) != 3:
        print("Invalid length for table column amount within a row")
        return title_data

    # get year
    title_data["year"] = tds[0].text.strip()

    # get team
    # Search only inside this row's own cells. A document-order search
    # (find_next) would continue into following rows and could attach another
    # row's team image to a row that has none.
    team_tag = None
    for td in tds[1:]:
        team_tag = td.find("img")
        if team_tag:
            break

    if team_tag:
        title_data["team"] = team_tag.get("alt", "No teamname available")
    else:
        title_data["team"] = "No teamname available"

    return title_data


In [4]:
def getAllAchievements(page):
    """Scrape all achievements listed under the "Alle Titel" heading of a player page.

    Parameters
    ----------
    page : str
        URL of the player's achievements page.

    Returns
    -------
    list of dict
        One dict per achievement row, as produced by ``getTitleData`` plus a
        ``"title"`` key holding the name of the title group the row belongs to.
        An empty list is returned (after printing a message) when the
        achievements table cannot be located on the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or the server answers with an HTTP
        error status.
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive server;
    # an explicit status check surfaces blocked/failed requests as what they are.
    pageTree = requests.get(page, headers=headers, timeout=30)
    pageTree.raise_for_status()
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    titleList = []

    # Match the heading text regardless of the surrounding whitespace/indentation.
    htwos = pageSoup.find_all("h2", string=re.compile(r"^\s*Alle Titel\s*$"))
    if len(htwos) < 1:
        print("Could not find all trophies table")
        return titleList

    tbody = htwos[0].find_next("tbody")
    if tbody is None:
        print("Could not find all trophies table")
        return titleList
    rows = tbody.find_all("tr")

    # Some tr elements are group headers carrying the title name; the rows that
    # follow a header are the individual wins of that title.
    current_title_name = None
    for row in rows:
        # Look only inside the row itself; rows without any cell carry no data.
        first_td = row.find("td")
        if first_td is None:
            continue

        classes = first_td.get("class") or []
        if classes and classes[0] == "hauptlink":
            # header row: remember the title name without the leading "<count>x " prefix
            current_title_name = first_td.text.strip()
            current_title_name = re.sub(r'\d+x ', '', current_title_name)
        else:
            # in this case the row is a won title which is stored
            title = getTitleData(row)
            title["title"] = current_title_name
            titleList.append(title)

    return titleList


In [5]:
# Achievement pages to scrape, one per player.
page_messi = "https://www.transfermarkt.at/lionel-messi/erfolge/spieler/28003"
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/erfolge/spieler/8198"

# Fetch both pages in order: Messi first, then Ronaldo.
messi_data, ronaldo_data = [
    getAllAchievements(player_page) for player_page in (page_messi, page_ronaldo)
]

In [6]:
def createDataFrameFromTitles(titleList):
    """Create a dataframe with the columns ``year``, ``title`` and ``team``.

    Parameters
    ----------
    titleList : list of dict
        One dict per title of the player. Keys other than ``year``, ``title``
        and ``team`` are ignored; a missing key becomes a missing value.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``titleList``. The three columns are always
        present, even when ``titleList`` is empty, so later column lookups
        such as ``df['title']`` keep working.
    """
    columns = ['year', 'title', 'team']

    # .get() keeps a single incomplete record from aborting the whole conversion.
    rows = [{column: title.get(column) for column in columns} for title in titleList]

    # Passing the column list explicitly fixes the schema for the empty case too.
    df = pd.DataFrame(rows, columns=columns)
    print("Dataframes successfully created.")
    return df


In [7]:
# Build one dataframe per player from the scraped title lists (Messi first, then Ronaldo).
df_messi, df_ronaldo = map(createDataFrameFromTitles, (messi_data, ronaldo_data))

Dataframes successfully created.
Dataframes successfully created.


In [14]:
# Label every row with its player before the two frames are stacked.
for player_frame, player_label in (
    (df_messi, "Lionel Messi"),
    (df_ronaldo, "Christiano Ronaldo"),
):
    player_frame["player_name"] = player_label

# Stack both players into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_messi, df_ronaldo], ignore_index=True)

# Reorder so that player_name is the leading column, keeping the rest in place.
columns = ['player_name']
columns.extend(col for col in df.columns if col != 'player_name')
df = df[columns]

In [22]:
# Drop rows whose title contains 'Teilnehmer' or 'Finalist';
# rows with a missing title never match (na=False) and are therefore kept.
has_teilnehmer = df['title'].str.contains('Teilnehmer', na=False)
has_finalist = df['title'].str.contains('Finalist', na=False)
df = df[~(has_teilnehmer | has_finalist)]

In [21]:
# store data
folder_name = "data"

# Only "the folder is already there" is an expected outcome. Any other failure
# (e.g. missing permissions) must surface instead of being reported as
# "Folder already exists" and then failing later while writing the CSV.
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    print("Folder already exists")

# Build the output path from folder_name so the folder is defined in one place.
df.to_csv(os.path.join(folder_name, "player_achievements.csv"), index=False, encoding="utf-8")

Folder already exists
