# Data Acquisition for Performance Data from Transfermarkt

This notebook demonstrates how performance data for a player's career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using `requests` and BeautifulSoup. It is used to retrieve all club performance data for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi Club Performance Data](https://www.transfermarkt.at/lionel-messi/detaillierteleistungsdaten/spieler/28003/plus/1)
- [Cristiano Ronaldo Club Performance Data](https://www.transfermarkt.at/cristiano-ronaldo/detaillierteleistungsdaten/spieler/8198/plus/1)

In [190]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [191]:
# Headers sent with every HTTP request; the User-Agent identifies the client as a desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

The structure of the performance data table looks as follows:
```
{
    "saison",
    "competition_type",
    "competition",
    "club",
    "games_played",
    "goals",
    "assists",
    "owngoals",
    "substitute_in",
    "substitute_out",
    "yellow_cards",
    "yellow_red_cards",
    "red_cards",
    "penalties",
    "minutes_played"
}
```

In [192]:
def getGameData(table_row):
    """Collect the performance figures found in one row of a performance table.

    Starting at ``table_row`` the function walks forward with ``find_next``:
    the first ``td`` holds the season, the next two ``img`` tags carry the
    competition (``title`` attribute) and the club (``alt`` attribute), and
    the following ``td`` cells hold one statistic each. One ``td`` between
    the penalties and the minutes played is skipped.

    A statistic cell whose stripped text is "-" is stored as the integer 0;
    every other value is stored as the stripped cell text (a string).

    Parameters
    ----------
    table_row : tag-like object
        Row to read; it only needs to offer ``find_next``.
        NOTE(review): BeautifulSoup's ``find_next`` is not limited to the
        row's own children, so a row with fewer cells than expected would
        read cells of the following rows - confirm against the page layout.

    Returns
    -------
    dict
        May contain the keys "saison", "competition", "club",
        "games_played", "goals", "assists", "owngoals", "substitute_in",
        "substitute_out", "yellow_cards", "yellow_red_cards", "red_cards",
        "penalties" and "minutes_played". As soon as a tag cannot be found,
        its key and the keys of all later tags are left out instead of
        raising ``AttributeError``.
    """
    # statistic cells in the order in which they follow the club logo
    stat_keys = (
        "games_played",
        "goals",
        "assists",
        "owngoals",
        "substitute_in",
        "substitute_out",
        "yellow_cards",
        "yellow_red_cards",
        "red_cards",
        "penalties",
    )

    def next_tag(tag, name):
        # A missing tag has no find_next(); pass the None along so that all
        # later lookups are skipped rather than failing.
        return tag.find_next(name) if tag is not None else None

    def stat_value(tag):
        # "-" marks an empty statistic and is stored as 0
        value = tag.text.strip()
        return 0 if value == "-" else value

    game_data = {}

    # get saison if there is one
    saison_tag = next_tag(table_row, "td")
    if saison_tag is not None:
        game_data["saison"] = saison_tag.text.strip()

    # get competition where player played
    competition_tag = next_tag(saison_tag, "img")
    if competition_tag is not None:
        game_data["competition"] = competition_tag.get("title", "No title available")

    # get club where player played
    club_tag = next_tag(competition_tag, "img")
    if club_tag is not None:
        game_data["club"] = club_tag.get("alt", "no alt available")

    # get the statistic cells, one td after the other
    cell = club_tag
    for key in stat_keys:
        cell = next_tag(cell, "td")
        if cell is None:
            break
        game_data[key] = stat_value(cell)

    # get minutes played; the td directly after the penalties is skipped
    minutes_tag = next_tag(next_tag(cell, "td"), "td")
    if minutes_tag is not None:
        game_data["minutes_played"] = stat_value(minutes_tag)

    return game_data

In [193]:
def getAllClubPerformances(page):
    """Download a detailed performance page and return one dict per table row.

    The page is requested with the notebook-level ``headers``. Every
    ``table`` on the page is read with ``getGameData`` row by row, skipping
    the first two rows of each table, and each resulting dict is tagged with
    a "competition_type" taken from the ``h2.content-box-headline`` headings
    (the first heading is ignored, the remaining ones are matched to the
    tables by position).

    Parameters
    ----------
    page : str
        URL of the performance page.

    Returns
    -------
    list of dict
        One entry per data row, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    IndexError
        If a table with data rows has no matching heading.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection.
    pageTree = requests.get(page, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing the error page into an
    # empty or misleading result.
    pageTree.raise_for_status()
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    # get tables of national liga, national cup and international cup data.
    tables = pageSoup.find_all("table")
    # get competition type names; the first headline does not belong to a table
    htwos = pageSoup.find_all("h2", class_="content-box-headline")
    competition_type_names = [h2.text.strip() for h2 in htwos[1:]]

    gamesList = []
    for table_id, table in enumerate(tables):
        # exclude the first two rows that are the header and total sum
        for row in table.find_all("tr")[2:]:
            game = getGameData(row)
            game["competition_type"] = competition_type_names[table_id]
            gamesList.append(game)

    return gamesList


In [194]:
# Detailed club performance pages of both players
page_messi = "https://www.transfermarkt.at/lionel-messi/detaillierteleistungsdaten/spieler/28003/plus/1"
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/detaillierteleistungsdaten/spieler/8198/plus/1"

# Scrape every performance row from each page
messi_stats = getAllClubPerformances(page=page_messi)
ronaldo_stats = getAllClubPerformances(page=page_ronaldo)

In [195]:
def createDataFrameFromGames(gamesList):
    """Create a dataframe with one row per scraped performance entry.

    Parameters
    ----------
    gamesList : list of dict
        Entries keyed by the column names listed below. A key missing from
        an entry yields a missing value in that row instead of a
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: saison, competition_type, competition, club,
        games_played, goals, assists, owngoals, substitute_in,
        substitute_out, yellow_cards, yellow_red_cards, red_cards,
        penalties, minutes_played. The columns are present even when
        ``gamesList`` is empty.
    """
    columns = [
        'saison',
        'competition_type',
        'competition',
        'club',
        'games_played',
        'goals',
        'assists',
        'owngoals',
        'substitute_in',
        'substitute_out',
        'yellow_cards',
        'yellow_red_cards',
        'red_cards',
        'penalties',
        'minutes_played',
    ]

    # keep only the known columns; .get() tolerates entries with missing keys
    rows = [{column: game.get(column) for column in columns} for game in gamesList]

    # passing the columns explicitly fixes their order and keeps them for an empty list
    df = pd.DataFrame(rows, columns=columns)
    print("Dataframes successfully created.")
    return df


In [196]:
# Turn the scraped rows of each player into a dataframe
df_messi = createDataFrameFromGames(gamesList=messi_stats)
df_ronaldo = createDataFrameFromGames(gamesList=ronaldo_stats)

Dataframes successfully created.
Dataframes successfully created.


In [197]:
# Label the rows of each player before the two datasets are merged
df_messi["player_name"] = "Lionel Messi"
df_ronaldo["player_name"] = "Christiano Ronaldo"

# Stack both players into one frame with a fresh index
df = pd.concat((df_messi, df_ronaldo), ignore_index=True)

# Move the player name to the front, keeping the order of all other columns
columns = ['player_name']
columns.extend(name for name in df.columns if name != 'player_name')
df = df.loc[:, columns]

In [198]:
# store data
folder_name = "data"
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    # Only an already existing folder is expected here; any other failure
    # (e.g. missing permissions) must surface instead of being reported as
    # "already exists".
    print("Folder already exists")

# write into the folder created above rather than a second hard-coded path
df.to_csv(os.path.join(folder_name, "player_club_performance.csv"), index=False, encoding="utf-8")

Folder already exists
