# Data Acquisition for Injury Data from Transfermarkt

This notebook demonstrates how injury data for a player's career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup and Selenium. It is used to retrieve all injury periods for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi Injury Data](https://www.transfermarkt.at/lionel-messi/verletzungen/spieler/28003/plus/1)
- [Cristiano Ronaldo Injury Data](https://www.transfermarkt.at/cristiano-ronaldo/verletzungen/spieler/8198/plus/1)

In [1]:
import requests 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re
import os

In [2]:
# Browser-like User-Agent header for HTTP requests sent to the web server.
# The platform comment must be closed with ")" — the previous "}" left the
# parenthesised section unbalanced and the header malformed.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

The data for the injury table will have the following structure:

```
{
    "saison",
    "injury_description",
    "start_date",
    "end_date",
    "days",
    "missed_games"
}
```

The first step of the data acquisition is to create a function that reads a single row of the injuries table and a function that reads all rows from the injuries table.

In [3]:
import time

def getInjuryDataForPlayer(page):
    """
    Open a Transfermarkt injury page in Chrome and collect the injury table
    from every pagination page.

    The table is split into multiple pages that only load after the
    "next page" link is clicked, so the page is driven with Selenium and each
    rendered state is parsed with BeautifulSoup.

    Args:
        page: URL of the page containing the injury data.

    Returns:
        injury_tables: List holding the first <table> element (BeautifulSoup
            tag) of every visited pagination page, in visiting order.
    """
    driver = webdriver.Chrome()
    # The parsed tables are plain BeautifulSoup objects, so the browser can be
    # shut down as soon as scraping ends — also when an error is raised.
    try:
        driver.get(page)
        initial_source = driver.page_source

        # The cookie-consent button is looked up inside the second iframe.
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Number of iframes found: {len(iframes)}")

        # Accepting the cookies is best effort: a failure is reported and the
        # scrape continues.
        try:
            if len(iframes) > 1:
                driver.switch_to.frame(iframes[1])
            buttons = driver.find_elements(By.XPATH, "//button[@title='Zustimmen & weiter']")
            if buttons:
                # There should be only one such button; wait until it is clickable.
                button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(buttons[0])
                )
                print("title Name:", button.get_attribute("title"))
                button.click()
        except Exception as e:
            print("Failed to click the button:", (e))
        finally:
            # The injury table is part of the main document, not the consent
            # iframe, so always return to the top-level browsing context.
            driver.switch_to.default_content()

        injury_tables = []
        while True:
            # Re-read the source on every iteration: the table content is
            # replaced dynamically after each click.
            new_source = driver.page_source
            if initial_source != new_source:
                print("Dynamic content was loaded.")

            # Parse the current state and keep the first table of the page.
            pageSoup = BeautifulSoup(new_source, "html.parser")
            table = pageSoup.find_all("table")
            injury_tables.append(table[0])

            # Try to open the next page; no "next" link means the last page
            # has been reached.
            try:
                elements = driver.find_elements(By.XPATH, "//a[@title='Zur naechsten Seite']")
                print(f"Found {len(elements)} <a> tags.")
                if not elements:
                    break
                # Click a single link only: clicking several matches would
                # advance more than one page per iteration.
                elements[0].click()
            except Exception as e:
                # Stop paginating but keep what has been collected so far.
                print("Failed to open the next page:", e)
                break

            # Give the dynamically loaded page time to render before parsing.
            time.sleep(2)
        return injury_tables
    finally:
        driver.quit()

In [4]:
# Scrape Lionel Messi's injury pages. This opens a Chrome window through
# Selenium; the result is a list with one parsed table per pagination page.
page_messi = "https://www.transfermarkt.at/lionel-messi/verletzungen/spieler/28003/plus/1"
injury_tables_messi = getInjuryDataForPlayer(page_messi)

Number of iframes found: 2
title Name: Zustimmen & weiter
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 0 <a> tags.


In [5]:
# Scrape Cristiano Ronaldo's injury pages. This opens a Chrome window through
# Selenium; the result is a list with one parsed table per pagination page.
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/verletzungen/spieler/8198/plus/1"
injury_tables_ronaldo = getInjuryDataForPlayer(page_ronaldo)

Number of iframes found: 2
title Name: Zustimmen & weiter
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 0 <a> tags.


In [6]:
def getInjuryRow(row):
    """
    Read the data from a single row of an injuries table.

    The row is expected to hold six <td> cells in this order: season, injury
    description, start date, end date, duration in days, missed games.

    Args:
        row: Row (<tr> element) of the injuries table.

    Returns:
        injury: Dict with the keys "saison", "injury_description",
            "start_date", "end_date", "days" (int) and "missed_games" (int).
            "injury_description" is left out when the injury cell carries the
            "bg_rot_20" class (ongoing injury).

    Raises:
        ValueError: If the row has fewer than six cells or one of the numeric
            cells cannot be parsed.
    """
    # Read the row's own cells instead of chaining find_next(): the chain walks
    # the whole document and can run into child tags or the following row.
    cells = row.find_all("td", recursive=False)
    if len(cells) < 6:
        raise ValueError(f"Expected 6 cells in an injury row, found {len(cells)}")
    saison_tag, injury_tag, start_tag, end_tag, days_tag, missed_games_tag = cells[:6]

    injury = {"saison": saison_tag.text.strip()}

    # Exclude running injuries, using the same marker class the table loader
    # looks for.
    injury_classes = injury_tag.get("class") or []
    if "bg_rot_20" not in injury_classes:
        injury["injury_description"] = injury_tag.text.strip()

    injury["start_date"] = start_tag.text.strip()
    injury["end_date"] = end_tag.text.strip()

    # Duration is rendered as "<n> Tage"; accept the singular "Tag" and a bare
    # number as well.
    days_text = days_tag.text.strip()
    days_match = re.fullmatch(r"(\d+)(?:\s*Tage?)?", days_text)
    if days_match is None:
        raise ValueError(f"Cannot parse injury duration from {days_text!r}")
    injury["days"] = int(days_match.group(1))

    # The number of missed games sits in a <span> when one exists, otherwise
    # directly in the cell; "-" means no game was missed.
    span_tag = missed_games_tag.find("span")
    games_source = missed_games_tag if span_tag is None else span_tag
    games_text = games_source.text.strip()
    injury["missed_games"] = 0 if games_text == "-" else int(games_text)

    return injury

In [7]:
def loadInjuryDataForPlayer(table):
    """
    Loads the injury data from the table into a list.

    The first row of the table is skipped, as is every row that contains an
    element with the "bg_rot_20" class (currently ongoing injuries).

    Args:
        table: Injury table.

    Returns:
        injury_data: List with the data representation of each remaining row.
    """
    data_rows = table.find_all("tr")[1:]
    # Lazily drop ongoing injuries so each row is checked right before parsing.
    finished_rows = (
        entry for entry in data_rows if not entry.find(class_="bg_rot_20")
    )
    return [getInjuryRow(entry) for entry in finished_rows]


In [8]:
# this function loads the table data into a single dataframe.
def createDataFrameForTables(tables):
    """
    Creates a dataframe from the injury table data.

    Args:
        tables: list of table data.

    Returns:
        df: dataframe with the columns "saison", "injury_description",
            "start_date", "end_date", "days" and "missed_games"; both date
            columns are converted to datetimes (source format "%d.%m.%Y").
            The columns are present even when no injury rows were found.

    Raises:
        KeyError: If a parsed injury lacks one of the expected keys.
    """
    columns = [
        "saison",
        "injury_description",
        "start_date",
        "end_date",
        "days",
        "missed_games",
    ]
    # Pick the keys explicitly so an incomplete injury fails loudly here.
    rows = [
        {column: injury[column] for column in columns}
        for table in tables
        for injury in loadInjuryDataForPlayer(table)
    ]

    # Passing the columns keeps the frame well-formed when `rows` is empty;
    # without them the date conversion below would raise a KeyError.
    df = pd.DataFrame(rows, columns=columns)
    df["start_date"] = pd.to_datetime(df["start_date"], format="%d.%m.%Y")
    df["end_date"] = pd.to_datetime(df["end_date"], format="%d.%m.%Y")
    return df

Now we can create a dataframe for each player, combine them into a single dataframe and store the result as a .csv file.

In [9]:
# Build one dataframe per player from the scraped tables; the date columns
# are converted to datetimes inside createDataFrameForTables.
df_ronaldo = createDataFrameForTables(injury_tables_ronaldo)
df_messi = createDataFrameForTables(injury_tables_messi)

In [11]:
# Label every row with its player so the rows stay attributable after merging
df_ronaldo["player_name"] = "Christiano Ronaldo"
df_messi["player_name"] = "Lionel Messi"

# Stack both players (Messi first) and move the player name to the front
df = pd.concat([df_messi, df_ronaldo], ignore_index=True)
columns = ["player_name", *(name for name in df.columns if name != "player_name")]
df = df[columns]

In [12]:
# Store the combined injury data as a CSV file inside the data folder.
folder_name = "data"
# Check for the folder explicitly: catching every exception from makedirs
# would report unrelated failures (e.g. missing permissions) as
# "Folder already exists" and hide the real error.
if os.path.isdir(folder_name):
    print("Folder already exists")
else:
    os.makedirs(folder_name)
    print("Folder created for storing injury data")

# Build the path from folder_name so the file lands in the folder prepared above.
df.to_csv(os.path.join(folder_name, "player_injuries.csv"), index=False, encoding="utf-8")

Folder already exists
