# Data Acquisition for Injury Data from Transfermarkt

This notebook demonstrates how injury data for a player's career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup and Selenium. It is used to retrieve all injury periods for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi Injury Data](https://www.transfermarkt.at/lionel-messi/verletzungen/spieler/28003/plus/1)
- [Cristiano Ronaldo Injury Data](https://www.transfermarkt.at/cristiano-ronaldo/verletzungen/spieler/8198/plus/1)

In [224]:
import requests 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re
import os

In [225]:
# HTTP request headers that identify the client as a desktop Chrome browser.
# The platform token must be closed with ")" (it was "}" before), otherwise the
# User-Agent string is malformed.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

In [231]:
import time

def getInjuryDataForPlayer(page):
    """Scrape every page of a player's paginated injury table.

    Opens ``page`` in a Chrome session, tries to accept the cookie-consent
    dialog (looked up inside the second iframe of the page), then repeatedly
    parses the current page source and clicks the "next page" link until no
    such link is found.

    Parameters
    ----------
    page : str
        URL of the player's injury overview page.

    Returns
    -------
    list
        The first ``<table>`` tag (BeautifulSoup) of every visited result
        page, in visiting order. May be empty if no table was found.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(page)
        initial_source = driver.page_source

        # check for correct iframe
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Number of iframes found: {len(iframes)}")

        # get cookies button and accept
        try:
            # Switching happens inside the try so that a page without the
            # consent iframe is reported instead of aborting the whole scrape.
            driver.switch_to.frame(iframes[1])
            buttons = driver.find_elements(By.XPATH, "//button[@title='Zustimmen & weiter']")
            # go through all buttons (should be only 1) and click
            for button in buttons:
                button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(button)
                )
                print("title Name:", button.get_attribute("title"))
                button.click()
        except Exception as e:
            print("Failed to click the button:", (e))

        # The injury table and the pagination links live in the top-level
        # document, so leave the consent iframe explicitly.
        driver.switch_to.default_content()

        # init data
        injury_tables = []
        notLastPage = True
        while notLastPage:
            # The page source has to be re-read on every iteration: each click
            # on "next page" replaces the table content.
            new_source = driver.page_source
            if initial_source != new_source:
                print("Dynamic content was loaded.")

            # load table using soup
            pageSoup = BeautifulSoup(new_source, "html.parser")
            tables = pageSoup.find_all("table")
            if not tables:
                # Nothing to scrape on this page; return what was collected.
                print("No table found on the current page.")
                break
            injury_tables.append(tables[0])

            # try to get next button and click
            # if no button available, then its the last page
            try:
                elements = driver.find_elements(By.XPATH, "//a[@title='Zur naechsten Seite']")
                print(f"Found {len(elements)} <a> tags.")
                for element in elements:
                    element.click()
                if len(elements) == 0:
                    notLastPage = False
            except Exception as e:
                # stop the loop, but say why instead of failing silently
                print("Failed to open the next page:", (e))
                notLastPage = False

            # needed for timing reasons (gives the next page time to render)
            time.sleep(2)
        return injury_tables
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome
        # process (also when scraping fails half-way).
        driver.quit()

In [232]:
# Scrape all pages of Lionel Messi's injury table (one table tag per result page).
page_messi = "https://www.transfermarkt.at/lionel-messi/verletzungen/spieler/28003/plus/1"
injury_tables_messi = getInjuryDataForPlayer(page_messi)

Number of iframes found: 2
title Name: Zustimmen & weiter
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 0 <a> tags.


In [233]:
# Scrape all pages of Cristiano Ronaldo's injury table (one table tag per result page).
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/verletzungen/spieler/8198/plus/1"
injury_tables_ronaldo = getInjuryDataForPlayer(page_ronaldo)

Number of iframes found: 2
title Name: Zustimmen & weiter
Dynamic content was loaded.
Found 1 <a> tags.
Dynamic content was loaded.
Found 0 <a> tags.


The data for the injury table will have the following structure:

```
{
    "saison",
    "injury_description",
    "start_date",
    "end_date",
    "days",
    "missed_games"
}
```

In [234]:
# gets injury data from row
def getInjuryRow(row):
    """Extract one injury record from a row of the injury table.

    The cells are reached by walking forward through the document with
    ``find_next`` starting at ``row``, in this order: season, injury
    description, start date, end date, duration, missed games.

    Parameters
    ----------
    row : bs4.element.Tag
        A ``<tr>`` tag of the injury table.

    Returns
    -------
    dict
        Always contains the keys ``saison``, ``injury_description``,
        ``start_date``, ``end_date`` (stripped cell text), ``days`` and
        ``missed_games`` (ints). A value is ``None`` when its cell is missing
        or contains no number; ``missed_games`` is ``0`` when the cell shows
        ``-``. ``injury_description`` stays ``None`` for a cell marked as a
        running injury.
    """
    injury = dict.fromkeys(
        ("saison", "injury_description", "start_date", "end_date", "days", "missed_games")
    )

    # get saison
    saison_tag = row.find_next("td")
    if saison_tag is None:
        return injury
    injury["saison"] = saison_tag.text.strip()

    # get injury description
    injury_tag = saison_tag.find_next()
    if injury_tag is None:
        return injury
    # exclude running injuries
    if injury_tag.get("class") != ['hauptlink', 'bg_rot_20']:
        injury["injury_description"] = injury_tag.text.strip()

    # get start date of injury
    start_tag = injury_tag.find_next()
    if start_tag is None:
        return injury
    injury["start_date"] = start_tag.text.strip()

    # get end date of injury
    end_tag = start_tag.find_next()
    if end_tag is None:
        return injury
    injury["end_date"] = end_tag.text.strip()

    # get day amount of injury
    days_tag = end_tag.find_next()
    if days_tag is None:
        return injury
    # Take the leading number instead of stripping a fixed " Tage" suffix, so
    # other unit spellings (e.g. a singular form) do not break the conversion.
    days_match = re.search(r"\d+", days_tag.text)
    if days_match:
        injury["days"] = int(days_match.group())

    # get amount of missed games
    missed_games_tag = days_tag.find_next()
    if missed_games_tag is None:
        return injury
    span_tag = missed_games_tag.find("span")
    # the number sits in a <span> when one exists, otherwise directly in the cell
    missed_source = missed_games_tag if span_tag is None else span_tag
    missed_text = missed_source.text.strip()
    if missed_text == "-":
        injury["missed_games"] = 0
    else:
        missed_match = re.search(r"\d+", missed_text)
        if missed_match:
            injury["missed_games"] = int(missed_match.group())

    return injury

In [235]:
def loadInjuryDataForPlayer(table):
    """Turn one injury table into a list of injury records.

    The first ``<tr>`` of ``table`` is skipped, and so is every row that
    contains an element with the class ``bg_rot_20`` (currently ongoing
    injuries). Each remaining row is converted with ``getInjuryRow``.

    Parameters
    ----------
    table : bs4.element.Tag
        A ``<table>`` tag holding the injury rows.

    Returns
    -------
    list
        One record per kept row, in table order.
    """
    data_rows = table.find_all("tr")[1:]
    return [
        getInjuryRow(table_row)
        for table_row in data_rows
        if not table_row.find(class_="bg_rot_20")
    ]


In [236]:
# this function loads the table data into a single dataframe.
def createDataFrameForTables(tables):
    """Combine the injury records of several tables into one DataFrame.

    Parameters
    ----------
    tables : iterable
        Table tags as accepted by ``loadInjuryDataForPlayer``.

    Returns
    -------
    pandas.DataFrame
        Columns ``saison``, ``injury_description``, ``start_date``,
        ``end_date``, ``days`` and ``missed_games``; the two date columns are
        parsed from ``DD.MM.YYYY`` text into datetimes. When there are no
        records the frame is empty but still has all columns.
    """
    columns = ["saison", "injury_description", "start_date", "end_date", "days", "missed_games"]
    rows = [
        {column: injury.get(column) for column in columns}
        for table in tables
        for injury in loadInjuryDataForPlayer(table)
    ]

    # Passing the column list keeps the schema stable for an empty result;
    # without it the date conversion below fails on the missing columns.
    df = pd.DataFrame(rows, columns=columns)
    for date_column in ("start_date", "end_date"):
        df[date_column] = pd.to_datetime(df[date_column], format="%d.%m.%Y")
    return df

In [237]:
# Build one DataFrame per player from the scraped tables; the last line displays Messi's.
df_ronaldo = createDataFrameForTables(injury_tables_ronaldo)
df_messi = createDataFrameForTables(injury_tables_messi)
df_messi

Unnamed: 0,saison,injury_description,start_date,end_date,days,missed_games
0,24/25,Bänderverletzung,2024-07-16,2024-09-12,58,10
1,23/24,Schonung,2024-05-24,2024-05-27,3,1
2,23/24,Beinverletzung,2024-05-14,2024-05-18,4,1
3,23/24,muskuläre Probleme,2024-03-15,2024-04-05,21,6
4,23/24,Schonung,2024-03-09,2024-03-12,3,1
5,23/24,Muskelverletzung,2023-09-22,2023-10-06,14,4
6,23/24,Muskelverletzung,2023-09-06,2023-09-18,12,4
7,22/23,Knieprobleme,2023-02-08,2023-02-13,5,2
8,22/23,Schonung,2022-12-26,2023-01-03,8,2
9,22/23,Achillesfersenprobleme,2022-11-05,2022-11-07,2,1


In [238]:
# Store both injury DataFrames as CSV files in the output folder.
folder_name = "data"
# Check for the folder explicitly: catching every exception from makedirs
# would report unrelated failures (e.g. missing permissions) as "already exists".
if os.path.isdir(folder_name):
    print("Folder already exists")
else:
    os.makedirs(folder_name, exist_ok=True)
    print("Folder created for storing injury data")

df_messi.to_csv(os.path.join(folder_name, "messi_injuries.csv"), index=False, encoding="utf-8")
df_ronaldo.to_csv(os.path.join(folder_name, "ronaldo_injuries.csv"), index=False, encoding="utf-8")

Folder already exists
