# Data Acquisition for Penalty Data from Transfermarkt

This notebook demonstrates how penalty kick data for a player's career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup and Selenium. It is used to retrieve all penalty kicks taken by Lionel Messi and Cristiano Ronaldo from the following pages:

- [Lionel Messi Penalty Data](https://www.transfermarkt.at/lionel-messi/elfmetertore/spieler/28003/saison_id//wettbewerb_id//plus/1#tore)
- [Cristiano Ronaldo Penalty Data](https://www.transfermarkt.at/cristiano-ronaldo/elfmetertore/spieler/8198/saison_id//wettbewerb_id//plus/1#tore)

In [1]:
import requests 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re
import os

In [2]:
# HTTP request headers that identify the client as a desktop Chrome browser.
# The platform comment must be closed with ")" (the previous value ended it
# with "}", which yields a malformed User-Agent string).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

In [3]:
import time

def getSuccessfullPenaltyData(page):
    """Scrape all result pages of the first penalty box on a Transfermarkt page.

    Opens ``page`` in a fresh Chrome session, accepts the cookie-consent
    dialog, and then walks through the paginated table inside the first
    ``div.box`` (the box of successful penalties), clicking the
    "next page" link until none is left.

    Args:
        page: URL of the player's penalty page.

    Returns:
        A list of BeautifulSoup ``<table class="items">`` elements, one per
        result page on which exactly one such table was found.

    Raises:
        IndexError: if the loaded page contains no ``div.box`` element.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(page)
        initial_source = driver.page_source

        # check for correct iframe
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Number of iframes found: {len(iframes)}")

        # get cookies button and accept (best effort: a failure is reported
        # and scraping continues).
        try:
            # The consent button is looked up inside the second iframe.
            driver.switch_to.frame(iframes[1])
            # Waiting on the locator (instead of a one-shot find) gives the
            # consent dialog time to render before it is clicked.
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@title='Zustimmen & weiter']"))
            )
            print("title Name:", button.get_attribute("title"))
            button.click()
        except Exception as e:
            print("Failed to click the button:", (e))
        finally:
            # The tables live in the top-level document, so always leave the
            # iframe context again before reading the page.
            driver.switch_to.default_content()

        # init data
        penalty_tables = []
        count = 0
        while True:
            print(f"count: {count}")
            # Re-read the source on every iteration: the table content is
            # replaced dynamically when the next-page link is clicked.
            new_source = driver.page_source
            if initial_source != new_source:
                print("Dynamic content was loaded.")

            # load table using soup
            pageSoup = BeautifulSoup(new_source, "html.parser")
            # get successful penalty area to find table in it.
            boxes = pageSoup.find_all('div', class_='box')
            # get the tables within the successful penalty box div
            table = boxes[0].find_all("table", class_="items")
            print(f"len table: {len(table)}")
            # check if there is only one table, there should not be any more than that.
            if len(table) == 1:
                penalty_tables.append(table[0])

            # try to get next button and click;
            # if no button is available, then it is the last page
            try:
                elements = driver.find_elements(By.XPATH, "//div[@class='box'][1]//a[@title='Zur naechsten Seite']")
                print(f"Found {len(elements)} <a> tags.")
                if not elements:
                    break
                elements[0].click()
            except Exception as e:
                # Keep what was collected so far instead of failing the scrape.
                print("Failed to open the next page:", (e))
                break

            # needed for timing reasons: give the next page time to load
            time.sleep(4)
            count = count + 1
        return penalty_tables
    finally:
        # Always close the browser, even when scraping fails midway.
        driver.quit()

In [4]:
import time

def geMissedPenaltyData(page):
    """Scrape all result pages of the second penalty box on a Transfermarkt page.

    Opens ``page`` in a fresh Chrome session, accepts the cookie-consent
    dialog, and then walks through the paginated table inside the second
    ``div.box`` (the box of missed penalties), clicking the "next page"
    link until none is left.

    Args:
        page: URL of the player's penalty page.

    Returns:
        A list of BeautifulSoup ``<table class="items">`` elements, one per
        result page on which exactly one such table was found.

    Raises:
        ValueError: if the loaded page does not contain exactly two
            ``div.box`` elements.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(page)
        initial_source = driver.page_source

        # check for correct iframe
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Number of iframes found: {len(iframes)}")

        # get cookies button and accept (best effort: a failure is reported
        # and scraping continues).
        try:
            # The consent button is looked up inside the second iframe.
            driver.switch_to.frame(iframes[1])
            # Waiting on the locator (instead of a one-shot find) gives the
            # consent dialog time to render before it is clicked.
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@title='Zustimmen & weiter']"))
            )
            print("title Name:", button.get_attribute("title"))
            button.click()
        except Exception as e:
            print("Failed to click the button:", (e))
        finally:
            # The tables live in the top-level document, so always leave the
            # iframe context again before reading the page.
            driver.switch_to.default_content()

        # init data
        penalty_tables = []
        count = 0
        while True:
            print(f"count: {count}")
            # Re-read the source on every iteration: the table content is
            # replaced dynamically when the next-page link is clicked.
            new_source = driver.page_source
            if initial_source != new_source:
                print("Dynamic content was loaded.")

            # load table using soup
            pageSoup = BeautifulSoup(new_source, "html.parser")
            # get missed penalty area to find table in it.
            boxes = pageSoup.find_all('div', class_='box')
            if len(boxes) != 2:
                raise ValueError("invalid size for div amount")
            # get the tables within the missed penalty box div
            table = boxes[1].find_all("table", class_="items")
            print(f"len table: {len(table)}")
            # check if there is only one table, there should not be any more than that.
            if len(table) == 1:
                penalty_tables.append(table[0])

            # try to get next button and click;
            # if no button is available, then it is the last page
            try:
                elements = driver.find_elements(By.XPATH, "//div[@class='box'][2]//a[@title='Zur naechsten Seite']")
                print(f"Found {len(elements)} <a> tags.")
                if not elements:
                    break
                elements[0].click()
            except Exception as e:
                # Keep what was collected so far instead of failing the scrape.
                print("Failed to open the next page:", (e))
                break

            # needed for timing reasons: give the next page time to load
            time.sleep(4)
            count = count + 1
        return penalty_tables
    finally:
        # Always close the browser, even when scraping fails midway.
        driver.quit()

In [5]:
# Load data for Messi. Each call below opens its own Chrome session on the
# same page: the first collects the tables of successful penalties, the second
# the tables of missed penalties (one table per result page).
page_messi = "https://www.transfermarkt.at/lionel-messi/elfmetertore/spieler/28003/saison_id//wettbewerb_id//plus/1#tore"
penalty_success_messi = getSuccessfullPenaltyData(page_messi)
penalty_missed_messi = geMissedPenaltyData(page_messi)

Number of iframes found: 2
title Name: Zustimmen & weiter
count: 0
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 1
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 2
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 3
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 4
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 5
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 6
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 7
Dynamic content was loaded.
len table: 1
Found 0 <a> tags.
Number of iframes found: 2
title Name: Zustimmen & weiter
count: 0
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 1
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 2
Dynamic content was loaded.
len table: 1
Found 0 <a> tags.


In [6]:
# Load data for Ronaldo. Each call below opens its own Chrome session on the
# same page: the first collects the tables of successful penalties, the second
# the tables of missed penalties (one table per result page).
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/elfmetertore/spieler/8198/saison_id//wettbewerb_id//plus/1#tore"
penalty_success_ronaldo = getSuccessfullPenaltyData(page_ronaldo)
penalty_missed_ronaldo = geMissedPenaltyData(page_ronaldo)

Number of iframes found: 2
title Name: Zustimmen & weiter
count: 0
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 1
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 2
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 3
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 4
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 5
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 6
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 7
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 8
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 9
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 10
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 11
Dynamic content was loaded.
len table: 1
Found 1 <a> tags.
count: 12
Dynamic content was loaded.
len table: 1
Found 0 <a> tags.
Number of iframes found: 2
title Name: Zustimmen & weit

Each penalty record in the final table has the following fields (the `player_name` column is added later, when the data of both players is combined):

```
{
    "game_id",
    "saison",
    "competition_type",
    "competition",
    "team",
    "opponent",
    "date",
    "result",
    "minute",
    "score",
    "goalkeeper",
    "has_scored"
}
```

In [27]:
def _nextTag(tag, name=None):
    """Return the next element after ``tag`` in document order, or None.

    ``name`` restricts the search to elements with that tag name. A ``None``
    input is passed through so lookups can be chained without extra checks.
    """
    if tag is None:
        return None
    return tag.find_next(name) if name else tag.find_next()


def _cellText(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _swapScoreline(scoreline):
    """Return ``scoreline`` with the two goal counts exchanged.

    Only the leading ``x:y`` part is swapped; anything that follows it (for
    example a trailing marker after the score) is kept as is. Values that do
    not start with ``x:y`` (including None and "") are returned unchanged
    instead of raising.
    """
    if not scoreline:
        return scoreline
    match = re.match(r"\s*(\d+)\s*:\s*(\d+)(.*)", scoreline, re.DOTALL)
    if match is None:
        return scoreline
    first, second, rest = match.groups()
    return f"{int(second)}:{int(first)}{rest}"


# gets penalty data from row
def getPenaltyRow(row):
    """Extract one penalty record from a row of a penalty table.

    The cells are located by walking forward from ``row`` with ``find_next``,
    in this order: saison, competition, team, date, home team, result, away
    team, minute, score, goalkeeper.

    Args:
        row: the ``<tr>`` element of the penalty (any object offering
            ``find_next``; the elements it yields must offer ``text`` and
            ``get``).

    Returns:
        A dict with the keys ``saison``, ``competition``, ``team``,
        ``competition_type``, ``date``, ``home``, ``result``, ``game_id``,
        ``away``, ``opponent``, ``minute``, ``score`` and ``goalkeeper``.
        ``result`` and ``score`` are written from the point of view of the
        penalty taker's team (own goals first). A value whose element cannot
        be found is None (``home``/``away`` fall back to "").
    """
    penalty = {}

    # get saison
    saison_tag = _nextTag(row, "td")
    penalty["saison"] = _cellText(saison_tag)

    # get competition description (the element directly after the saison cell)
    competition_tag = _nextTag(saison_tag)
    penalty["competition"] = _cellText(competition_tag)

    # get team of player who took penalty
    team_tag = _nextTag(_nextTag(competition_tag, "td"), "a")
    penalty["team"] = team_tag.get("title", "no title available") if team_tag is not None else None

    # The image following the team link tells national teams from clubs:
    # images carrying the "flaggenrahmen" class are counted as international.
    img = _nextTag(team_tag, "img")
    if img is not None:
        img_classes = img.get("class") or []
        penalty["competition_type"] = "International" if "flaggenrahmen" in img_classes else "Club"
    else:
        penalty["competition_type"] = None

    # get date of game
    date_tag = _nextTag(team_tag, "td")
    penalty["date"] = _cellText(date_tag)

    # get home team during game
    home_team_tag = _nextTag(date_tag, "td")
    home_link = _nextTag(home_team_tag, "a")
    penalty["home"] = home_link.get("title", "no title available") if home_link is not None else ""

    # get final result of game; the link in the result cell carries the game id
    result_tag = _nextTag(home_team_tag, "td")
    penalty["result"] = _cellText(result_tag)
    game_id_tag = _nextTag(result_tag, "a")
    penalty["game_id"] = game_id_tag.get("id", "no id available") if game_id_tag is not None else None

    # get away team of game
    away_team_tag = _nextTag(_nextTag(result_tag, "td"), "a")
    penalty["away"] = away_team_tag.get("title", "no title available") if away_team_tag is not None else ""

    # get minute of scored penalty
    minute_tag = _nextTag(away_team_tag, "td")
    penalty["minute"] = _cellText(minute_tag)

    # get score after penalty
    score_tag = _nextTag(minute_tag, "td")
    penalty["score"] = _cellText(score_tag)

    # get goalkeeper who opposed penalty taker
    goalkeeper_tag = _nextTag(score_tag, "td")
    penalty["goalkeeper"] = _cellText(goalkeeper_tag)

    # if team is away team, swap scoreboard and current score so that the
    # penalty taker's team always comes first
    if penalty["away"] == penalty["team"]:
        penalty["opponent"] = penalty["home"]
        penalty["result"] = _swapScoreline(penalty["result"])
        penalty["score"] = _swapScoreline(penalty["score"])
    else:
        penalty["opponent"] = penalty["away"]

    return penalty

In [14]:
def loadPenaltyDataForTable(table):
    """Parse the rows of one penalty table into penalty records.

    Every ``<tr>`` of ``table`` except the first one (presumably the header
    row) is passed to ``getPenaltyRow``.

    Returns:
        A list with one penalty dict per parsed row, in table order.
    """
    all_rows = table.find_all("tr")
    return [getPenaltyRow(body_row) for body_row in all_rows[1:]]


In [28]:
# this function loads the table data into a single dataframe.
def createPenaltyDataFrameForTables(tables_success, tables_missed):
    """Combine scraped penalty tables into one DataFrame.

    Args:
        tables_success: tables holding the converted penalties.
        tables_missed: tables holding the missed penalties.

    Returns:
        A DataFrame with one row per penalty, a boolean ``has_scored`` column
        (True for rows from ``tables_success``, False for rows from
        ``tables_missed``), ``date`` parsed from ``dd.mm.yyyy`` and rows
        sorted by date, newest first. When no penalty was found at all, an
        empty DataFrame with the same columns is returned.

    Raises:
        KeyError: if a parsed penalty record lacks one of the expected fields.
    """
    columns = [
        'game_id',
        'saison',
        'competition_type',
        'competition',
        'team',
        'opponent',
        'date',
        'result',
        'minute',
        'score',
        'goalkeeper',
        'has_scored',
    ]
    # Every column except the flag is copied from the parsed penalty record.
    penalty_fields = columns[:-1]

    rows = []
    for tables, has_scored in ((tables_success, True), (tables_missed, False)):
        for table in tables:
            for penalty in loadPenaltyDataForTable(table):
                df_row = {field: penalty[field] for field in penalty_fields}
                df_row['has_scored'] = has_scored
                rows.append(df_row)

    # Passing the columns explicitly keeps the frame well-formed (and the
    # "date" column present) even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")
    df = df.sort_values(by='date', ascending=False)
    return df

In [29]:
# Build one DataFrame per player from the scraped tables. Rows from the first
# argument are flagged has_scored=True, rows from the second has_scored=False;
# the result is sorted by date, newest first.
df_messi = createPenaltyDataFrameForTables(penalty_success_messi, penalty_missed_messi)
df_ronaldo = createPenaltyDataFrameForTables(penalty_success_ronaldo, penalty_missed_ronaldo)

['tiny_wappen']
['flaggenrahmen']
['flaggenrahmen']
['flaggenrahmen']
['flaggenrahmen']
['flaggenrahmen']
['flaggenrahmen']
[]
[]
['flaggenrahmen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['flaggenrahmen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['tiny_wappen']
['

In [30]:
# Show the Ronaldo DataFrame (scored and missed penalties, newest first).
df_ronaldo

Unnamed: 0,game_id,saison,competition_type,competition,team,opponent,date,result,minute,score,goalkeeper,has_scored
0,4410956,24/25,Club,Saudi Pro League,Al-Nassr FC,Damac FC,2024-11-29,2:0,17',1:0,Amin Al-Bukhari,True
1,4284226,24/25,International,UEFA Nations League A,Portugal,Polen,2024-11-15,5:1,72',2:0,Marcin Bulka,True
184,4483579,24/25,Club,King's Cup,Al-Nassr FC,Al-Taawoun FC,2024-10-29,0:1,90',0:1,Mailson,False
2,4404909,24/25,Club,Saudi Pro League,Al-Nassr FC,Al-Shabab FC,2024-10-18,2:1,90',2:1,Seung-gyu Kim,True
3,4404887,24/25,Club,Saudi Pro League,Al-Nassr FC,Al-Orobah FC,2024-10-05,3:0,17',1:0,Gaëtan Coucke,True
...,...,...,...,...,...,...,...,...,...,...,...,...
181,55683,06/07,Club,Premier League,Manchester United,FC Watford,2007-01-31,4:0,12',1:0,Richard Lee,True
213,55630,06/07,Club,Premier League,Manchester United,Wigan Athletic,2006-12-26,3:1,50',2:0,Chris Kirkland,False
214,2372139,06/07,International,Freundschaftsspiele,Portugal,Dänemark,2006-09-01,2:4,58',1:2,Thomas Sörensen,False
182,49327,05/06,International,Weltmeisterschaft 2006,Portugal,Iran,2006-06-17,2:0,80',2:0,Ebrahim Mirzapour,True


In [11]:
# Tag every row with the player it belongs to, then stack both players into
# one frame with a fresh 0..n-1 index.
# NOTE(review): the Ronaldo label is spelled "Christiano" (the player's name
# is "Cristiano"); it is kept unchanged here because the value ends up in the
# exported data — confirm before correcting it.
df_messi["player_name"] = "Lionel Messi"
df_ronaldo["player_name"] = "Christiano Ronaldo"
df = pd.concat([df_messi, df_ronaldo], ignore_index=True)

# Move player_name to the front; all other columns keep their relative order.
columns = ['player_name'] + df.columns.drop('player_name').tolist()
df = df[columns]

In [12]:
# Store the combined penalty data as CSV inside the output folder.
folder_name = "data"
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    # Only an already existing folder is expected here; any other failure
    # (e.g. missing permissions) is a real error and must not be reported as
    # "already exists".
    print("Folder already exists")

# Build the path from folder_name so both always refer to the same directory.
df.to_csv(os.path.join(folder_name, "player_penalties.csv"), index=False, encoding="utf-8")

Folder already exists
