# Data Acquisition for International Goal Data from Transfermarkt

This notebook demonstrates how goal data for a player's international career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup. It is used to retrieve all international goals for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi](https://www.transfermarkt.de/lionel-messi/nationalmannschaft/spieler/28003/verein_id/3437/hauptwettbewerb//wettbewerb_id//start/2005-08-17/ende/2024-12-23/nurEinsatz/1/plus/1)
- [Cristiano Ronaldo](https://www.transfermarkt.com/cristiano-ronaldo/nationalmannschaft/spieler/8198/verein_id/3300/hauptwettbewerb//wettbewerb_id//start/2003-08-20/ende/2024-12-25/nurEinsatz/0/plus/1)

In [6]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [7]:
# HTTP headers sent with every request to the web server. A browser-like
# User-Agent is supplied instead of the default one used by `requests`.
# The platform group is closed with ")" (the original literal ended it with "}").
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

The resulting data will have this structure, containing all international games played.
```
{
    "game_id",
    "player_current_club",
    "tournament",
    "gameday",
    "venue_country",
    "venue_city",
    "date",
    "home_team",
    "guest_team",
    "result",
    "player_position",
    "goal_amount",
    "assists_amount",
    "own_goals_amount",
    "yellow_card",
    "yellow_red_card",
    "red_card"
}
```

In [212]:
def _textOrNone(tag):
    """Return the stripped text of ``tag``, or ``None`` when that text is empty."""
    text = tag.text.strip()
    return text or None


def _countFromCell(tag):
    """Return the integer shown in ``tag``, treating an empty cell as 0."""
    text = tag.text.strip()
    return int(text) if text else 0


def _nextAttr(tag, name, attr, fallback):
    """Return ``attr`` of the first ``name`` element after ``tag``, or ``fallback``."""
    found = tag.find_next(name)
    return fallback if found is None else found.get(attr, fallback)


def _nextText(tag, name):
    """Return the stripped text of the first ``name`` element after ``tag``, or ``None``."""
    found = tag.find_next(name)
    return None if found is None else found.text.strip()


def getGameData(table_row):
    """Extract the data of one played game from its table row.

    The cells are located by walking forward from ``table_row`` with
    ``find_next``, one cell after the other, in this order: gameday, club,
    venue, date, home team, guest team, result, position, goals, assists,
    own goals, yellow card, yellow-red card, red card.

    Parameters
    ----------
    table_row
        Element of the parsed page (a BeautifulSoup tag in this notebook) that
        exposes ``find_next``.

    Returns
    -------
    dict
        Always contains the keys ``gameday``, ``player_current_club``,
        ``venue_country``, ``venue_city``, ``date``, ``home_team``,
        ``guest_team``, ``game_id``, ``result``, ``player_position``,
        ``goal_amount``, ``assists_amount``, ``own_goals_amount``,
        ``yellow_card``, ``yellow_red_card`` and ``red_card``. Empty count
        cells become ``0``; an empty gameday or card cell becomes ``None``.
        If a cell cannot be found, that field and all following ones stay
        ``None`` instead of raising ``AttributeError``.

    Raises
    ------
    ValueError
        If a goal, assist or own-goal cell holds text that is not an integer.
    """
    game_data = dict.fromkeys((
        "gameday", "player_current_club", "venue_country", "venue_city",
        "date", "home_team", "guest_team", "game_id", "result",
        "player_position", "goal_amount", "assists_amount",
        "own_goals_amount", "yellow_card", "yellow_red_card", "red_card",
    ))

    # gameday (empty for games without one)
    gameday_tag = table_row.find_next(class_="zentriert")
    if gameday_tag is None:
        return game_data
    game_data["gameday"] = _textOrNone(gameday_tag)

    # club the player belonged to at the time of the game
    club_tag = gameday_tag.find_next(class_="zentriert")
    if club_tag is None:
        return game_data
    game_data["player_current_club"] = _nextAttr(club_tag, "a", "title", "No title available")

    # venue: the country comes from the image title, the city from the cell text.
    # This is the only cell looked up by tag name rather than by CSS class.
    venue_tag = club_tag.find_next("td")
    if venue_tag is None:
        return game_data
    game_data["venue_country"] = _nextAttr(venue_tag, "img", "title", "No country name available")
    game_data["venue_city"] = venue_tag.text.strip()

    # date of the game
    date_tag = venue_tag.find_next(class_="zentriert")
    if date_tag is None:
        return game_data
    game_data["date"] = date_tag.text.strip()

    # home team
    home_team_tag = date_tag.find_next(class_="zentriert")
    if home_team_tag is None:
        return game_data
    game_data["home_team"] = _nextAttr(home_team_tag, "img", "title", "No country name available")

    # guest team
    guest_team_tag = home_team_tag.find_next(class_="zentriert")
    if guest_team_tag is None:
        return game_data
    game_data["guest_team"] = _nextAttr(guest_team_tag, "img", "title", "No country name available")

    # result and game id (the id is an attribute of the result link)
    result_tag = guest_team_tag.find_next(class_="zentriert")
    if result_tag is None:
        return game_data
    game_data["game_id"] = _nextAttr(result_tag, "a", "id", "No game id available")
    game_data["result"] = _nextText(result_tag, "span")

    # position the player was fielded in
    position_tag = result_tag.find_next(class_="zentriert")
    if position_tag is None:
        return game_data
    game_data["player_position"] = _nextText(position_tag, "a")

    # goals, assists and own goals: three consecutive count cells.
    # Emptiness is decided on the stripped text of each cell. The original
    # own-goal branch tested len() of the tag (its child count) instead,
    # which let a whitespace-only cell reach int("").
    cell = position_tag
    for key in ("goal_amount", "assists_amount", "own_goals_amount"):
        cell = cell.find_next(class_="zentriert")
        if cell is None:
            return game_data
        game_data[key] = _countFromCell(cell)

    # yellow, yellow-red and red card: three consecutive cells whose text is
    # kept verbatim when present.
    for key in ("yellow_card", "yellow_red_card", "red_card"):
        cell = cell.find_next(class_="zentriert")
        if cell is None:
            return game_data
        game_data[key] = _textOrNone(cell)

    return game_data

In [213]:
def getAllInternationalGames(page, timeout=30):
    """Download a player's national-team page and parse every game row.

    Parameters
    ----------
    page : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of dict
        One dict per game as returned by ``getGameData``, extended with a
        ``tournament`` key holding the title of the most recent tournament
        header row seen above the game (``None`` if there was none).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page does not contain the expected games table.
    """
    response = requests.get(page, headers=headers, timeout=timeout)
    # Fail early on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    pageSoup = BeautifulSoup(response.content, 'html.parser')

    # Only tables without a class attribute are considered; the games are
    # read from the second one.
    tables = pageSoup.find_all("table", class_=False)
    if len(tables) < 2:
        raise ValueError("Games table not found on page: " + str(page))

    # The first row is not a game (presumably the table header) and is skipped.
    rows = tables[1].find_all("tr")[1:]

    tournament_row_classes = ['zentriert', 'hauptlink', 'no-border-rechts']
    gamesList = []
    current_tournament = None

    for table_row in rows:
        first_cell = table_row.find_next("td")
        if first_cell is not None and first_cell.get('class') == tournament_row_classes:
            # Tournament header row: remember its name for the games below it.
            img_tag = table_row.find_next("img")
            current_tournament = img_tag.get("title") if img_tag is not None else None
        elif table_row.get("class") == []:
            # in this case, a row is a game played.
            game = getGameData(table_row)
            game["tournament"] = current_tournament
            gamesList.append(game)

    return gamesList

In [219]:
def createDataFrameFromGames(gamesList):
    """Turn the list of parsed games into a DataFrame, one row per game.

    Every game dict must provide all of the columns listed below; a missing
    key raises ``KeyError``. Keys not listed are ignored. A confirmation
    message is printed before the frame is returned.
    """
    columns = (
        'game_id',
        'player_current_club',
        'tournament',
        'gameday',
        'venue_country',
        'venue_city',
        'date',
        'home_team',
        'guest_team',
        'result',
        'player_position',
        'goal_amount',
        'assists_amount',
        'own_goals_amount',
        'yellow_card',
        'yellow_red_card',
        'red_card',
    )

    # Copy only the known columns, in a fixed order, out of every game.
    records = [{column: game[column] for column in columns} for game in gamesList]

    frame = pd.DataFrame(records)
    print("Dataframes successfully created.")
    return frame


In [222]:
# Fetch and parse all international games for Cristiano Ronaldo.
# NOTE(review): this URL ends in nurEinsatz/0 while the Messi URL uses
# nurEinsatz=1 - confirm the two players are meant to be filtered differently.
ronaldo_page = "https://www.transfermarkt.at/cristiano-ronaldo/nationalmannschaft/spieler/8198/verein_id/3300/hauptwettbewerb//wettbewerb_id//start/2003-08-20/ende/2024-12-25/nurEinsatz/0/plus/1"
ronaldo_games = getAllInternationalGames(ronaldo_page)


In [223]:
# Fetch and parse all international games for Lionel Messi.
# NOTE(review): this URL passes its filters as query parameters
# (start=17.08.2005, ende=25.12.2024, nurEinsatz=1), unlike the path-style
# Ronaldo URL - confirm both forms are handled the same way by the site.
messi_page = "https://www.transfermarkt.at/lionel-messi/nationalmannschaft/spieler/28003/verein_id/3437/plus/1?hauptwettbewerb=&wettbewerb_id=&trainer_id=&start=17.08.2005&ende=25.12.2024&nurEinsatz=1"
messi_games = getAllInternationalGames(messi_page)

In [225]:
# Build one DataFrame per player from the scraped game lists.
df_ronaldo = createDataFrameFromGames(ronaldo_games)
df_messi = createDataFrameFromGames(messi_games)
# Last expression of the cell: display the Messi frame as the cell output.
df_messi

Dataframes successfully created.
Dataframes successfully created.


Unnamed: 0,game_id,player_current_club,tournament,gameday,venue_country,venue_city,date,home_team,guest_team,result,player_position,goal_amount,assists_amount,own_goals_amount,yellow_card,yellow_red_card,red_card
0,1059978,FC Barcelona,Freundschaftsspiele,,Ungarn,Budapest,17.08.2005,Ungarn,Argentinien,1:2,HS,0,0,0,,,65'
1,2373422,FC Barcelona,WM-Qualifikation Südamerika,Gruppe A,Paraguay,Asunción,03.09.2005,Paraguay,Argentinien,1:0,HS,0,0,0,,,
2,2373430,FC Barcelona,WM-Qualifikation Südamerika,Gruppe A,Argentinien,Buenos Aires,09.10.2005,Argentinien,Peru,2:0,HS,0,1,0,,,
3,2373435,FC Barcelona,WM-Qualifikation Südamerika,Gruppe A,Uruguay,Montevideo,13.10.2005,Uruguay,Argentinien,1:0,RM,0,0,0,,,
4,1059980,FC Barcelona,Freundschaftsspiele,,Katar,Doha,16.11.2005,Katar,Argentinien,0:3,HS,0,1,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,4359365,UnbekanntUnbekannt,Copa América 2024,Finale,Vereinigte Staaten,"Miami Gardens, Florida",15.07.2024,Argentinien,Kolumbien,1:0 n.V.,HS,0,0,0,,,
187,4420128,UnbekanntUnbekannt,WM-Qualifikation Südamerika,Gruppe A,Venezuela,Puerto Ordaz,10.10.2024,Venezuela,Argentinien,1:1,MS,0,0,0,,,
188,4420129,UnbekanntUnbekannt,WM-Qualifikation Südamerika,Gruppe A,Argentinien,Buenos Aires,16.10.2024,Argentinien,Bolivien,6:0,RA,3,2,0,,,
189,4420135,UnbekanntUnbekannt,WM-Qualifikation Südamerika,Gruppe A,Paraguay,Asunción,15.11.2024,Paraguay,Argentinien,2:1,RA,0,0,0,,,


In [226]:
# Store both DataFrames as CSV files inside the output folder.
folder_name = "data"
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    # Only an already existing path is expected here; any other failure
    # (e.g. missing permissions) propagates instead of being reported as
    # "already exists".
    print("Folder already exists")

# Build the file paths from folder_name so the folder is defined in one place.
df_messi.to_csv(os.path.join(folder_name, "messi_international_goals.csv"), index=False, encoding="utf-8")
df_ronaldo.to_csv(os.path.join(folder_name, "ronaldo_international_goals.csv"), index=False, encoding="utf-8")

Folder already exists
