# Data Acquisition for International Performance Data from Transfermarkt

This notebook demonstrates how performance data for a player's international career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup. It is used to retrieve all international performances for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi International Performance](https://www.transfermarkt.de/lionel-messi/nationalmannschaft/spieler/28003/verein_id/3437/hauptwettbewerb//wettbewerb_id//start/2005-08-17/ende/2024-12-23/nurEinsatz/1/plus/1)
- [Cristiano Ronaldo International Performance](https://www.transfermarkt.com/cristiano-ronaldo/nationalmannschaft/spieler/8198/verein_id/3300/hauptwettbewerb//wettbewerb_id//start/2003-08-20/ende/2024-12-25/nurEinsatz/0/plus/1)

In [32]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [33]:
# HTTP headers sent with every request to the web server; a browser-like
# User-Agent is supplied. The platform token is closed with ")" (it was
# previously terminated by a stray "}", which made the User-Agent malformed).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

The resulting data will have this structure, containing all international games played.
```
{
    "game_id",
    "player_current_club",
    "tournament",
    "gameday",
    "venue_country",
    "venue_city",
    "date",
    "home_team",
    "guest_team",
    "result",
    "player_position",
    "goal_amount",
    "assists_amount",
    "own_goals_amount",
    "substitute_in",
    "substitute_out",
    "yellow_card",
    "yellow_red_card",
    "red_card",
    "minutes_played"
}
```

In [72]:
# Keys that getGameData always returns; each defaults to None until the
# corresponding table cell has been read.
_GAME_DATA_KEYS = (
    "gameday",
    "player_current_club",
    "venue_country",
    "venue_city",
    "date",
    "home_team",
    "guest_team",
    "opponent",
    "team",
    "venue",
    "game_id",
    "result",
    "player_position",
    "goal_amount",
    "assists_amount",
    "own_goals_amount",
    "yellow_card",
    "yellow_red_card",
    "red_card",
    "substitute_in",
    "substitute_out",
    "minutes_played",
)


def _next_tag(tag, *args, **kwargs):
    """Return tag.find_next(*args, **kwargs), or None when tag itself is None."""
    if tag is None:
        return None
    return tag.find_next(*args, **kwargs)


def _stripped_text(tag):
    """Return the whitespace-stripped text of tag, or "" when tag is None."""
    if tag is None:
        return ""
    return tag.text.strip()


def _text_or_none(tag):
    """Return the stripped text of tag, or None when the tag is missing or empty."""
    text = _stripped_text(tag)
    return text if text else None


def _count_or_zero(tag):
    """Return the cell text as an int; an empty cell counts as 0."""
    text = _stripped_text(tag)
    return int(text) if text else 0


def _attribute_or_none(tag, attribute, fallback):
    """Return tag.get(attribute, fallback), or None when tag is None."""
    if tag is None:
        return None
    return tag.get(attribute, fallback)


def _player_perspective_result(result_span):
    """Return the score with the player's team listed first.

    A span whose class list is ['greentext'] is treated as a win (larger score
    first), ['redtext'] as a loss (smaller score first). Any other span is
    stored as-is, keeping only its first whitespace-separated token. A second
    token, when present on a win/loss, is appended after a space.
    Returns None when the span is missing or has no text.
    """
    if result_span is None:
        return None
    tokens = result_span.text.split()
    if not tokens:
        return None

    outcome = result_span.get("class", "no class available")
    if outcome == ['greentext']:
        player_won = True
    elif outcome == ['redtext']:
        player_won = False
    else:
        return tokens[0]

    first, second = map(int, tokens[0].split(":"))
    high, low = max(first, second), min(first, second)
    result = f"{high}:{low}" if player_won else f"{low}:{high}"
    if len(tokens) > 1:
        result = result + " " + tokens[1]
    return result


def getGameData(table_row, home_team):
    """Extract the game and player statistics for one table row of games played.

    The cells are located by walking forward from ``table_row`` with
    ``find_next``, in the fixed order: gameday, club, venue, date, home team,
    guest team, result, position, goals, assists, own goals, yellow card,
    yellow-red card, red card, substitution in, substitution out, minutes.

    NOTE(review): ``find_next`` is not limited to the current row, so a row
    with fewer cells than listed above would presumably pick up cells of the
    following row - confirm against the page markup.

    Parameters
    ----------
    table_row : BeautifulSoup tag
        The table row element describing a single game.
    home_team : str
        Name of the team the player plays for. It decides which side of the
        scoreboard is the player's team, because that team can appear as
        either the home or the guest team.

    Returns
    -------
    dict
        One entry for every key in ``_GAME_DATA_KEYS``. A value is None when
        its cell could not be found, and for empty gameday, card and
        substitution cells. Empty goal/assist/own-goal cells become 0, and an
        empty minutes cell becomes 0.

    Raises
    ------
    ValueError
        If a goal/assist/own-goal cell or a won/lost score is not numeric.
    """
    game_data = dict.fromkeys(_GAME_DATA_KEYS)

    # gameday, if there is one (an empty cell is stored as None)
    gameday_tag = _next_tag(table_row, class_="zentriert")
    game_data["gameday"] = _text_or_none(gameday_tag)

    # club the player belonged to at the time of the game
    club_tag = _next_tag(gameday_tag, class_="zentriert")
    game_data["player_current_club"] = _attribute_or_none(
        _next_tag(club_tag, "a"), "title", "No title available"
    )

    # venue country and city where the game was played
    venue_tag = _next_tag(club_tag, "td")
    if venue_tag is not None:
        game_data["venue_country"] = _attribute_or_none(
            _next_tag(venue_tag, "img"), "title", "No country name available"
        )
        game_data["venue_city"] = _stripped_text(venue_tag)

    # date of the game
    date_tag = _next_tag(venue_tag, class_="zentriert")
    if date_tag is not None:
        game_data["date"] = _stripped_text(date_tag)

    # home and guest team of the game
    home_team_tag = _next_tag(date_tag, class_="zentriert")
    game_data["home_team"] = _attribute_or_none(
        _next_tag(home_team_tag, "img"), "title", "No country name available"
    )
    guest_team_tag = _next_tag(home_team_tag, class_="zentriert")
    game_data["guest_team"] = _attribute_or_none(
        _next_tag(guest_team_tag, "img"), "title", "No country name available"
    )

    # Decide which side is the player's team; skipped when neither team is known.
    if game_data["home_team"] is not None or game_data["guest_team"] is not None:
        if game_data["guest_team"] == home_team:
            game_data["opponent"] = game_data["home_team"]
            game_data["team"] = game_data["guest_team"]
            game_data["venue"] = "A"
        else:
            game_data["opponent"] = game_data["guest_team"]
            game_data["team"] = game_data["home_team"]
            game_data["venue"] = "H"

    # result and game id
    result_tag = _next_tag(guest_team_tag, class_="zentriert")
    if result_tag is not None:
        game_data["game_id"] = _attribute_or_none(
            _next_tag(result_tag, "a"), "id", "No game id available"
        )
        game_data["result"] = _player_perspective_result(_next_tag(result_tag, "span"))

    # player position during the game
    position_tag = _next_tag(result_tag, class_="zentriert")
    position_link = _next_tag(position_tag, "a")
    if position_link is not None:
        game_data["player_position"] = _stripped_text(position_link)

    # goals, assists and own goals: empty cell means 0.
    # The emptiness check is done on the cell TEXT (the own-goal branch used to
    # check the tag's child count, which broke on whitespace-only cells).
    cell = position_tag
    for key in ("goal_amount", "assists_amount", "own_goals_amount"):
        cell = _next_tag(cell, class_="zentriert")
        if cell is not None:
            game_data[key] = _count_or_zero(cell)

    # yellow, yellow-red and red card: empty cell means no card (None)
    for key in ("yellow_card", "yellow_red_card", "red_card"):
        cell = _next_tag(cell, class_="zentriert")
        game_data[key] = _text_or_none(cell)

    # substitution in / out: empty cell means no substitution (None)
    for key in ("substitute_in", "substitute_out"):
        cell = _next_tag(cell, "td", class_="zentriert")
        game_data[key] = _text_or_none(cell)

    # minutes played: the next cell regardless of its class; empty means 0
    min_played_tag = _next_tag(cell, "td")
    if min_played_tag is not None:
        game_data["minutes_played"] = _stripped_text(min_played_tag) or 0
    else:
        # report the game whose minutes cell could not be found
        print(game_data["game_id"])

    return game_data

In [66]:
def getAllInternationalGames(page, home_team):
    """Download a player's international games page and parse every game row.

    Parameters
    ----------
    page : str
        URL of the page listing the player's international games.
    home_team : str
        Name of the team the player plays for; forwarded to getGameData.

    Returns
    -------
    list of dict
        One dict per game as produced by getGameData, with an additional
        "tournament" key holding the title of the most recent tournament
        header row seen above the game (None if there was none).

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    IndexError
        If the page does not contain the expected second class-less table.
    """
    # Fail fast on a hanging connection or an HTTP error page instead of
    # trying to parse whatever came back.
    response = requests.get(page, headers=headers, timeout=30)
    response.raise_for_status()
    pageSoup = BeautifulSoup(response.content, 'html.parser')

    # The games are read from the second <table> that has no class attribute.
    tables = pageSoup.find_all("table", class_=False)
    if len(tables) < 2:
        raise IndexError(
            "Expected at least two class-less tables on the page, found "
            + str(len(tables))
        )

    # Skip the first row, which is not wanted in the row list.
    rows = tables[1].find_all("tr")[1:]

    gamesList = []
    current_tournament = None
    tournament_cell_classes = ['zentriert', 'hauptlink', 'no-border-rechts']

    for table_row in rows:
        first_cell = table_row.find_next("td")
        if first_cell is not None and first_cell.get('class') == tournament_cell_classes:
            # Tournament header row: remember its title for the games below it.
            img_tag = table_row.find_next("img")
            current_tournament = img_tag.get("title") if img_tag is not None else None
        elif table_row.get("class") == []:
            # in this case, a row is a game played.
            game = getGameData(table_row, home_team)
            game["tournament"] = current_tournament
            gamesList.append(game)

    return gamesList

In [77]:
# Creates a dataframe using the specified data structure for each international game played by the player.
def createDataFrameFromGames(gamesList):
    """Build a DataFrame with one row per game and a fixed column layout.

    Parameters
    ----------
    gamesList : list of dict
        Game dicts; each must contain every column listed below (extra keys
        are ignored). The "date" values must be strings in dd.mm.yyyy format.

    Returns
    -------
    pandas.DataFrame
        The games in input order with "date" converted to datetime. An empty
        ``gamesList`` yields an empty frame that still has all columns.

    Raises
    ------
    KeyError
        If a game dict lacks one of the columns.
    """
    columns = [
        'game_id',
        'player_current_club',
        'tournament',
        'gameday',
        'venue_country',
        'venue_city',
        'venue',
        'date',
        'team',
        'opponent',
        'result',
        'player_position',
        'goal_amount',
        'assists_amount',
        'own_goals_amount',
        'substitute_in',
        'substitute_out',
        'yellow_card',
        'yellow_red_card',
        'red_card',
        'minutes_played',
    ]

    # Strict lookup on purpose: a missing key should fail loudly rather than
    # silently turn into NaN.
    rows = [{column: game[column] for column in columns} for game in gamesList]

    # Passing the column list keeps the layout stable even when there are no rows,
    # so the "date" conversion below also works for an empty input.
    df = pd.DataFrame(rows, columns=columns)
    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")
    print("Dataframes successfully created.")
    return df


In [68]:
# Scrape every international game listed for Cristiano Ronaldo.
# The URL is split over several literals for readability; the pieces
# concatenate to the exact page address.
ronaldo_page = (
    "https://www.transfermarkt.at/cristiano-ronaldo/nationalmannschaft"
    "/spieler/8198/verein_id/3300/hauptwettbewerb//wettbewerb_id/"
    "/start/2003-08-20/ende/2024-12-25/nurEinsatz/0/plus/1"
)
# Team name as it appears on the page; used to tell home games from away games.
ronaldo_team = "Portugal"
ronaldo_games = getAllInternationalGames(page=ronaldo_page, home_team=ronaldo_team)



In [73]:
# Scrape every international game listed for Lionel Messi.
# The URL is split over several literals for readability; the pieces
# concatenate to the exact page address.
messi_page = (
    "https://www.transfermarkt.at/lionel-messi/nationalmannschaft"
    "/spieler/28003/verein_id/3437/plus/1"
    "?hauptwettbewerb=&wettbewerb_id=&trainer_id="
    "&start=17.08.2005&ende=25.12.2024&nurEinsatz=1"
)
# Team name as it appears on the page; used to tell home games from away games.
messi_team = "Argentinien"
messi_games = getAllInternationalGames(page=messi_page, home_team=messi_team)

In [74]:
from datetime import datetime
def fillMissingClubInfo(gameList, timespans):
    """Fill in the club for games whose club is the "UnbekanntUnbekannt" placeholder.

    Each game dict is updated in place; nothing is returned.

    Parameters
    ----------
    gameList : list of dict
        Games with at least the keys "player_current_club" and "date"
        (date as a dd.mm.yyyy string).
    timespans : sequence of (start, end, club)
        Club periods with dd.mm.yyyy date strings. A game matches a period
        when start <= game date < end. Every period is checked, so if several
        match, the last matching one wins.
    """
    date_format = "%d.%m.%Y"
    placeholder = "UnbekanntUnbekannt"

    for entry in gameList:
        # Games that already carry a club are left untouched.
        if entry["player_current_club"] != placeholder:
            continue

        played_on = datetime.strptime(entry["date"], date_format)
        for span in timespans:
            period_start = datetime.strptime(span[0], date_format)
            period_end = datetime.strptime(span[1], date_format)
            if period_start <= played_on < period_end:
                entry["player_current_club"] = span[2]

In [75]:
# Club periods as (start, end, club) with dates in dd.mm.yyyy format.
# fillMissingClubInfo matches a game when start <= game date < end, so each
# period's end must equal the next period's start to leave no uncovered day.
ronaldo_club_timespans = [
    ("01.08.2002", "12.08.2003", "Sporting Lissabon"),
    ("12.08.2003", "01.07.2009", "Manchester United"),
    ("01.07.2009", "10.07.2018", "Real Madrid"),
    ("10.07.2018", "31.12.2021", "Juventus Turin"),
    ("31.12.2021", "22.11.2022", "Manchester United"),
    ("22.11.2022", "01.01.2023", "Vereinslos"),
    ("01.01.2023", "26.12.2024", "Al-Nassr"),
]

messi_club_timespans = [
    ("17.10.2004", "30.06.2021", "FC Barcelona"),
    ("30.06.2021", "10.08.2021", "Vereinslos"),
    # The exclusive end meets the next start so that 30.06.2023 is covered too.
    ("10.08.2021", "01.07.2023", "Paris Saint-Germain"),
    ("01.07.2023", "26.12.2024", "Inter Miami"),
]

# Some games lack information about the player's current club, so it is
# filled in here from the club periods above:
fillMissingClubInfo(ronaldo_games, ronaldo_club_timespans)
fillMissingClubInfo(messi_games, messi_club_timespans)


In [80]:
# Build the DataFrames that are saved later: Ronaldo's first, then Messi's.
df_ronaldo, df_messi = [
    createDataFrameFromGames(player_games)
    for player_games in (ronaldo_games, messi_games)
]
# Display Ronaldo's games as the cell output.
df_ronaldo

Dataframes successfully created.
Dataframes successfully created.


Unnamed: 0,game_id,player_current_club,tournament,gameday,venue_country,venue_city,venue,date,team,opponent,...,player_position,goal_amount,assists_amount,own_goals_amount,substitute_in,substitute_out,yellow_card,yellow_red_card,red_card,minutes_played
0,2266024,Manchester United,Freundschaftsspiele,,Portugal,Chaves,H,2003-08-20,Portugal,Kasachstan,...,RA,0,0,0,46',0,,,,0
1,2381478,Manchester United,Freundschaftsspiele,,Portugal,Lissabon,H,2003-10-11,Portugal,Albanien,...,RA,0,0,0,0,46',,,,0
2,1182089,Manchester United,Freundschaftsspiele,,Portugal,Faro,H,2004-02-18,Portugal,England,...,LA,0,0,0,46',0,,,,0
3,2328084,Manchester United,Freundschaftsspiele,,Portugal,Braga,H,2004-03-31,Portugal,Italien,...,LA,0,0,0,46',0,,,,0
4,2381457,Manchester United,Freundschaftsspiele,,Portugal,Coimbra,H,2004-04-28,Portugal,Schweden,...,RA,0,1,0,63',0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,4284225,Al-Nassr,UEFA Nations League A,Gruppe 1,Portugal,Lissabon,H,2024-09-05,Portugal,Kroatien,...,MS,1,0,0,0,88',,,,Gruppe 1
212,4284227,Al-Nassr,UEFA Nations League A,Gruppe 1,Portugal,Lissabon,H,2024-09-08,Portugal,Schottland,...,MS,1,0,0,46',0,,,,Gruppe 1
213,4284229,Al-Nassr,UEFA Nations League A,Gruppe 1,Polen,Warszawa,A,2024-10-12,Portugal,Polen,...,MS,1,0,0,0,63',,,,Gruppe 1
214,4284232,Al-Nassr,UEFA Nations League A,Gruppe 1,Schottland,Glasgow,A,2024-10-15,Portugal,Schottland,...,MS,0,0,0,0,0,,,,Gruppe 1


In [81]:
# Store both DataFrames as UTF-8 CSV files (without the index) in the data folder.
folder_name = "data"
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    # Only an already existing folder is expected here; any other failure
    # (e.g. missing permissions) is allowed to surface instead of being
    # reported as "already exists".
    print("Folder already exists")

# Build the paths from folder_name so the folder is defined in one place only.
df_messi.to_csv(os.path.join(folder_name, "messi_international_performance.csv"), index=False, encoding="utf-8")
df_ronaldo.to_csv(os.path.join(folder_name, "ronaldo_international_performance.csv"), index=False, encoding="utf-8")

Folder already exists
