# Data Acquisition for Club Goal Data from Transfermarkt

This notebook demonstrates how goal data for a player's club career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup. It is used to retrieve all club goals for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi](https://www.transfermarkt.at/lionel-messi/alletore/spieler/28003)
- [Cristiano Ronaldo](https://www.transfermarkt.at/cristiano-ronaldo/alletore/spieler/8198)

In [10]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [11]:
# Headers sent with every HTTP request to the web server.
# A browser-like User-Agent is used (presumably because the site rejects the
# default python-requests agent -- TODO confirm).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

'game_id': game_id,
                'saison': game['saison'],
                'league': game['league'],
                'gameday': game['gameday'],
                'venue': game['venue'],
                'team': game['team'],
                'team_table_position': game['team_table_position'],
                'opponent': game['opponent'],
                'opponent_table_position': game['opponent_table_position'],
                'result': game['result'],
                'player_position': game['player_position'],
                'goal_minute': goal['minute'],
                'goal_score': goal['score'],
                'goal_type': goal['goal_type']


The scraped data has the following structure for every game. It is then flattened into a dataframe with one row per goal (the `game_id` is assigned during that step).
```
{
    "game_id",
    "saison",
    "league",
    "gameday",
    "venue",
    "team",
    "team_table_position",
    "opponent",
    "opponent_table_position",
    "result",
    "player_position",
    "goals": [
        {
            "minute",
            "score",
            "goal_type"
        }
    ]
}
````

In [12]:
# helper: follows tag.find_next(...) but tolerates a missing starting tag.
def _nextTag(tag, *args):
    """Return ``tag.find_next(*args)``, or None when ``tag`` is None."""
    if tag is None:
        return None
    return tag.find_next(*args)


# helper: reads a table position written like "(3.)" from the children of a tag.
def _findTablePosition(tag):
    """Return the table position found in the children of ``tag``, or None.

    Each child's stripped text is searched for a number written as "(<n>.)".
    When several children match, the last one wins. None is returned when
    ``tag`` is None or no child matches.
    """
    if tag is None:
        return None
    position = None
    for child in tag.children:
        matches = re.findall(r"\(([0-9]+)\.\)", child.text.strip())
        if matches:
            position = matches[0]
    return position


# this function takes a row from the games played and fetches all relevant game and goal data.
def getGoalData(game):
    """Extract the game data and the first goal from a table row.

    Parameters
    ----------
    game : row element of the goals table. Only ``find``, ``find_next``,
        ``get``, ``children`` and ``text`` are used on it and on the elements
        reached from it.

    Returns
    -------
    dict with the keys "saison", "league", "gameday", "venue", "team",
    "team_table_position", "opponent", "opponent_table_position", "result",
    "player_position" and "goals". "goals" is a list holding one dict with the
    keys "minute", "score" and "goal_type".

    Every key is always present. A value that cannot be located in the row is
    None, so an unexpected row layout no longer raises AttributeError,
    ValueError or (later, when the dataframe is built) KeyError.
    """
    game_data = {
        "saison": None,
        "league": None,
        "gameday": None,
        "venue": None,
        "team": None,
        "team_table_position": None,
        "opponent": None,
        "opponent_table_position": None,
        "result": None,
        "player_position": None,
    }
    goal_data = {"minute": None, "score": None, "goal_type": None}

    # get saison: the path segment following "saison_id" in the first link
    a_tag = game.find_next("a")
    if a_tag:
        parts = a_tag.get("href", "No href available").split('/')
        # only read the segment when "saison_id" exists and is not the last one
        if 'saison_id' in parts[:-1]:
            game_data["saison"] = parts[parts.index('saison_id') + 1]

    # get liga
    liga_tag = game.find("img")
    if liga_tag:
        game_data["league"] = liga_tag.get("alt", "No alt attribute found")
    else:
        print("Found second goal in the same game.")

    # get gameday of the season
    gameday_cell = _nextTag(liga_tag)
    spieltag_tag = gameday_cell.find("a") if gameday_cell is not None else None
    if spieltag_tag:
        game_data["gameday"] = spieltag_tag.text.strip()

    # get venue where game was played
    ort_tag = _nextTag(spieltag_tag)
    if ort_tag:
        game_data["venue"] = ort_tag.text.strip()

    # get team for which player played
    team_tag = _nextTag(ort_tag, "a")
    if team_tag:
        game_data["team"] = team_tag.get("title", "No team found")

    # get teams table position (None if the row does not show one)
    game_data["team_table_position"] = _findTablePosition(_nextTag(team_tag, "td"))

    # get opponent for game played
    gegner_tag = _nextTag(_nextTag(team_tag), "img")
    if gegner_tag:
        game_data["opponent"] = gegner_tag.get("alt", "No alt attribute found")

    # get opponent table position (None if the row does not show one)
    game_data["opponent_table_position"] = _findTablePosition(_nextTag(gegner_tag, "span"))

    # get end result of the game: second link after the opponent image
    ergebnis_tag = _nextTag(_nextTag(gegner_tag, "a"), "a")
    if ergebnis_tag:
        game_data["result"] = ergebnis_tag.text.strip()

    # get position played by player
    position_tag = _nextTag(ergebnis_tag, "a")
    if position_tag:
        game_data["player_position"] = position_tag.text.strip()

    # get minute in which goal happened
    tor_minute_tag = _nextTag(position_tag, "td")
    if tor_minute_tag:
        goal_data["minute"] = tor_minute_tag.text.strip()

    # get score after goal
    spielstand_tag = _nextTag(tor_minute_tag)
    if spielstand_tag:
        goal_data["score"] = spielstand_tag.text.strip()

    # get type of goal
    torart_tag = _nextTag(spielstand_tag)
    if torart_tag:
        goal_data["goal_type"] = torart_tag.text.strip()

    game_data["goals"] = [goal_data]
    return game_data

In [13]:
# gets the goal data for a consecutive row
# this is used because transfermarkt uses multiple lines for consecutive goals within the same game.
def getConsecutiveGoalData(game):
    """Extract a further goal of the same game from a follow-up table row.

    Parameters
    ----------
    game : row element; the goal fields are read from the elements that
        follow its first "td".

    Returns
    -------
    dict with the keys "minute", "score" and "goal_type". Every key is always
    present; a value is None when the corresponding element is missing, so a
    row with an unexpected layout no longer raises AttributeError.
    """
    goal_data = {"minute": None, "score": None, "goal_type": None}

    # get minute of the goal: the element right after the first cell
    first_cell = game.find("td")
    time_tag = first_cell.find_next() if first_cell is not None else None
    if time_tag:
        goal_data["minute"] = time_tag.text.strip()

    # get score after goal scored
    score_tag = time_tag.find_next() if time_tag is not None else None
    if score_tag:
        goal_data["score"] = score_tag.text.strip()

    # get goal type
    goal_type_tag = score_tag.find_next() if score_tag is not None else None
    if goal_type_tag:
        goal_data["goal_type"] = goal_type_tag.text.strip()

    return goal_data

In [14]:

def getAllPlayerGoals(page):
    """Download a player's goal page and group its table rows into games.

    Parameters
    ----------
    page : URL of the page to scrape.

    Returns
    -------
    list of game dicts as produced by ``getGoalData``; further goals of the
    same game are appended to that game's "goals" list. The list is empty
    when the page contains no matching rows.

    Raises
    ------
    requests.HTTPError if the server answers with an error status, and the
    usual requests exceptions on connection problems or timeout.
    """
    # a timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(page, headers=headers, timeout=30)
    # fail loudly instead of parsing an error page as if it were the goal table
    response.raise_for_status()
    pageSoup = BeautifulSoup(response.content, 'html.parser')
    gamesList = []

    # Multiple goals can occur in a single game and each goal has its own row.
    # Rows are grouped by their class (row color): if two consecutive rows
    # have the same color, they belong to the same game.
    lastColor = None
    lastGame = None

    # get all rows for the games
    for game in pageSoup.find_all("tr", class_=True):
        color = game.get("class", "No alt attribute found")
        # row has a further goal in the same game
        if lastGame is not None and color == lastColor:
            lastGame["goals"].append(getConsecutiveGoalData(game))
            continue
        # a new game starts: store the finished one (nothing to store for the first row)
        if lastGame is not None:
            gamesList.append(lastGame)
        lastGame = getGoalData(game)
        lastColor = color

    # store last game from list; skipped when the page had no rows at all
    if lastGame is not None:
        gamesList.append(lastGame)
    print("Games loaded.")
    return gamesList


In [15]:
def createDataFrameFromGames(gamesList):
    """Flatten the scraped games into a dataframe with one row per goal.

    Parameters
    ----------
    gamesList : list of game dicts, each holding the game fields and a
        "goals" list of dicts with the keys "minute", "score", "goal_type".

    Returns
    -------
    pandas.DataFrame in which every goal is a row. ``game_id`` is the
    1-based position of the game in ``gamesList``, so all goals of one game
    share the same id.
    """
    # game-level fields that are copied unchanged into every goal row
    game_fields = (
        'saison',
        'league',
        'gameday',
        'venue',
        'team',
        'team_table_position',
        'opponent',
        'opponent_table_position',
        'result',
        'player_position',
    )

    records = []
    for game_id, game in enumerate(gamesList, start=1):
        for goal in game['goals']:
            record = {'game_id': game_id}
            for field in game_fields:
                record[field] = game[field]
            record['goal_minute'] = goal['minute']
            record['goal_score'] = goal['score']
            record['goal_type'] = goal['goal_type']
            records.append(record)

    df = pd.DataFrame(records)
    print("Dataframes successfully created.")
    return df



In [20]:
def scrapePlayerData(page, filename):
    """Scrape one player's goals, store them as CSV and return the dataframe.

    Parameters
    ----------
    page : URL passed on to ``getAllPlayerGoals``.
    filename : name of the CSV file written inside the "data" folder.

    Returns
    -------
    The dataframe built by ``createDataFrameFromGames``.
    """
    gamesList = getAllPlayerGoals(page)
    df = createDataFrameFromGames(gamesList)

    folder_name = "data"
    try:
        os.makedirs(folder_name, exist_ok=False)
        print("Folder created for storing goal data")
    except FileExistsError:
        # only the "already there" case is expected; any other failure
        # (e.g. missing permissions) must surface instead of being mislabeled
        print("Folder already exists")

    # build the path from folder_name so folder and file location cannot drift apart
    df.to_csv(os.path.join(folder_name, filename), index=False, encoding="utf-8")
    print("Stored goal data in '" + filename + "'.")
    return df

In [21]:
# transfermarkt "all club goals" pages of both players
page_messi = "https://www.transfermarkt.at/lionel-messi/alletore/spieler/28003"
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/alletore/spieler/8198"

# scrape each page, write one CSV per player and keep the dataframes
df_messi = scrapePlayerData(page_messi, "messi_club_goals.csv")
df_ronaldo = scrapePlayerData(page_ronaldo, "ronaldo_club_goals.csv")

# `page` still ends up holding the last scraped URL
page = page_ronaldo

Games loaded.
Dataframes successfully created.
Folder already exists
Stored goal data in 'messi_club_goals.csv.
Games loaded.
Dataframes successfully created.
Folder already exists
Stored goal data in 'ronaldo_club_goals.csv.


In [22]:
# preview the scraped Messi goals (the rendered table below shows only the first and last rows)
df_messi

Unnamed: 0,game_id,saison,league,gameday,venue,team,team_table_position,opponent,opponent_table_position,result,player_position,goal_minute,goal_score,goal_type
0,1,2004,Segunda División B - Grupo III (-20/21),2,H,FC Barcelona B,1,FC Girona,5,1:2,LA,65',1:2,Linksschuss
1,2,2004,Segunda División B - Grupo III (-20/21),4,H,FC Barcelona B,1,Espanyol Barcelona B,17,2:0,LA,88',2:0,Linksschuss
2,3,2004,Segunda División B - Grupo III (-20/21),10,A,FC Barcelona B,1,Villajoyosa CF,8,2:1,OM,2',0:1,Ohne weitere Angaben
3,4,2004,Segunda División B - Grupo III (-20/21),14,A,FC Barcelona B,1,UE Figueres,15,2:2,OM,20',0:1,Ohne weitere Angaben
4,5,2004,Segunda División B - Grupo III (-20/21),17,H,FC Barcelona B,1,CD Alcoyano,11,2:0,MS,40',2:0,Linksschuss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,486,2023,Major League Soccer,35,A,Inter Miami CF,,Columbus Crew,,2:3,RA,45'+5,0:2,Direkter Freistoß
740,487,2023,Major League Soccer,37,H,Inter Miami CF,,New England Revolution,,6:2,RA,78',4:2,Linksschuss
741,487,2023,Major League Soccer,37,H,Inter Miami CF,,New England Revolution,,6:2,RA,81',5:2,Linksschuss
742,487,2023,Major League Soccer,37,H,Inter Miami CF,,New England Revolution,,6:2,RA,89',6:2,Linksschuss
