# Data Acquisition for Club Goal Data from Transfermarkt

This notebook demonstrates how goal data for a player's club career is scraped from [Transfermarkt](https://www.transfermarkt.com/) using BeautifulSoup. It is used to retrieve all club goals for the players Lionel Messi & Cristiano Ronaldo from the following pages:

- [Lionel Messi Club Goals](https://www.transfermarkt.at/lionel-messi/alletore/spieler/28003)
- [Cristiano Ronaldo Club Goals](https://www.transfermarkt.com/cristiano-ronaldo/alletore/spieler/8198)

In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [2]:
# Headers sent with every HTTP request to the web server.
# The User-Agent mimics a desktop browser (Chrome on Linux).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

Each scraped game is first collected in the structure below. The games are then flattened into a dataframe with one row per goal, where the goal fields become the columns `goal_minute`, `goal_score` and `goal_type`.
```
{
    "game_id",
    "saison",
    "league",
    "gameday",
    "venue",
    "team",
    "team_table_position",
    "opponent",
    "opponent_table_position",
    "result",
    "player_position",
    "goals": [
        {
            "minute",
            "score",
            "goal_type"
        }
    ]
}
```

In [3]:
# Pattern for a league-table position as it appears in the row text, e.g. "(3.)".
# A raw string is used so that "\(" and "\." are regex escapes and not
# (invalid) string escape sequences.
_TABLE_POSITION_PATTERN = re.compile(r"\(([0-9]+)\.\)")


def _nextTag(tag, *args):
    """Return tag.find_next(*args), or None when there is no tag to start from."""
    return tag.find_next(*args) if tag is not None else None


def _findTablePosition(tag):
    """Return the table position (digits as a string) found in the children of tag, else None."""
    position = None
    if tag is None:
        return position
    for child in tag.children:
        matches = _TABLE_POSITION_PATTERN.findall(child.text.strip())
        if matches:
            # no early exit: a later child with a match overrides an earlier one
            position = matches[0]
    return position


def getGoalData(game):
    """Collect the game data and the first goal from the first table row of a game.

    The row is walked element by element with find / find_next, so the order
    of the lookups below follows the order of the cells in the row.

    Parameters
    ----------
    game : row element of the goals table. Presumably a BeautifulSoup Tag for a
        <tr>; only find(), find_next(), get(), .text and .children are used.

    Returns
    -------
    dict with the keys "saison", "league", "gameday", "venue", "team",
    "team_table_position", "opponent", "opponent_table_position", "result",
    "game_id", "player_position" and "goals". "goals" is a list holding one
    dict with the keys "minute", "score" and "goal_type". Every value that
    cannot be located in the row is None, so all keys are always present.

    Raises
    ------
    ValueError
        For an away game whose result text does not start with "<int>:<int>".
    """
    game_data = {
        "saison": None,
        "league": None,
        "gameday": None,
        "venue": None,
        "team": None,
        "team_table_position": None,
        "opponent": None,
        "opponent_table_position": None,
        "result": None,
        "game_id": None,
        "player_position": None,
    }
    goal_data = {"minute": None, "score": None, "goal_type": None}

    # get saison: the path segment that follows "saison_id" in the first link
    a_tag = game.find_next("a")
    if a_tag is not None:
        parts = a_tag.get("href", "").split('/')
        # parts[:-1] guarantees that a segment exists after "saison_id"
        if 'saison_id' in parts[:-1]:
            game_data["saison"] = parts[parts.index('saison_id') + 1]

    # get liga
    liga_tag = game.find("img")
    if liga_tag is not None:
        game_data["league"] = liga_tag.get("alt", "No alt attribute found")
    else:
        print("Found second goal in the same game.")

    # get gameday of the season
    gameday_cell = _nextTag(liga_tag)
    spieltag_tag = gameday_cell.find("a") if gameday_cell is not None else None
    if spieltag_tag is not None:
        game_data["gameday"] = spieltag_tag.text.strip()

    # get venue where game was played
    ort_tag = _nextTag(spieltag_tag)
    if ort_tag is not None:
        game_data["venue"] = ort_tag.text.strip()

    # get team for which player played
    team_tag = _nextTag(ort_tag, "a")
    if team_tag is not None:
        game_data["team"] = team_tag.get("title", "No team found")

    # get teams table position (None if the row shows none)
    game_data["team_table_position"] = _findTablePosition(_nextTag(team_tag, "td"))

    # get opponent for game played
    gegner_tag = _nextTag(_nextTag(team_tag), "img")
    if gegner_tag is not None:
        game_data["opponent"] = gegner_tag.get("alt", "No alt attribute found")

    # get opponent table position (None if the row shows none)
    game_data["opponent_table_position"] = _findTablePosition(_nextTag(gegner_tag, "span"))

    # get end result of the game: the second link after the opponent image
    ergebnis_tag = _nextTag(_nextTag(gegner_tag, "a"), "a")
    if ergebnis_tag is not None:
        result_text = ergebnis_tag.text.strip()
        game_data["result"] = result_text
        game_data["game_id"] = ergebnis_tag.get("id", "no id available")
        if game_data["venue"] == "A":
            # NOTE(review): for away games the larger number is always put
            # first, so an away defeat reads like a win - confirm this is intended.
            scoreboard = result_text.split(" ")
            x, y = map(int, scoreboard[0].split(":"))
            if x < y:
                x, y = y, x
            game_data["result"] = str(x) + ":" + str(y)
            # only the third space-separated token is kept as suffix
            if len(scoreboard) > 2:
                game_data["result"] = game_data["result"] + " " + scoreboard[2]

    # get position played by player
    position_tag = _nextTag(ergebnis_tag, "a")
    if position_tag is not None:
        game_data["player_position"] = position_tag.text.strip()

    # get minute in which goal happened
    tor_minute_tag = _nextTag(position_tag, "td")
    if tor_minute_tag is not None:
        goal_data["minute"] = tor_minute_tag.text.strip()

    # get score after goal
    spielstand_tag = _nextTag(tor_minute_tag)
    if spielstand_tag is not None:
        goal_data["score"] = spielstand_tag.text.strip()

    # get type of goal
    torart_tag = _nextTag(spielstand_tag)
    if torart_tag is not None:
        goal_data["goal_type"] = torart_tag.text.strip()

    game_data["goals"] = [goal_data]
    return game_data

In [4]:
# Gets the goal data for a consecutive row.
# This is used because transfermarkt uses multiple lines for consecutive goals within the same game.
def getConsecutiveGoalData(game):
    """Read minute, score and goal type from a follow-up row of the same game.

    Starting at the first <td> of the row, the next three elements returned by
    find_next() are read in order as minute, score after the goal, and goal type.

    Parameters
    ----------
    game : row element of the goals table. Presumably a BeautifulSoup Tag for a
        <tr>; only find(), find_next() and .text are used.

    Returns
    -------
    dict with the keys "minute", "score" and "goal_type". A value is None when
    the corresponding element cannot be located, so all keys are always present.
    """
    goal_data = {"minute": None, "score": None, "goal_type": None}

    tag = game.find("td")
    for key in ("minute", "score", "goal_type"):
        if tag is None:
            # nothing left to walk from; the remaining fields stay None
            break
        tag = tag.find_next()
        if tag is not None:
            goal_data[key] = tag.text.strip()

    return goal_data

In [5]:

def getAllPlayerGoals(page, timeout=30):
    """Download a goals page and return one dict per game with all its goals.

    Parameters
    ----------
    page : URL of the page to scrape.
    timeout : seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of game dicts as produced by getGoalData; further goals of the same
    game are appended to the game's "goals" list. The list is empty when the
    page contains no matching rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    pageTree = requests.get(page, headers=headers, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page into an empty result.
    pageTree.raise_for_status()
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    gamesList = []
    # get all rows for the games
    games = pageSoup.find_all("tr", class_=True)

    # Multiple goals in one game are listed in consecutive rows. The class
    # (row color) of a row is compared with the previous row's: the same value
    # means the row belongs to the same game.
    lastColor = None
    lastGame = None

    for game in games:
        color = game.get("class", "No alt attribute found")
        # row has goal in same game
        if lastGame is not None and color == lastColor:
            lastGame["goals"].append(getConsecutiveGoalData(game))
            continue
        # new game: store the finished one (there is none before the first row)
        if lastGame is not None:
            gamesList.append(lastGame)
        lastGame = getGoalData(game)
        lastColor = color

    # store last game from list; skipped when the page had no rows at all
    if lastGame is not None:
        gamesList.append(lastGame)
    print("Games loaded.")
    return gamesList


In [6]:
def createDataFrameFromGames(gamesList):
    """Flatten a list of game dicts into a dataframe with one row per goal.

    Parameters
    ----------
    gamesList : list of dicts, each holding the game fields listed below and a
        "goals" list of dicts with the keys "minute", "score" and "goal_type".

    Returns
    -------
    pandas.DataFrame whose rows repeat the game fields for every goal and add
    the columns "goal_minute", "goal_score" and "goal_type".
    """
    # fields copied unchanged from the game into every goal row
    game_columns = (
        'game_id',
        'saison',
        'league',
        'gameday',
        'venue',
        'team',
        'team_table_position',
        'opponent',
        'opponent_table_position',
        'result',
        'player_position',
    )
    # (dataframe column, key inside the goal dict)
    goal_columns = (
        ('goal_minute', 'minute'),
        ('goal_score', 'score'),
        ('goal_type', 'goal_type'),
    )

    records = [
        {
            **{column: game[column] for column in game_columns},
            **{column: goal[key] for column, key in goal_columns},
        }
        for game in gamesList
        for goal in game['goals']
    ]

    frame = pd.DataFrame(records)
    print("Dataframes successfully created.")
    return frame



In [7]:
def storeData(df, filename):
    """Write df as a UTF-8 CSV file (without index) into the local "data" folder.

    The folder is created when it does not exist yet. Errors while creating
    the folder (e.g. missing permissions) are raised instead of being reported
    as "Folder already exists".

    Parameters
    ----------
    df : pandas.DataFrame to store.
    filename : name of the CSV file inside the "data" folder.
    """
    folder_name = "data"
    folder_existed = os.path.isdir(folder_name)
    # exist_ok=True covers the case that the folder appears between the check and the call
    os.makedirs(folder_name, exist_ok=True)
    if folder_existed:
        print("Folder already exists")
    else:
        print("Folder created for storing goal data")

    df.to_csv(os.path.join(folder_name, filename), index=False, encoding="utf-8")
    print("Stored goal data in '" + filename + "'.")

In [8]:
# Transfermarkt pages listing all club goals of Lionel Messi and Cristiano Ronaldo.
page_messi = "https://www.transfermarkt.at/lionel-messi/alletore/spieler/28003"
page_ronaldo = "https://www.transfermarkt.at/cristiano-ronaldo/alletore/spieler/8198"

# Scrape the data from the web: Messi's page first, then Ronaldo's.
games_messi, games_ronaldo = [
    getAllPlayerGoals(player_page) for player_page in (page_messi, page_ronaldo)
]

Games loaded.
Games loaded.


In [9]:
# Turn the scraped games of each player into a dataframe (one row per goal)
# that can be stored as a .csv; Messi is converted first, then Ronaldo.
df_messi, df_ronaldo = [
    createDataFrameFromGames(player_games) for player_games in (games_messi, games_ronaldo)
]

Dataframes successfully created.
Dataframes successfully created.


In [13]:
# Messi's data contains goals scored for the B team of FC Barcelona. These do not
# count as professional football, so only rows of other teams are kept.
df_messi = df_messi.loc[~df_messi["team"].eq("FC Barcelona B")]

In [14]:
# Combine the datasets.
# assign() returns a new frame instead of writing a column into the existing one.
# df_messi may be a filtered slice of another frame at this point, and setting a
# column on such a slice in place triggers pandas' SettingWithCopyWarning.
df_messi = df_messi.assign(player_name="Lionel Messi")
df_ronaldo = df_ronaldo.assign(player_name="Christiano Ronaldo")

# set player name as first column
df = pd.concat([df_messi, df_ronaldo], ignore_index=True)
columns = ['player_name'] + [col for col in df.columns if col != 'player_name']
df = df[columns]

In [None]:
# Split the scraped goal time (e.g. "90'+1") into regular time and added time
# for better visualisation: "goal_minute" keeps the part before the apostrophe
# as integer, "added_time" holds the digits after "+" (0 when there are none).
#
# The conversion is skipped when "goal_minute" is already numeric, so the cell
# can be re-run: an integer column has no .str accessor and the added time
# could no longer be recovered from it.
if not pd.api.types.is_numeric_dtype(df["goal_minute"]):
    df["added_time"] = df["goal_minute"].str.extract(r"\+(\d+)")[0].fillna(0).astype(int)  # Extract added time
    df["goal_minute"] = df["goal_minute"].str.split("'").str[0].astype(int)  # Extract main minute

# show a preview instead of dumping the whole frame
df.head()

90'+1
test
34'
a
51'
a
50'
a
82'
a
75'
a
83'
a
42'
a
84'
a
59'
a
36'
a
89'
a
80'
a
11'
a
28'
a
90'+1
test
86'
a
45'
a
29'
a
45'
a
39'
a
80'
a
43'
a
57'
a
34'
a
51'
a
82'
a
72'
a
79'
a
5'
a
10'
a
50'
a
66'
a
19'
a
80'
a
41'
a
81'
a
58'
a
18'
a
79'
a
44'
a
5'
a
70'
a
85'
a
89'
a
90'
a
87'
a
90'+2
test
8'
a
4'
a
19'
a
61'
a
86'
a
51'
a
49'
a
78'
a
90'+2
test
89'
a
11'
a
57'
a
79'
a
84'
a
21'
a
49'
a
76'
a
65'
a
81'
a
30'
a
81'
a
31'
a
40'
a
26'
a
9'
a
38'
a
18'
a
23'
a
35'
a
75'
a
55'
a
70'
a
50'
a
68'
a
78'
a
15'
a
90'+3
test
23'
a
63'
a
25'
a
80'
a
86'
a
64'
a
26'
a
80'
a
86'
a
55'
a
110'
a
36'
a
45'
a
75'
a
85'
a
90'+1
test
54'
a
7'
a
84'
a
42'
a
66'
a
56'
a
80'
a
82'
a
13'
a
56'
a
4'
a
65'
a
78'
a
67'
a
21'
a
37'
a
42'
a
88'
a
32'
a
18'
a
87'
a
16'
a
90'+3
test
5'
a
61'
a
74'
a
25'
a
43'
a
90'+1
test
3'
a
22'
a
45'
a
12'
a
20'
a
18'
a
90'+1
test
41'
a
65'
a
4'
a
63'
a
31'
a
23'
a
67'
a
58'
a
83'
a
16'
a
36'
a
67'
a
63'
a
64'
a
82'
a
46'
a
86'
a
51'
a
44'
a
63'
a
73'
a
37'
a
33'
a
9'
a

In [12]:
# Store the combined goal data of both players as a CSV file.
storeData(df=df, filename="player_club_goals.csv")

Folder already exists
Stored goal data in 'player_club_goals.csv.
