# Data Acquisition of Best La Liga Scorer Data from Transfermarkt

This notebook demonstrates how data on La Liga's all-time top goal scorers is scraped from [Transfermarkt](https://www.transfermarkt.com). The source page is:

- [Best Goal Scorers of La Liga](https://www.transfermarkt.at/laliga/ewigetorschuetzen/wettbewerb/ES1)

In [8]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [11]:
# HTTP headers sent with every request. A browser-like User-Agent is supplied
# instead of the default one used by `requests`.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

Each player scraped from the top scorer table is stored as a record with the following structure:

```
{
    name,
    appearances,
    minutes_played,
    goals
}
```

In [186]:
def getRowData(row):
    """Extract one player's record from a row of the top scorer table.

    The cells are located by walking forward from ``row`` with ``find_next``:
    the first anchor carries the player name in its ``title`` attribute, the
    second anchor is skipped (club link), the third anchor holds the number of
    appearances, and the following three ``td`` cells hold minutes played,
    minutes per goal (skipped) and goals.

    Parameters
    ----------
    row : element exposing ``find_next`` (e.g. a BeautifulSoup tag)
        The table row to start the walk from.

    Returns
    -------
    dict or None
        ``{"name", "appearances", "minutes_played", "goals"}`` when every
        expected tag is found; ``None`` when any tag in the chain is missing,
        so callers never receive a partially filled record.

    Raises
    ------
    ValueError
        If one of the numeric cells does not contain an integer.
    """
    # Anchors in document order: player name -> club -> appearances.
    name_tag = row.find_next("a")
    if name_tag is None:
        return None

    club_tag = name_tag.find_next("a")  # only used to reach the next anchor
    if club_tag is None:
        return None

    appearance_tag = club_tag.find_next("a")
    if appearance_tag is None:
        return None

    # Cells in document order: minutes played -> minutes per goal -> goals.
    minutes_tag = appearance_tag.find_next("td")
    if minutes_tag is None:
        return None

    minutes_per_goal_tag = minutes_tag.find_next("td")  # value itself is not stored
    if minutes_per_goal_tag is None:
        return None

    goals_tag = minutes_per_goal_tag.find_next("td")
    if goals_tag is None:
        return None

    return {
        "name": name_tag.get("title", "No title available"),
        "appearances": int(appearance_tag.text.strip()),
        # "." is removed before parsing (e.g. "42.148" -> 42148).
        "minutes_played": int(minutes_tag.text.strip().replace(".", "")),
        "goals": int(goals_tag.text.strip()),
    }

In [184]:
def getTopPlayers(page):
    """Download a Transfermarkt top scorer page and parse its player table.

    Parameters
    ----------
    page : str
        URL of the page to scrape.

    Returns
    -------
    list of dict, or None
        One record per table row as produced by ``getRowData``; rows for which
        ``getRowData`` returns ``None`` are skipped. ``None`` is returned when
        the page contains no ``<table class="items">`` or no ``<tbody>`` after it.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not complete within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    pageTree = requests.get(page, headers=headers, timeout=30)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    table = pageSoup.find("table", class_="items")
    if not table:
        return None
    tbody = table.find_next("tbody")
    if not tbody:
        return None

    top_players = []
    # Iterate over the direct <tr> children only: this ignores the whitespace
    # text nodes between rows and any rows of tables nested inside a cell.
    for row in tbody.find_all("tr", recursive=False):
        player = getRowData(row)
        if player is not None:
            top_players.append(player)

    return top_players


In [187]:
# URL of Transfermarkt's all-time La Liga top scorer list.
page = "https://www.transfermarkt.at/laliga/ewigetorschuetzen/wettbewerb/ES1"
# Scrape the page. `data` is a list built from the table rows, or None when
# getTopPlayers cannot find the table on the page.
data = getTopPlayers(page)

[{'name': 'Lionel Messi',
  'appearances': 520,
  'minutes_played': 42148,
  'goals': 474},
 {'name': 'Cristiano Ronaldo',
  'appearances': 292,
  'minutes_played': 25113,
  'goals': 311},
 {'name': 'Telmo Zarra',
  'appearances': 277,
  'minutes_played': 24895,
  'goals': 254},
 {'name': 'Karim Benzema',
  'appearances': 439,
  'minutes_played': 32047,
  'goals': 238},
 {'name': 'Hugo Sánchez',
  'appearances': 347,
  'minutes_played': 30310,
  'goals': 234},
 {'name': 'Raúl', 'appearances': 550, 'minutes_played': 44079, 'goals': 228},
 {'name': 'Alfredo di Stéfano',
  'appearances': 329,
  'minutes_played': 29561,
  'goals': 227},
 {'name': 'César Rodríguez',
  'appearances': 349,
  'minutes_played': 31410,
  'goals': 221},
 {'name': 'Quini', 'appearances': 448, 'minutes_played': 35815, 'goals': 219},
 {'name': 'Pahiño', 'appearances': 278, 'minutes_played': 24925, 'goals': 214},
 {'name': 'Antoine Griezmann',
  'appearances': 507,
  'minutes_played': 39051,
  'goals': 197},
 {'name'

In [188]:
# Creates a dataframe using the specified data structure for top scorers
def createDataframe(list):
    """Build a DataFrame of top scorers from a sequence of player records.

    Parameters
    ----------
    list : iterable of dict
        Player records; each must provide the keys ``name``, ``appearances``,
        ``minutes_played`` and ``goals``. Any additional keys are ignored.
        (The parameter name shadows the builtin ``list``; it is kept so that
        existing callers continue to work.)

    Returns
    -------
    pandas.DataFrame
        One row per player with exactly the four columns above, in that order.
        The columns are present even when the input is empty.

    Raises
    ------
    KeyError
        If a record lacks one of the required keys.
    """
    columns = ['name', 'appearances', 'minutes_played', 'goals']
    rows = [{column: player[column] for column in columns} for player in list]

    # Passing `columns` explicitly keeps the schema stable for empty input,
    # where pandas could not infer the column names from the records.
    df = pd.DataFrame(rows, columns=columns)
    print("Dataframes successfully created.")
    return df


In [191]:
# Build the top scorer DataFrame from the scraped records so it can be saved to disk
df = createDataframe(data)

Dataframes successfully created.


In [192]:
# Store the scraped data as a CSV file inside the output folder.
folder_name = "data"
try:
    os.makedirs(folder_name, exist_ok=False)
    print("Folder created for storing goal data")
except FileExistsError:
    # Only an already-present folder is expected here; any other OSError
    # (e.g. missing permissions) propagates instead of being misreported.
    print("Folder already exists")


# Build the path from folder_name so the folder and file location cannot drift apart.
df.to_csv(os.path.join(folder_name, "laliga_top_scorer.csv"), index=False, encoding="utf-8")

Folder already exists
