<h1><center>WebScraping Football</center></h1>


This notebook uses the BeautifulSoup web-scraping library to scrape data from the popular football transfer website Transfermarkt. It is intended as an introduction to web scraping football data and demonstrates a very easy way to scrape football transfer data.

In [None]:
#Import Libraries
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup


In [None]:
#Send a browser-like User-Agent so the website treats the request as coming
#from a browser and not a scraping tool
headers = {'User-Agent':
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

#This is the page to be scraped (saison_id=2022)
page = 'https://www.transfermarkt.co.uk/transfers/transferrekorde/statistik/top/plus/0/galerie/0?saison_id=2022'
#A timeout stops the cell from hanging forever if the server never answers
pageTree = requests.get(page, headers=headers, timeout=30)
#Fail loudly on an HTTP error instead of silently parsing an error page
pageTree.raise_for_status()
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

In [None]:
#Collect every td tag carrying the class "hauptlink" into the
#variable Players
Players = pageSoup.find_all("td", class_="hauptlink")

In [None]:
#CHECKS
#Show the type of the variable Players, then the number of elements
#in the result set (one value per line)
print(type(Players), len(Players), sep="\n")

In [None]:
#Display the whole Players result set to see what it looks like
Players

In [None]:
#Display the first element of the Players result set to see what a
#single element looks like
Players[0]

In [None]:
#Text value (the .text attribute) of the first element in Players
Players[0].text

In [None]:
#Print each index next to its element so the repeating pattern in
#the Players list can be found
for position, cell in enumerate(Players):
    print(position, cell)

In [None]:
#Collect every td tag whose class is "rechts hauptlink"
Fees = pageSoup.find_all("td", class_="rechts hauptlink")

In [None]:
#Print the type of the variable Fees
print(type(Fees))

In [None]:
#Text value (the .text attribute) of the first element of Fees
Fees[0].text

In [None]:
#Print each index next to its element in the Fees result set
for position, cell in enumerate(Fees):
    print(position, cell)

In [None]:
#Create two lists, one for the Players and one for the Fees
PlayersList = [] #Store names of Players
FeesList = [] #Store the fees of those Players

#Walk through the fees; the matching player name sits at every third
#index of Players, so fee number n pairs with Players[n * 3]
for position, fee_cell in enumerate(Fees):
    PlayersList.append(Players[position * 3].text)
    FeesList.append(fee_cell.text)


In [None]:
#Build a DataFrame with one column for the player names and one for
#their transfer fees
df_transfer = pd.DataFrame(
    data={
        "Players": PlayersList,
        "Transfer Fees (£)": FeesList,
    }
)

In [None]:
#Show the first five rows of the dataframe (the data isn't clean yet)
df_transfer.head()

In [None]:
#Remove the newline character, '£' and 'm' from the Dataframe
df_transfer.replace('\n', '', regex=True, inplace=True)
df_transfer.replace('£', '', regex=True, inplace=True)
#Assign the result back to the column: an inplace replace on a selected
#column is chained assignment, which newer pandas versions warn about and
#which does not update the dataframe when Copy-on-Write is enabled
df_transfer['Transfer Fees (£)'] = df_transfer['Transfer Fees (£)'].replace('m', '', regex=True)


In [None]:
#Applies the float function to each row in the Transfer Fees column
df_transfer['Transfer Fees (£)'] = df_transfer['Transfer Fees (£)'].apply(float)

In [None]:
#Check the data type of each column in the Dataframe
df_transfer.dtypes

In [None]:
#Print the summary information of the Dataframe
df_transfer.info()

In [None]:
#Display the cleaned dataframe
df_transfer

In [None]:
#Second url, for the 2021 Transfer data (saison_id=2021)
page2 = 'https://www.transfermarkt.co.uk/transfers/transferrekorde/statistik/top/plus/0/galerie/0?saison_id=2021'
#A timeout stops the cell from hanging forever if the server never answers
pageTree2 = requests.get(page2, headers=headers, timeout=30)
#Fail loudly on an HTTP error instead of silently parsing an error page
pageTree2.raise_for_status()
pageSoup2 = BeautifulSoup(pageTree2.content, 'html.parser')

In [None]:
#Find all the td tags with the class "hauptlink" on the second page
Players2 = pageSoup2.find_all("td", "hauptlink")

In [None]:
#Text value of the first element in Players2
Players2[0].text

In [None]:
#Find all the td tags with the class "rechts hauptlink" on the second page
Fees2 = pageSoup2.find_all("td", "rechts hauptlink")

In [None]:
#Text value of the first element in Fees2
Fees2[0].text

In [None]:
#Create two lists for the Players2 and Fees2 resultsets
PlayersList2 = []
FeesList2 = []

#The player name matching fee number n sits at Players2[n * 3]
for position, fee_cell in enumerate(Fees2):
    PlayersList2.append(Players2[position * 3].text)
    FeesList2.append(fee_cell.text)

In [None]:
#Display the list PlayersList2 to check its contents
PlayersList2

In [None]:
#Display the list FeesList2 to check its contents
FeesList2

In [None]:
#Create DataFrame of the second Players list and Fees list
df_transfer2 = pd.DataFrame({"Players":PlayersList2, "Transfer Fees (£)":FeesList2})

In [None]:
#Remove the newline character, '£' and 'm' from the second Dataframe
df_transfer2.replace('\n', '', regex=True, inplace=True)
df_transfer2.replace('£', '', regex=True, inplace=True)
#Assign the result back to the column: an inplace replace on a selected
#column is chained assignment, which newer pandas versions warn about and
#which does not update the dataframe when Copy-on-Write is enabled
df_transfer2['Transfer Fees (£)'] = df_transfer2['Transfer Fees (£)'].replace('m', '', regex=True)

In [None]:
#Applies the float function to each row in the Transfer Fees column of
#the second dataframe
df_transfer2['Transfer Fees (£)'] = df_transfer2['Transfer Fees (£)'].apply(float)

In [None]:
#Display the second dataframe
df_transfer2

In [None]:
#We are going to write the above code into a function
#Helper function to populate lists
def populateLists(players, playerList, fees, feeList):
    """Append player names and their fees to the two given lists.

    For every element of ``fees`` the ``.text`` of that element is appended
    to ``feeList`` and the ``.text`` of the matching element of ``players``
    is appended to ``playerList``. Both lists are modified in place and
    nothing is returned.
    """
    for position, fee_cell in enumerate(fees):
        #players name is in every third index
        name_cell = players[position * 3]
        playerList.append(name_cell.text)
        feeList.append(fee_cell.text)
        
#Helper function to clean a Transfer dataframe
def cleanTransferDataFrame(df):
    """Strip the scraped formatting from a transfer dataframe.

    Newline characters and '£' are removed from every column, 'm' is removed
    from the 'Transfer Fee (£)' column and that column is converted to float.

    The dataframe is modified in place and also returned.

    Raises ValueError if a fee cannot be converted to float after cleaning.
    """
    df.replace('\n', '', regex=True, inplace=True)
    df.replace('£', '', regex=True, inplace=True)
    #Assign the cleaned column back instead of calling replace(inplace=True)
    #on the selected column: that is chained assignment, which newer pandas
    #versions warn about and which does not update df under Copy-on-Write
    fees = df['Transfer Fee (£)'].replace('m', '', regex=True)
    df['Transfer Fee (£)'] = fees.apply(float)

    return df
    

def scrapeTransferPage(year):
    """Scrape the Transfermarkt record-transfers page for one season.

    Parameters:
        year: the season to request, as an int or a str; it is appended to
            the url as the ``saison_id`` value.

    Returns:
        A dataframe with the columns 'Players' and 'Transfer Fee (£)',
        cleaned by cleanTransferDataFrame.

    Raises:
        Exception: if ``year`` is neither int nor str, if the page answers
            with a 404, or if no players or fees were found on the page.
        requests.HTTPError: for any other HTTP error status.
    """
    #bool is a subclass of int but is not a meaningful year, so reject it
    if isinstance(year, bool) or not isinstance(year, (int, str)):
        raise Exception("Wrong data type used: use str or int")
    year = str(year)

    page = 'https://www.transfermarkt.co.uk/transfers/transferrekorde/statistik/top/plus/0/galerie/0?saison_id='+year
    #A timeout stops the call from hanging forever if the server never answers
    pageTree = requests.get(page, headers={'User-Agent':
          'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'},
          timeout=30)

    #Check the response before parsing it
    if pageTree.status_code == 404:
        raise Exception("404 Error: Web Page is not available")
    #Any other HTTP error is reported as such instead of being parsed
    #as if it were the transfer table
    pageTree.raise_for_status()

    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    Players = pageSoup.find_all("td", {"class":"hauptlink"})
    Fees = pageSoup.find_all("td", "rechts hauptlink")

    playerList = []
    feeList = []
    populateLists(Players, playerList, Fees, feeList)

    if not playerList or not feeList:
        raise Exception('Player List or FeeList is empty and has not been populated')

    df = pd.DataFrame({"Players":playerList, "Transfer Fee (£)":feeList})
    return cleanTransferDataFrame(df)
    

In [None]:
#Get the top transfers for 2021
transfers_2021 = scrapeTransferPage('2021')

In [None]:
#Display the 2021 transfers dataframe
transfers_2021

In [None]:
#Dictionary holding the top transfers of each season, keyed by year
topTransfers = {}

#Scrape every year from 2000 up to and including 2020
for season in range(2000, 2021):
    topTransfers[season] = scrapeTransferPage(season)

In [None]:
#Display the top transfers in the year 2000
topTransfers[2000]

In [None]:
#Display the top transfers in the year 2010
topTransfers[2010]

In [None]:
#First rows of the top transfers in the year 2015
topTransfers[2015].head()

In [None]:
#Last rows of the top transfers in the year 2020
topTransfers[2020].tail()