In [1]:
from datetime import datetime
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

import MyFunctionsList as mfs
from DoNotPublish import password

# Debugging
from IPython.display import display, clear_output

In [2]:
# Create the SQLAlchemy engine for the local PostgreSQL database

protocol = 'postgresql'
username = 'postgres'
host = 'localhost'
port = 5432
database_name = 'european_football_db'

# Percent-encode the password so characters such as '@', ':', '/' or '%' cannot be
# mistaken for URL delimiters when the connection string is parsed.
# `password` (imported from DoNotPublish) is deliberately not rebound, so re-running
# this cell never encodes the value twice.
encoded_password = quote(str(password), safe='')

rds_connection_string = f'{protocol}://{username}:{encoded_password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [3]:
# Load every row of the 'clubs' table and key the frame by club_link,
# so that a club's other columns can later be looked up with .loc[club_link, ...]
clubs = (
    pd.read_sql_query('select * from clubs', con=engine)
    .set_index(["club_link"])
)

In [4]:
# Start the scraping browser: install/locate a chromedriver binary,
# then open a headless Chrome session driven through splinter

chrome_driver_path = ChromeDriverManager().install()
executable_path = {'executable_path': chrome_driver_path}
browser = Browser('chrome', headless=True, **executable_path)

In [5]:
# Progress counter shown while the scrape runs
debug_number = 0

# Placeholder values stored when the squad table shows "-" for a field
MISSING_SHIRT_NUMBER = 100
MISSING_DATE = "Jan 1, 1900"


def _parse_squad_date(raw_date):
    """Convert a 'Mon D, YYYY' cell text to a date; '-' becomes the MISSING_DATE placeholder."""
    if raw_date == "-":
        raw_date = MISSING_DATE
    return datetime.strptime(raw_date, "%b %d, %Y").date()


def _parse_player_row(player, club_id):
    """Build one player record from a squad-table <tr> element.

    Parameters:
        player: BeautifulSoup tag for one row of the detailed squad table.
        club_id: value stored as the record's club_id.

    Returns:
        A list ordered as: name, club_id, number, position, date of birth,
        first nationality, second nationality, height, preferred foot,
        join date, contract end date, market value.

    Raises:
        Whatever the lookups raise (e.g. AttributeError, IndexError, ValueError)
        when the row lacks an expected cell or a value cannot be converted;
        the caller decides how to handle such rows.
    """
    name = player.find("td", class_="hauptlink").text.strip()

    # NOTE(review): number stays a string unless it is missing, in which case it is
    # the integer placeholder - confirm the target column tolerates both.
    number = player.find("div", class_="rn_nummer").text.strip()
    if number == "-":
        number = MISSING_SHIRT_NUMBER

    # The position text is the second <tr> of the <tbody> nested inside the player row
    position = player.find("tbody").find_all("tr")[1].text.strip()

    # Centered cells hold the remaining fields; they are addressed by position below
    info_cells = player.find_all("td", class_="zentriert")

    # mfs.date_cleaner presumably extracts the date from the "date (age)" text - TODO confirm
    date_of_birth = mfs.date_cleaner(info_cells[1].text.strip())

    # Nationalities are only available as the alt text of the flag images
    nationalities = [flag["alt"] for flag in info_cells[2].find_all("img")]
    if len(nationalities) == 1:
        nationalities.append("None")

    height = int(mfs.output_only_digits(info_cells[3].text.strip()))
    preferred_foot = info_cells[4].text.strip()
    join_date = _parse_squad_date(info_cells[5].text.strip())
    contract_end = _parse_squad_date(info_cells[7].text.strip())

    market_value = mfs.number_cleaner(
        player.find("td", class_="rechts hauptlink").text.strip()
    )

    return [
        name,
        club_id,
        number,
        position,
        date_of_birth,
        nationalities[0],
        nationalities[1],
        height,
        preferred_foot,
        join_date,
        contract_end,
        market_value,
    ]


# Index contains the links to the club pages
clubs_links = clubs.index

# All successfully parsed player records across clubs
list_of_players = []

# (club_link, error) for every row that could not be parsed, so dropped rows stay visible
skipped_players = []

# Loop through clubs
for club_link in clubs_links:

    # Go to the club page
    browser.visit(club_link)
    site = soup(browser.html, "html.parser")

    # Locate and go to the detailed club page (second link of the tab bar)
    detailed_tab = site.find("div", class_="tm-tabs").find_all("a")
    detailed_link = detailed_tab[1]["href"]
    browser.visit(f"https://www.transfermarkt.us{detailed_link}")
    site = soup(browser.html, "html.parser")

    # Generate list of rows:
    # the page contains buried <tr> elements that would make a plain <tr> search hard to parse,
    # so only the "odd"/"even" row classes that correspond to players are collected.
    players = site.find_all("tr", class_="odd") + site.find_all("tr", class_="even")

    # Same for every player of this club, so look it up once
    club_id = clubs.loc[club_link, "club_id"]

    for player in players:
        try:
            list_of_players.append(_parse_player_row(player, club_id))
        except Exception as error:
            # Best-effort scrape: skip rows that do not parse, but keep a trace of them.
            # Exception (not a bare except) lets KeyboardInterrupt stop the run.
            skipped_players.append((club_link, repr(error)))

    # Debug
    debug_number += 1
    clear_output(wait=True)
    display(f"{debug_number} out of {len(clubs_links)} complete")

if skipped_players:
    display(f"{len(skipped_players)} rows skipped; see skipped_players for details")

'398 out of 398 complete'

In [6]:
# Close the browser session now that scraping has finished
browser.quit()

In [7]:
# Build the players DataFrame; column order mirrors the order in which
# each scraped record was assembled
player_columns = (
    "name",
    "club_id",
    "number",
    "position",
    "date_of_birth",
    "primary_nationality",
    "secondary_nationality",
    "height_cm",
    "preferred_foot",
    "date_joined_club",
    "contract_end_date",
    "player_market_value",
)
players_dataframe = pd.DataFrame(list_of_players, columns=player_columns)

In [8]:
# Upload Data to Database
# NOTE(review): if_exists="append" adds these rows to whatever the "players" table
# already holds, so running this cell a second time inserts the same rows again.
# index=False keeps the DataFrame's positional index out of the table.
players_dataframe.to_sql(name="players", con=engine, if_exists="append", index=False)

In [9]:
# Display the DataFrame as a visual check of the scraped records
players_dataframe

Unnamed: 0,name,club_id,number,position,date_of_birth,primary_nationality,secondary_nationality,height_cm,preferred_foot,date_joined_club,contract_end_date,player_market_value
0,Ederson,1,31,Goalkeeper,1993-08-17,Brazil,Portugal,188,left,2017-07-01,2026-06-30,45000000
1,Scott Carson,1,33,Goalkeeper,1985-09-03,England,,188,right,2021-07-20,2023-06-30,250000
2,Aymeric Laporte,1,14,Centre-Back,1994-05-27,Spain,France,189,left,2018-01-30,2025-06-30,38000000
3,Nathan Aké,1,6,Centre-Back,1995-02-18,Netherlands,Cote d'Ivoire,180,left,2020-08-05,2025-06-30,30000000
4,Sergio Gómez,1,21,Left-Back,2000-09-04,Spain,,171,left,2022-08-16,2026-06-30,15000000
...,...,...,...,...,...,...,...,...,...,...,...,...
10328,Márcio Meira,398,10,Attacking Midfield,1994-01-09,Portugal,,169,right,2021-07-22,2023-05-31,200000
10329,Kandet Diawara,398,9,Left Winger,2000-02-10,Guinea,France,180,right,2022-09-03,2023-06-30,250000
10330,Julien Lamy,398,99,Left Winger,1999-11-06,France,Cameroon,185,both,2021-08-31,2023-05-31,150000
10331,Dimitris Flouris,398,23,Right Winger,2002-07-23,Cyprus,,181,right,2020-07-01,1900-01-01,0
