In [1]:
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

import MyFunctionsList as mfs
from DoNotPublish import password

In [2]:
# Build the SQLAlchemy engine used both to read the 'leagues' table and to
# upload the scraped clubs.

protocol = 'postgresql'
username = 'postgres'
host = 'localhost'
port = 5432
database_name = 'european_football_db'

# The password is embedded in a URL, so characters such as '@', '/', ':' or '%'
# must be percent-encoded; otherwise the URL is split at the wrong place.
# `password` itself comes from the DoNotPublish import and is left untouched.
escaped_password = quote(password, safe='')

rds_connection_string = f'{protocol}://{username}:{escaped_password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [3]:
# Load every row of the 'leagues' table and key the frame by league_link, so
# that league attributes can later be looked up with
# leagues.loc[<league_link>, <column>].
leagues = (
    pd.read_sql_query('select * from leagues', con=engine)
    .set_index(["league_link"])
)

In [4]:
# Start the scraping browser: a headless Chrome session driven by splinter.
# The value returned by ChromeDriverManager().install() is handed to splinter
# as its `executable_path` keyword argument.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=True, **executable_path)

In [5]:
# Scrape every club listed on each league page into `list_of_clubs`.
#
# Each record is a plain list whose field order is:
#   club name, league, league tier, country, squad size, average age,
#   foreign players, average market value, total market value, club link,
#   years in league, stadium name, stadium capacity
#
# Table rows without a club-name cell are skipped. Rows that do carry a
# club-name cell but fail anywhere later are NOT added to `list_of_clubs`;
# they are recorded in `failed_clubs` so that missing clubs can be investigated
# instead of disappearing silently.

TRANSFERMARKT_URL = "https://www.transfermarkt.us"

# Index of `leagues` holds the league page links
league_links = leagues.index

# One record per successfully scraped club, across all leagues
list_of_clubs = []

# (league, club name, error description) for every row that matched the
# club-name cell but could not be scraped completely.
# NOTE(review): may also contain rows from other tables on the league page
# that happen to use the same cell class - confirm against the site.
failed_clubs = []

for league_link in league_links:
    club_count = 0
    failures_before_league = len(failed_clubs)

    # League-level fields are the same for every club of this league,
    # so look them up once per league rather than once per table row.
    club_league = leagues.loc[league_link, "league"]
    club_league_tier = leagues.loc[league_link, "league_tier"]
    club_country = leagues.loc[league_link, "country"]

    # Go to the page of the individual league and parse its html
    browser.visit(league_link)
    site = soup(browser.html, "html.parser")

    # Every table row is a candidate club
    for club in site.find_all("tr"):

        # Cell holding the club name and the link to the club page
        club_full = club.find("td", class_="hauptlink no-border-links")
        if club_full is None:
            # Row without a club-name cell: nothing to scrape
            continue

        club_name = club_full.text.strip()

        try:
            # Centered cells of the row: squad size, average age, foreigners
            squad_info = club.find_all("td", class_="zentriert")
            squad_size = squad_info[1].text.strip()
            squad_age = squad_info[2].text.strip()
            foreigners = squad_info[3].text.strip()

            # Right-aligned cells of the row: average and total market value
            market_value_info = club.find_all("td", class_="rechts")
            average_market_value = mfs.number_cleaner(market_value_info[0].text.strip())
            total_market_value = mfs.number_cleaner(market_value_info[1].text.strip())

            # Absolute link to the club page
            club_link = club_full.a['href']
            club_url = f"{TRANSFERMARKT_URL}{club_link}"

            # Visit the club page for years_in_league, stadium name and capacity
            browser.visit(club_url)
            club_page = soup(browser.html, "html.parser")

            # The positional indexes below ([2], [4], [1]) depend on the
            # current layout of the club page header.
            league_related_data = club_page.find("div", class_="data-header__box--big")
            years_in_league_field = league_related_data.find_all("span", class_="data-header__label")
            years_in_league_string = years_in_league_field[2].a.text.strip()
            years_in_league = mfs.output_only_digits(years_in_league_string)

            stadium_name_data = club_page.find_all("li", class_="data-header__label")
            stadium_name = stadium_name_data[4].a.text.strip()

            stadium_capacity_info = club_page.find_all("span", class_="tabellenplatz")
            stadium_capacity_string = stadium_capacity_info[1].text.strip()
            stadium_capacity = mfs.output_only_digits(stadium_capacity_string)

        except Exception as error:
            # Best effort: keep scraping the remaining clubs, but remember what
            # was dropped. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit stop the long-running loop.
            failed_clubs.append((club_league, club_name, f"{type(error).__name__}: {error}"))
            continue

        # Only complete records are stored
        list_of_clubs.append([
            club_name,
            club_league,
            club_league_tier,
            club_country,
            squad_size,
            squad_age,
            foreigners,
            average_market_value,
            total_market_value,
            club_url,
            years_in_league,
            stadium_name,
            stadium_capacity,
        ])
        club_count += 1

    print(f"{club_league} is done!")
    print(f"There were {club_count} clubs collected!")

    failed_in_league = len(failed_clubs) - failures_before_league
    if failed_in_league:
        print(f"{failed_in_league} rows could not be scraped (see failed_clubs)")

Premier League is done!
There were 20 clubs collected!
Serie A is done!
There were 20 clubs collected!
Ligue 1 is done!
There were 20 clubs collected!
Süper Lig is done!
There were 19 clubs collected!
Jupiler Pro League is done!
There were 18 clubs collected!
Bundesliga is done!
There were 12 clubs collected!
Scottish Premiership is done!
There were 12 clubs collected!
Super League is done!
There were 10 clubs collected!
SuperSport HNL is done!
There were 10 clubs collected!
Super liga Srbije is done!
There were 16 clubs collected!
SuperLiga is done!
There were 16 clubs collected!
Eliteserien is done!
There were 16 clubs collected!
Ligat ha'Al is done!
There were 14 clubs collected!
LaLiga is done!
There were 20 clubs collected!
Bundesliga is done!
There were 17 clubs collected!
Liga Portugal is done!
There were 18 clubs collected!
Eredivisie is done!
There were 18 clubs collected!
Premier Liga is done!
There were 16 clubs collected!
Super League 1 is done!
There were 14 clubs collecte

In [6]:
# Turn the scraped records into a DataFrame. The labels are listed in the same
# order in which the fields were collected for each club.
scraped_columns = [
    "club",
    "league",
    "league_tier",
    "country",
    "squad_size",
    "average_age",
    "foreign_players",
    "average_club_value_euro",
    "total_club_value_euro",
    "club_link",
    "years_in_league",
    "stadium_name",
    "stadium_capacity",
]

clubs_dataframe = pd.DataFrame(list_of_clubs, columns=scraped_columns)

In [7]:
# Reorder the columns: years_in_league moves next to league, the stadium
# fields follow the market values, and club_link goes last.
upload_column_order = [
    "club",
    "league",
    "years_in_league",
    "league_tier",
    "country",
    "squad_size",
    "average_age",
    "foreign_players",
    "average_club_value_euro",
    "total_club_value_euro",
    "stadium_name",
    "stadium_capacity",
    "club_link",
]

clubs_dataframe = clubs_dataframe.loc[:, upload_column_order]

In [8]:
# Upload the scraped clubs to the 'clubs' table; the DataFrame index is not
# written as a column.
# NOTE(review): if_exists="append" adds the rows to whatever the table already
# holds, so re-running this cell inserts the same clubs again.
clubs_dataframe.to_sql(
    name="clubs",
    con=engine,
    if_exists="append",
    index=False,
)

In [9]:
# Scraping is finished: shut down the browser session started earlier
browser.quit()

In [10]:
# Display the final clubs table (reordered columns) as the cell output
clubs_dataframe

Unnamed: 0,club,league,years_in_league,league_tier,country,squad_size,average_age,foreign_players,average_club_value_euro,total_club_value_euro,stadium_name,stadium_capacity,club_link
0,Manchester City,Premier League,21,1,England,24,26.7,16,43700000,1050000000,Etihad Stadium,55017,https://www.transfermarkt.us/manchester-city/s...
1,Chelsea FC,Premier League,31,1,England,33,25.0,22,31650000,1040000000,Stamford Bridge,40853,https://www.transfermarkt.us/fc-chelsea/starts...
2,Liverpool FC,Premier League,31,1,England,30,26.5,21,31030000,931000000,Anfield,54074,https://www.transfermarkt.us/fc-liverpool/star...
3,Arsenal FC,Premier League,31,1,England,23,25.1,16,34910000,803000000,Emirates Stadium,60704,https://www.transfermarkt.us/fc-arsenal/starts...
4,Manchester United,Premier League,31,1,England,32,25.5,20,23730000,759200000,Old Trafford,74879,https://www.transfermarkt.us/manchester-united...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,Nea Salamina Famagusta,Protathlima Cyta,1,1,Cyprus,29,28.5,20,228000,6600000,Ammochostos Stadium,5500,https://www.transfermarkt.us/nea-salamina-fama...
394,Karmiotissa Pano Polemidion,Protathlima Cyta,1,1,Cyprus,28,28.1,20,226000,6330000,Stelios Kyriakidis Stadio,9394,https://www.transfermarkt.us/karmiotissa-pano-...
395,Doxa Katokopias,Protathlima Cyta,11,1,Cyprus,24,27.1,17,235000,5650000,Stadio Katokopia,3500,https://www.transfermarkt.us/doxa-katokopias/s...
396,Akritas Chlorakas,Protathlima Cyta,1,1,Cyprus,28,23.3,23,198000,5550000,Kinotiko Stadio Chlorakas,2500,https://www.transfermarkt.us/akritas-chlorakas...
