In [1]:
from matplotlib import pyplot as plt
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from io import StringIO
import os
import time


class Scrape:
    """Scrape squad statistics from fbref.com and cache them as CSV files.

    The scraper first reads a league standings page to discover one URL per
    team, then downloads each team's "Standard Stats" table.

    Args:
        standings_url (str): League stats page whose first ``stats_table``
            links to every team. Defaults to the Premier League page.
        request_delay (float): Seconds to wait before each team request so the
            site is not hit in a tight loop.
        timeout (float): Per-request timeout in seconds; without one a stalled
            connection would hang the notebook indefinitely.
    """

    def __init__(
        self,
        standings_url: str = "https://fbref.com/en/comps/9/Premier-League-Stats",
        request_delay: float = 1.0,
        timeout: float = 30.0,
    ) -> None:
        self.standings_url = standings_url
        self.request_delay = request_delay
        self.timeout = timeout

    def __get_team_urls(self) -> list[str]:
        """Collect the absolute URL of every team linked from the standings table.

        Returns:
            list[str]: One ``https://fbref.com/...`` URL per ``/squads/`` link
            found in the first ``table.stats_table`` of ``self.standings_url``.

        Raises:
            requests.HTTPError: If the standings page answers with a 4xx/5xx status.
            IndexError: If the page contains no ``table.stats_table`` element.
        """
        response = requests.get(self.standings_url, timeout=self.timeout)
        # Fail here with the real HTTP status instead of later with an
        # unrelated parsing error on a "blocked"/"rate limited" page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, features="lxml")
        standings_table = soup.select("table.stats_table")[0]

        # href=True skips anchors without an href attribute; those would
        # otherwise yield None and break the substring test below.
        hrefs = [a.get("href") for a in standings_table.find_all("a", href=True)]
        return [f"https://fbref.com{href}" for href in hrefs if "/squads/" in href]

    @staticmethod
    def _team_name_from_url(team_url: str) -> str:
        """Derive a display name from a team URL.

        The last path segment (e.g. ``Manchester-City-Stats``) is stripped of
        its ``-Stats`` marker and its dashes are turned into spaces.
        """
        slug = team_url.rstrip("/").split("/")[-1]
        return slug.replace("-Stats", "").replace("-", " ")

    def __get_squad_dfs(self, team_urls: list[str]) -> list[pd.DataFrame]:
        """Download the "Standard Stats" table of every team.

        Args:
            team_urls (list[str]): Team page URLs as returned by
                ``__get_team_urls``.

        Returns:
            list[pd.DataFrame]: One frame per team, in the order of
            ``team_urls``, with the top header level removed, a ``team``
            column added and all column names lower-cased.

        Raises:
            requests.HTTPError: If a team page answers with a 4xx/5xx status.
            ValueError: If a page has no table matching "Standard Stats"
                (raised by ``pandas.read_html``).
        """
        squad_dfs = []

        for team_url in team_urls:
            # Pause before every team request (the standings page was fetched
            # just before this loop), rather than after the last one.
            time.sleep(self.request_delay)

            response = requests.get(team_url, timeout=self.timeout)
            response.raise_for_status()

            squads = pd.read_html(StringIO(response.text), match="Standard Stats")[0]
            # The table has a two-level header; keep only the lower level.
            squads = squads.droplevel(level=0, axis=1)

            squads["team"] = self._team_name_from_url(team_url)

            # Normalise column names (not the index) to lower case.
            squads.columns = [c.lower() for c in squads.columns]
            squad_dfs.append(squads)

        return squad_dfs

    @staticmethod
    def _split_players_and_totals(
        squad_dfs: list[pd.DataFrame],
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Split per-team tables into a player table and a per-team summary table.

        Each input frame is expected to end with two summary rows; the
        second-to-last one is kept as the team's summary row and the last one
        is discarded (presumably the opponents' total - confirm against the
        scraped table).

        Args:
            squad_dfs (list[pd.DataFrame]): Frames containing a ``team`` column.

        Returns:
            tuple[pd.DataFrame, pd.DataFrame]: ``(players, totals)`` where
            ``players`` stacks every frame without its last two rows under a
            fresh integer index, and ``totals`` holds one row per team,
            indexed by ``team``, without any column containing NaN.

        Raises:
            ValueError: If ``squad_dfs`` is empty.
        """
        if not squad_dfs:
            raise ValueError("No squad tables were scraped; nothing to store.")

        players = pd.concat([df.iloc[:-2] for df in squad_dfs], ignore_index=True)

        # Each summary row is a Series; stacking them side by side and
        # transposing yields one row per team.
        totals = (
            pd.concat([df.iloc[-2] for df in squad_dfs], axis=1)
            .T
            .dropna(axis=1)
            .set_index("team")
        )
        return players, totals

    def store_squad_to_csv(self):
        """Scrape every team and store the result in two CSV files.

        squad.csv : one row per player, for all teams
        squad_avg.csv : one summary row per team, indexed by team name

        Both files are written to the current working directory, and only
        after both tables have been built, so a failure while processing does
        not leave a half-written cache behind.
        """
        team_urls = self.__get_team_urls()
        squad_dfs = self.__get_squad_dfs(team_urls)

        players, totals = self._split_players_and_totals(squad_dfs)

        players.to_csv("squad.csv", index=True)
        totals.to_csv("squad_avg.csv", index=True)



In [35]:

class Squad:
    """Read-only view over the cached per-team summary table (squad_avg.csv).

    On construction the two cache files are looked up relative to the current
    working directory; if either is missing, a fresh scrape is attempted
    first. The table is then loaded with the team name as index.
    """

    # Columns of the per-team table reported by ``max_min_info``.
    _SUMMARY_STATS = ("age", "gls", "ast", "g+a", "xg", "prgc")

    def __init__(self) -> None:
        # NOTE: relative paths - this will break if the cwd changes.
        squad_path = "./squad.csv"
        squad_avg_path = "./squad_avg.csv"

        if not (os.path.isfile(squad_path) and os.path.isfile(squad_avg_path)):
            # At least one cache file is missing: try to rebuild both.
            try:
                Scrape().store_squad_to_csv()
            except Exception as exc:
                # Best effort: report the failure and still try to load
                # whatever cache is on disk. ``Exception`` (not a bare
                # ``except``) lets Ctrl-C / kernel interrupts through.
                print("Error: Scraping didn't work properly.")
                print(f"Cause: {exc!r}")
            else:
                print("Scraping process executed successfully.")

        # Raises FileNotFoundError if the summary file is still missing.
        self._squad_avg_df = pd.read_csv(squad_avg_path, index_col=0)

    @property
    def squad_avg_df(self) -> pd.DataFrame:
        """The per-team summary table, indexed by team name.

        Exposed as a property: the previous layout stored the frame in an
        instance attribute with the same name as an accessor method, which
        made the method unreachable on instances.
        """
        return self._squad_avg_df

    @squad_avg_df.setter
    def squad_avg_df(self, value: pd.DataFrame) -> None:
        # Kept so that code assigning ``obj.squad_avg_df = frame`` keeps working.
        self._squad_avg_df = value

    def avg_age_graph(self):
        """Show a bar chart of the ``age`` column, one bar per team."""
        # Convert on a copy so plotting never alters the stored table.
        ages = self.squad_avg_df["age"].astype(float)

        fig, ax = plt.subplots()
        ages.plot(kind="bar", color="lightgreen", ax=ax)

        ax.set_title("Average Age of Players by Team")
        ax.set_xlabel("Team")
        ax.set_ylabel("Average Age")

        plt.show()

    def max_min_info(self):
        """Print, for each summary stat, the team with the highest and lowest value."""
        print("Showing General Info about Premier League teams.")
        for info in self._SUMMARY_STATS:
            column = self.squad_avg_df[info]

            top_team = column.idxmax()
            top_value = self.squad_avg_df.loc[top_team, info]

            bottom_team = column.idxmin()
            bottom_value = self.squad_avg_df.loc[bottom_team, info]

            print(f"Max {info}: {top_team} ({top_value})")
            print(f"Min {info}: {bottom_team} ({bottom_value})")
            print("")

        
        

In [36]:
# Build the Squad helper (this triggers a scrape first when the cached CSV
# files are missing) and print, for each headline stat, the teams with the
# highest and the lowest value.
s = Squad()
s.max_min_info()


Showing General Info about Premier League teams.
Max age: Fulham (29.2)
Min age: Chelsea (25.0)

Max gls: Arsenal (80.0)
Min gls: Sheffield United (30.0)

Max ast: Manchester City (59.0)
Min ast: Everton (16.0)

Max g+a: Manchester City (139.0)
Min g+a: Sheffield United (48.0)

Max xg: Liverpool (78.3)
Min xg: Sheffield United (34.5)

Max prgc: Manchester City (1036.0)
Min prgc: Sheffield United (315.0)



The cells below are scratch cells used only for testing.

In [30]:
# Peek at the first rows of the per-team table held by the Squad instance.
s.squad_avg_df.head()

Unnamed: 0_level_0,player,age,mp,starts,min,90s,gls,ast,g+a,g-pk,...,gls.1,ast.1,g+a.1,g-pk.1,g+a-pk,xg.1,xag.1,xg+xag,npxg.1,npxg+xag.1
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arsenal,Squad Total,25.7,34,385,3150.0,35.0,80.0,57.0,137.0,71.0,...,2.29,1.63,3.91,2.03,3.66,1.96,1.4,3.36,1.76,3.16
Manchester City,Squad Total,27.3,33,374,3060.0,34.0,80.0,59.0,139.0,75.0,...,2.35,1.74,4.09,2.21,3.94,2.04,1.57,3.61,1.9,3.47
Liverpool,Squad Total,27.1,35,385,3150.0,35.0,72.0,55.0,127.0,66.0,...,2.06,1.57,3.63,1.89,3.46,2.24,1.64,3.88,2.03,3.67
Aston Villa,Squad Total,27.6,35,385,3150.0,35.0,69.0,52.0,121.0,65.0,...,1.97,1.49,3.46,1.86,3.34,1.69,1.28,2.98,1.61,2.89
Tottenham Hotspur,Squad Total,26.0,32,363,2970.0,33.0,62.0,51.0,113.0,60.0,...,1.88,1.55,3.42,1.82,3.36,1.75,1.47,3.22,1.71,3.17


In [None]:



def main():
    """Entry point: build a ``Squad`` and display its average-age bar chart."""
    Squad().avg_age_graph()


if __name__ == "__main__":
    main()
