In [29]:
!pip install pandas




In [30]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [42]:
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import time

# Initialize variables
years = [2018, 2017, 2016, 2015, 2014]  # Season labels (ending year), newest first
all_matches = []  # One DataFrame per (team, season), concatenated at the end

# Standings page for the first entry of `years`. Older seasons are reached by
# following each page's "previous season" link, so the labels in `years` only
# stay correct while every step of that walk succeeds.
standings_url = "https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats"

BASE_URL = "https://fbref.com"
REQUEST_DELAY_SECONDS = 6  # pause after every request => at most ~10 requests/minute
REQUEST_TIMEOUT_SECONDS = 30
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "PK", "PKatt"]
OUTPUT_CSV = "premierleague_team_data.csv"


def fetch_html(url):
    """Download ``url`` and return the response body, or ``None`` on failure.

    Network errors and non-2xx responses are reported with ``print`` instead of
    being raised, so one bad page does not abort a long scraping run. The
    function always sleeps ``REQUEST_DELAY_SECONDS`` after the attempt, which
    spaces out *every* request (including failed ones).
    NOTE(review): confirm the delay against the site's current rate limit.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    finally:
        # Respectful scraping
        time.sleep(REQUEST_DELAY_SECONDS)
    if not response.ok:
        print(f"Request for {url} returned HTTP {response.status_code}")
        return None
    return response.text


for year in years:
    # Fetch the standings page
    standings_html = fetch_html(standings_url)
    if standings_html is None:
        # Without this page the previous-season link is unknown; retrying the
        # same URL under the next label would mislabel the season.
        print(f"Could not download standings for year: {year}. Terminating.")
        break
    standings_soup = BeautifulSoup(standings_html, "html.parser")

    # Check for the existence of the standings table
    standings_tables = standings_soup.select("table.stats_table")
    if not standings_tables:
        print(f"No standings table found for year: {year}. Terminating.")
        break

    # Collect squad page URLs; anchors without an href yield None and are skipped
    hrefs = [a.get("href") for a in standings_tables[0].find_all("a")]
    team_urls = [f"{BASE_URL}{href}" for href in hrefs if href and "/squads/" in href]

    # Remember the previous-season link now, but only act on it after this
    # season's teams have been processed so the last reachable season is kept.
    prev_season_link = standings_soup.select("a.prev")
    prev_href = prev_season_link[0].get("href") if prev_season_link else None
    next_standings_url = f"{BASE_URL}{prev_href}" if prev_href else None

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", "")

        # Fetch team-specific data
        team_html = fetch_html(team_url)
        if team_html is None:
            continue

        try:
            # Read the "Scores & Fixtures" table
            matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]
        except ValueError:
            print(f"No 'Scores & Fixtures' table for team: {team_name}, year: {year}")
            continue

        # Parse the team page for the "Shooting" table link
        team_soup = BeautifulSoup(team_html, "html.parser")
        page_hrefs = [a.get("href") for a in team_soup.find_all("a")]
        shooting_links = [href for href in page_hrefs if href and "all_comps/shooting/" in href]

        if not shooting_links:
            print(f"No 'Shooting' data for team: {team_name}, year: {year}")
            continue

        # Access the "Shooting" table
        shooting_url = f"{BASE_URL}{shooting_links[0]}"
        print(f"Accessing shooting data: {shooting_url}")

        shooting_html = fetch_html(shooting_url)
        if shooting_html is None:
            continue

        try:
            shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
            shooting.columns = shooting.columns.droplevel()  # Drop multi-level headers
        except ValueError:
            print(f"Failed to parse 'Shooting' table for URL: {shooting_url}")
            continue

        # Merge match data with shooting stats. A missing shooting column raises
        # KeyError, which must not abort the whole run either.
        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError):
            print(f"Mismatch in merging data for team: {team_name}, year: {year}")
            continue

        # Filter for Premier League matches and add metadata. `assign` returns a
        # new frame, so no column is written onto a filtered slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(
            Season=year, Team=team_name
        )

        # Append to the aggregate list
        all_matches.append(team_data)

    # Move on to the previous season, or stop when the page offers none
    if next_standings_url is None:
        print(f"No previous season link found for year: {year}. Terminating.")
        break
    standings_url = next_standings_url
    print(f"Switching to previous season: {standings_url}")

if all_matches:
    # Combine all collected data into a single DataFrame
    final_data = pd.concat(all_matches, ignore_index=True)

    # Save to CSV
    final_data.to_csv(OUTPUT_CSV, index=False)
    print(f"Data collection and merging complete. Saved to '{OUTPUT_CSV}'.")
else:
    # pd.concat raises on an empty list; keep `final_data` defined and leave any
    # existing CSV untouched.
    final_data = pd.DataFrame()
    print(f"No match data was collected; '{OUTPUT_CSV}' was not written.")



Switching to previous season: https://fbref.com/en/comps/9/2016-2017/2016-2017-Premier-League-Stats
Accessing shooting data: https://fbref.com/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/19538871/2017-2018/matchlogs/all_comps/shooting/Manchester-United-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/361ca564/2017-2018/matchlogs/all_comps/shooting/Tottenham-Hotspur-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/822bd0ba/2017-2018/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/cff3d9bb/2017-2018/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/18bb7c10/2017-2018/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/sq

In [43]:
# Display the combined match frame built by the most recently run scraping cell
# (pandas abbreviates the rendering of long/wide frames).
final_data

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,PK,PKatt,Season,Team
0,2017-08-12,17:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Brighton,...,4-4-2,Michael Oliver,Match Report,,14.0,4.0,0,0.0,2018,ManchesterCity
1,2017-08-21,20:00,Premier League,Matchweek 2,Mon,Home,D,1,1,Everton,...,3-5-2,Robert Madley,Match Report,,19.0,6.0,0,0.0,2018,ManchesterCity
2,2017-08-26,12:30,Premier League,Matchweek 3,Sat,Away,W,2,1,Bournemouth,...,5-3-2,Mike Dean,Match Report,,19.0,8.0,0,0.0,2018,ManchesterCity
3,2017-09-09,12:30,Premier League,Matchweek 4,Sat,Home,W,5,0,Liverpool,...,4-3-3,Jonathan Moss,Match Report,,13.0,10.0,0,0.0,2018,ManchesterCity
4,2017-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,6,0,Watford,...,4-2-3-1,Anthony Taylor,Match Report,,27.0,9.0,1,1.0,2018,ManchesterCity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2014-04-12,,Premier League,Matchweek 34,Sat,Away,W,1,0,Southampton,...,4-2-3-1,Jon Moss,Match Report,,,,0,,2014,CardiffCity
3796,2014-04-19,,Premier League,Matchweek 35,Sat,Home,D,1,1,Stoke City,...,4-3-2-1,Howard Webb,Match Report,,,,1,,2014,CardiffCity
3797,2014-04-27,,Premier League,Matchweek 36,Sun,Away,L,0,4,Sunderland,...,4-3-3,Phil Dowd,Match Report,,,,0,,2014,CardiffCity
3798,2014-05-03,,Premier League,Matchweek 37,Sat,Away,L,0,3,Newcastle Utd,...,4-4-1-1,Martin Atkinson,Match Report,,,,0,,2014,CardiffCity


In [44]:
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import time

# Initialize variables
years = [2023, 2022, 2021, 2020]  # Season labels (ending year), newest first
all_matches = []  # One DataFrame per (team, season), concatenated at the end

# Standings page for the first entry of `years`. Older seasons are reached by
# following each page's "previous season" link, so the labels in `years` only
# stay correct while every step of that walk succeeds.
standings_url = "https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats"

BASE_URL = "https://fbref.com"
REQUEST_DELAY_SECONDS = 6  # pause after every request => at most ~10 requests/minute
REQUEST_TIMEOUT_SECONDS = 30
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "PK", "PKatt"]
OUTPUT_CSV = "premierleague_test_team_data.csv"


def fetch_html(url):
    """Download ``url`` and return the response body, or ``None`` on failure.

    Network errors and non-2xx responses are reported with ``print`` instead of
    being raised, so one bad page does not abort a long scraping run. The
    function always sleeps ``REQUEST_DELAY_SECONDS`` after the attempt, which
    spaces out *every* request (including failed ones).
    NOTE(review): confirm the delay against the site's current rate limit.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    finally:
        # Respectful scraping
        time.sleep(REQUEST_DELAY_SECONDS)
    if not response.ok:
        print(f"Request for {url} returned HTTP {response.status_code}")
        return None
    return response.text


for year in years:
    # Fetch the standings page
    standings_html = fetch_html(standings_url)
    if standings_html is None:
        # Without this page the previous-season link is unknown; retrying the
        # same URL under the next label would mislabel the season.
        print(f"Could not download standings for year: {year}. Terminating.")
        break
    standings_soup = BeautifulSoup(standings_html, "html.parser")

    # Check for the existence of the standings table
    standings_tables = standings_soup.select("table.stats_table")
    if not standings_tables:
        print(f"No standings table found for year: {year}. Terminating.")
        break

    # Collect squad page URLs; anchors without an href yield None and are skipped
    hrefs = [a.get("href") for a in standings_tables[0].find_all("a")]
    team_urls = [f"{BASE_URL}{href}" for href in hrefs if href and "/squads/" in href]

    # Remember the previous-season link now, but only act on it after this
    # season's teams have been processed so the last reachable season is kept.
    prev_season_link = standings_soup.select("a.prev")
    prev_href = prev_season_link[0].get("href") if prev_season_link else None
    next_standings_url = f"{BASE_URL}{prev_href}" if prev_href else None

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", "")

        # Fetch team-specific data
        team_html = fetch_html(team_url)
        if team_html is None:
            continue

        try:
            # Read the "Scores & Fixtures" table
            matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]
        except ValueError:
            print(f"No 'Scores & Fixtures' table for team: {team_name}, year: {year}")
            continue

        # Parse the team page for the "Shooting" table link
        team_soup = BeautifulSoup(team_html, "html.parser")
        page_hrefs = [a.get("href") for a in team_soup.find_all("a")]
        shooting_links = [href for href in page_hrefs if href and "all_comps/shooting/" in href]

        if not shooting_links:
            print(f"No 'Shooting' data for team: {team_name}, year: {year}")
            continue

        # Access the "Shooting" table
        shooting_url = f"{BASE_URL}{shooting_links[0]}"
        print(f"Accessing shooting data: {shooting_url}")

        shooting_html = fetch_html(shooting_url)
        if shooting_html is None:
            continue

        try:
            shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
            shooting.columns = shooting.columns.droplevel()  # Drop multi-level headers
        except ValueError:
            print(f"Failed to parse 'Shooting' table for URL: {shooting_url}")
            continue

        # Merge match data with shooting stats. A missing shooting column raises
        # KeyError, which must not abort the whole run either.
        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError):
            print(f"Mismatch in merging data for team: {team_name}, year: {year}")
            continue

        # Filter for Premier League matches and add metadata. `assign` returns a
        # new frame, so no column is written onto a filtered slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(
            Season=year, Team=team_name
        )

        # Append to the aggregate list
        all_matches.append(team_data)

    # Move on to the previous season, or stop when the page offers none
    if next_standings_url is None:
        print(f"No previous season link found for year: {year}. Terminating.")
        break
    standings_url = next_standings_url
    print(f"Switching to previous season: {standings_url}")

if all_matches:
    # Combine all collected data into a single DataFrame
    final_data = pd.concat(all_matches, ignore_index=True)

    # Save to CSV
    final_data.to_csv(OUTPUT_CSV, index=False)
    print(f"Data collection and merging complete. Saved to '{OUTPUT_CSV}'.")
else:
    # pd.concat raises on an empty list; keep `final_data` defined and leave any
    # existing CSV untouched.
    final_data = pd.DataFrame()
    print(f"No match data was collected; '{OUTPUT_CSV}' was not written.")

Switching to previous season: https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats
Accessing shooting data: https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/18bb7c10/2022-2023/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/19538871/2022-2023/matchlogs/all_comps/shooting/Manchester-United-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/b2b47a98/2022-2023/matchlogs/all_comps/shooting/Newcastle-United-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/822bd0ba/2022-2023/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions
Accessing shooting data: https://fbref.com/en/squads/d07537b9/2022-2023/matchlogs/all_comps/shooting/Brighton-and-Hove-Albion-Match-Logs-All-Competitions
Accessing shooting data: https:/

In [6]:
# Imports are repeated here so this cell runs on a fresh kernel without first
# executing the slow scraping cells.
import pandas as pd
import requests
from bs4 import BeautifulSoup

UNRANKED_RANK = 1000   # placeholder rank for opponents missing from the ranking table
TOP_RANK_CUTOFF = 100  # opponents ranked at or above this position count as "ranked"

# Fetch and clean the rank table
ranking_url = "https://www.clubranking.eu/5-year-ranking-2018/"
ranking = requests.get(ranking_url, timeout=30)
if not ranking.ok:
    print(f"Warning: ranking page returned HTTP {ranking.status_code}: {ranking_url}")
soup = BeautifulSoup(ranking.text, "html.parser")
table_div = soup.find('div', {'data-id': 'baba125'})
rows = table_div.find_all('tr') if table_div else []
if not rows:
    # Best effort is kept (the merge below still runs), but make the failure visible.
    print("Warning: ranking table not found; every opponent will be treated as unranked.")

# Extract headers and data
headers = ['Rank', 'Logo', 'Club', 'Points', 'League']
data = [[cell.text.strip() for cell in row.find_all('td')] for row in rows]

# Keep only the two cells that are used, and skip rows too short to contain them
# (e.g. rows without <td> cells), instead of padding them and dropping NaNs later.
rank_pos, club_pos = headers.index('Rank'), headers.index('Club')
records = [
    (cells[rank_pos], cells[club_pos])
    for cells in data
    if len(cells) > max(rank_pos, club_pos)
]

# Create the rank DataFrame and clean it
rank_df = pd.DataFrame(records, columns=['Rank', 'Club'])

# Remove the dot from the Rank column and make it numeric; unparseable ranks become NaN
rank_df['Rank'] = pd.to_numeric(
    rank_df['Rank'].str.replace('.', '', regex=False), errors='coerce'
)

# Clean the Club column
rank_df['Club'] = (
    rank_df['Club']
    .str.replace(r'\bFC\b|\bCF\b', '', regex=True)
    .str.strip()
    .replace({
        'Manchester United': 'Manchester Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Ham United': 'West Ham'
    })
)

# Reset the index on the whole frame, never on a single column: assigning a
# re-indexed Series back to a frame whose index has gaps aligns on index labels
# and shifts club names against their ranks.
# Duplicate club names are dropped so the left merge below cannot multiply match
# rows (first occurrence kept; presumably the best rank - TODO confirm page order).
rank_df = (
    rank_df
    .dropna(subset=['Rank'])
    .drop_duplicates(subset='Club')
    .reset_index(drop=True)
)

# Load the existing premierleague_team_data.csv file
team_data = pd.read_csv("premierleague_team_data.csv")

# Add the 'Rank' column by merging with the rank DataFrame, then drop the
# redundant 'Club' key that came from the rank DataFrame
team_data = team_data.merge(
    rank_df, left_on='Opponent', right_on='Club', how='left', validate='many_to_one'
).drop(columns=['Club'])

# Fill missing ranks with the unranked placeholder
team_data['Rank'] = team_data['Rank'].fillna(UNRANKED_RANK).astype(int)

# Add a binary indicator for ranked teams
team_data['IsRanked'] = (team_data['Rank'] <= TOP_RANK_CUTOFF).astype(int)

# Save the updated data to a new CSV file
team_data.to_csv("premierleague_rank_team_data.csv", index=False)

print("Rank data added to the 'premierleague_rank_team_data.csv' file.")



Rank data added to the 'premierleague_rank_team_data.csv' file.


In [9]:
# Imports are repeated here so this cell runs on a fresh kernel without first
# executing the slow scraping cells.
import pandas as pd
import requests
from bs4 import BeautifulSoup

UNRANKED_RANK = 1000   # placeholder rank for opponents missing from the ranking table
TOP_RANK_CUTOFF = 100  # opponents ranked at or above this position count as "ranked"

# Fetch and clean the rank table
ranking_url = "https://www.clubranking.eu/5-year-ranking-2022/"
ranking = requests.get(ranking_url, timeout=30)
if not ranking.ok:
    print(f"Warning: ranking page returned HTTP {ranking.status_code}: {ranking_url}")
soup = BeautifulSoup(ranking.text, "html.parser")
table_div = soup.find('div', {'data-id': 'fce8702'})
rows = table_div.find_all('tr') if table_div else []
if not rows:
    # Best effort is kept (the merge below still runs), but make the failure visible.
    print("Warning: ranking table not found; every opponent will be treated as unranked.")

# Extract headers and data
headers = ['Rank', 'Logo', 'Club', 'Points', 'League']
data = [[cell.text.strip() for cell in row.find_all('td')] for row in rows]

# Keep only the two cells that are used, and skip rows too short to contain them
# (e.g. rows without <td> cells), instead of padding them and dropping NaNs later.
rank_pos, club_pos = headers.index('Rank'), headers.index('Club')
records = [
    (cells[rank_pos], cells[club_pos])
    for cells in data
    if len(cells) > max(rank_pos, club_pos)
]

# Create the rank DataFrame and clean it
rank_df = pd.DataFrame(records, columns=['Rank', 'Club'])

# Remove the dot from the Rank column and make it numeric; unparseable ranks become NaN
rank_df['Rank'] = pd.to_numeric(
    rank_df['Rank'].str.replace('.', '', regex=False), errors='coerce'
)

# Clean the Club column
rank_df['Club'] = (
    rank_df['Club']
    .str.replace(r'\bFC\b|\bCF\b', '', regex=True)
    .str.strip()
    .replace({
        'Manchester United': 'Manchester Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Ham United': 'West Ham',
        'Brighton & Hove Albion':'Brighton',
        'Newcastle United': 'Newcastle Utd'
    })
)

# Reset the index on the whole frame, never on a single column: assigning a
# re-indexed Series back to a frame whose index has gaps aligns on index labels
# and shifts club names against their ranks.
# Duplicate club names are dropped so the left merge below cannot multiply match
# rows (first occurrence kept; presumably the best rank - TODO confirm page order).
rank_df = (
    rank_df
    .dropna(subset=['Rank'])
    .drop_duplicates(subset='Club')
    .reset_index(drop=True)
)

# Load the existing premierleague_test_team_data.csv file
team_data = pd.read_csv("premierleague_test_team_data.csv")

# Add the 'Rank' column by merging with the rank DataFrame, then drop the
# redundant 'Club' key that came from the rank DataFrame
team_data = team_data.merge(
    rank_df, left_on='Opponent', right_on='Club', how='left', validate='many_to_one'
).drop(columns=['Club'])

# Fill missing ranks with the unranked placeholder
team_data['Rank'] = team_data['Rank'].fillna(UNRANKED_RANK).astype(int)

# Add a binary indicator for ranked teams
team_data['IsRanked'] = (team_data['Rank'] <= TOP_RANK_CUTOFF).astype(int)

# Save the updated data to a new CSV file
team_data.to_csv("premierleague_rank_test_team_data.csv", index=False)

print("Rank data added to the 'premierleague_rank_test_team_data.csv' file.")



Rank data added to the 'premierleague_rank_test_team_data.csv' file.


In [10]:
# Show the dtype of every column in `team_data`, i.e. the frame left in memory
# by whichever ranking cell was executed last.
print(team_data.dtypes)

Date              object
Time              object
Comp              object
Round             object
Day               object
Venue             object
Result            object
GF                 int64
GA                 int64
Opponent          object
xG               float64
xGA              float64
Poss             float64
Attendance       float64
Captain           object
Formation         object
Opp Formation     object
Referee           object
Match Report      object
Notes            float64
Sh               float64
SoT              float64
PK                 int64
PKatt              int64
Season             int64
Team              object
Rank               int64
IsRanked           int64
dtype: object
