<a href="https://colab.research.google.com/github/Vik7am10/SportData/blob/main/EPL2024MatchReportScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
pl_url = "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"
data = requests.get(pl_url)
soup = BeautifulSoup(data.text)

In [2]:
td_elements = soup.find_all('td', class_='left', attrs={'data-stat': 'match_report'})
match_report_links = [td.find('a')['href'] for td in td_elements if td.find('a') and td.find('a').get('href').startswith('/')]

In [7]:
match_report_links = [f"https://fbref.com{l}" for l in match_report_links]
match_report_links

['https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/3a6836b4/Burnley-Manchester-City-August-11-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/26a7f90c/Arsenal-Nottingham-Forest-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/15addfc7/Everton-Fulham-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/55fd92c7/Sheffield-United-Crystal-Palace-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/56a137f7/Brighton-and-Hove-Albion-Luton-Town-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/d6bbf293/Bournemouth-West-Ham-United-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/8ff2f8fe/Newcastle-United-Aston-Villa-August-12-2023-Premier-League',
 'https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/67e

In [5]:
larger_dataset_columns = ["Match ID", "Team1", "Team2", "Score1", "Score2", "Possession1", "Possession2",
                          "Passing Accuracy1", "Passing Accuracy2", "Shots on Target1", "Shots on Target2",
                          "Saves1", "Saves2", "Cards1", "Cards2", "Fouls1", "Fouls2", "Corners1", "Corners2",
                          "Crosses1", "Crosses2", "Touches1", "Touches2", "Tackles1", "Tackles2",
                          "Interceptions1", "Interceptions2", "Aerials Won1", "Aerials Won2",
                          "Clearances1", "Clearances2", "Offsides1", "Offsides2", "Goal Kicks1",
                          "Goal Kicks2", "Throw Ins1", "Throw Ins2", "Long Balls1", "Long Balls2"]

larger_dataset = pd.DataFrame(columns=larger_dataset_columns)

In [8]:
import time
i = 0
for report in match_report_links:
  #soup to parse the link
  print(i)
  print(report)
  data = requests.get(report)
  soup = BeautifulSoup(data.text)

  #extract score first
  scorebox = soup.find('div', class_='scorebox')

  # Initialize lists to store data
  teams_data = []

  # Extract team names, scores, and expected goals (xG)
  teams = scorebox.find_all('div', recursive=False)
  for team in teams[:2]:  # Only interested in the first two divs, each representing a team
      team_name = team.find('strong').get_text(strip=True)
      team_score = team.find('div', class_='score').get_text(strip=True)
      teams_data.append({"Team": team_name, "Score": team_score})

  df = pd.DataFrame(teams_data)

  #starting extracting team stats now
  team_stats = soup.find('div', id='team_stats')
  stat_headers = team_stats.find_all('th', style=True)
  team1_name = stat_headers[0].get_text(strip=True)
  team2_name = stat_headers[1].get_text(strip=True)

  stats_data = {"Stat": [], team1_name: [], team2_name: []}

  stat_rows = team_stats.find_all('tr')
  current_stat = ""
  for row in stat_rows:
      if row.th and row.th.get('colspan'):  # It's a stat header
          current_stat = row.th.get_text(strip=True)
      elif row.td:  # It's a stat value row
          team1_stat = row.find_all('td')[0].get_text(strip=True)
          team2_stat = row.find_all('td')[1].get_text(strip=True)
          stats_data["Stat"].append(current_stat)
          stats_data[team1_name].append(team1_stat)
          stats_data[team2_name].append(team2_stat)

  #extract other stats
  team_stats_extra = soup.find('div', id='team_stats_extra')

  extra_stat_divs = team_stats_extra.find_all('div')
  extra_stats = []
  extra_stats.append(extra_stat_divs[0])
  extra_stats.append(extra_stat_divs[16])
  extra_stats.append(extra_stat_divs[32])

  for section in extra_stats:
    divs = section.find_all('div', recursive=False)
    for i in range(3, len(divs), 3):  # Start from the 4th div and process in steps of 3
        team1_stat = divs[i].get_text(strip=True)
        stat_name = divs[i + 1].get_text(strip=True)
        team2_stat = divs[i + 2].get_text(strip=True)
        stats_data["Stat"].append(stat_name)
        stats_data[team1_name].append(team1_stat)
        stats_data[team2_name].append(team2_stat)

# Combine team scores with team stats
  teams_df = pd.DataFrame(teams_data).set_index('Team').T
  stats_df = pd.DataFrame(stats_data).set_index('Stat')
  combined_df = pd.concat([teams_df, stats_df])

# Convert combined_df to a single row
  match_id = i  # Example match ID
  i = i + 1
  team1_name = combined_df.columns[0]
  team2_name = combined_df.columns[1]

  match_info = {
      "Match ID": match_id,
      "Team1": team1_name,
      "Team2": team2_name,
      "Score1": combined_df.at["Score", team1_name],
      "Score2": combined_df.at["Score", team2_name],
      "Possession1": combined_df.at["Possession", team1_name],
      "Possession2": combined_df.at["Possession", team2_name],
      "Passing Accuracy1": combined_df.at["Passing Accuracy", team1_name],
      "Passing Accuracy2": combined_df.at["Passing Accuracy", team2_name],
      "Shots on Target1": combined_df.at["Shots on Target", team1_name],
      "Shots on Target2": combined_df.at["Shots on Target", team2_name],
      "Saves1": combined_df.at["Saves", team1_name],
      "Saves2": combined_df.at["Saves", team2_name],
      "Cards1": combined_df.at["Cards", team1_name],
      "Cards2": combined_df.at["Cards", team2_name],
      "Fouls1": combined_df.at["Fouls", team1_name],
      "Fouls2": combined_df.at["Fouls", team2_name],
      "Corners1": combined_df.at["Corners", team1_name],
      "Corners2": combined_df.at["Corners", team2_name],
      "Crosses1": combined_df.at["Crosses", team1_name],
      "Crosses2": combined_df.at["Crosses", team2_name],
      "Touches1": combined_df.at["Touches", team1_name],
      "Touches2": combined_df.at["Touches", team2_name],
      "Tackles1": combined_df.at["Tackles", team1_name],
      "Tackles2": combined_df.at["Tackles", team2_name],
      "Interceptions1": combined_df.at["Interceptions", team1_name],
      "Interceptions2": combined_df.at["Interceptions", team2_name],
      "Aerials Won1": combined_df.at["Aerials Won", team1_name],
      "Aerials Won2": combined_df.at["Aerials Won", team2_name],
      "Clearances1": combined_df.at["Clearances", team1_name],
      "Clearances2": combined_df.at["Clearances", team2_name],
      "Offsides1": combined_df.at["Offsides", team1_name],
      "Offsides2": combined_df.at["Offsides", team2_name],
      "Goal Kicks1": combined_df.at["Goal Kicks", team1_name],
      "Goal Kicks2": combined_df.at["Goal Kicks", team2_name],
      "Throw Ins1": combined_df.at["Throw Ins", team1_name],
      "Throw Ins2": combined_df.at["Throw Ins", team2_name],
      "Long Balls1": combined_df.at["Long Balls", team1_name],
      "Long Balls2": combined_df.at["Long Balls", team2_name]
  }

  # Combine match_info with stats_row
  match_row = pd.DataFrame(match_info, index=[0])

  # Append the row to the larger dataset using pd.concat
  larger_dataset = pd.concat([larger_dataset, match_row], ignore_index=True)
  time.sleep(1)



0
https://fbref.comhttps://fbref.comhttps://fbref.com/en/matches/3a6836b4/Burnley-Manchester-City-August-11-2023-Premier-League


ConnectionError: HTTPSConnectionPool(host='fbref.comhttps', port=443): Max retries exceeded with url: //fbref.comhttps://fbref.com/en/matches/3a6836b4/Burnley-Manchester-City-August-11-2023-Premier-League (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7b52403a6d70>: Failed to resolve 'fbref.comhttps' ([Errno -2] Name or service not known)"))