**Code to scrape match data for the past 5 seasons of the 2. Bundesliga**

In [None]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Season labels, newest first: 2024, 2023, 2022, 2021, 2020.
years = [*range(2024, 2019, -1)]
# Collects one DataFrame per scraped team and season; concatenated later.
all_matches = list()

In [None]:
# Entry page for the scrape: the 2. Bundesliga stats page on FBref. Its first
# stats table supplies the per-team links that the scraping loop follows.
standings_url = (
    "https://fbref.com/en/comps/33/2-Bundesliga-Stats"
)

In [None]:
# Seconds to wait before EVERY HTTP request (standings, team and shooting pages).
# A 1-second pause placed only after successful teams led to HTTP 429
# (Too Many Requests) responses.
# NOTE(review): 6 s assumes a limit of roughly 10 requests per minute - confirm
# against the site's current bot policy and tune if needed.
REQUEST_DELAY_SECONDS = 6
# Upper bound for a single request so a stalled connection cannot hang the cell.
REQUEST_TIMEOUT_SECONDS = 30
FBREF_BASE_URL = "https://fbref.com"
# Columns taken from the shooting match log and merged onto the fixtures table.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch(url):
    """Wait ``REQUEST_DELAY_SECONDS``, GET ``url`` and return the response.

    The pause happens before the request so that every code path (success,
    failure, early ``continue``) is throttled the same way.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: on connection errors or timeouts.
    """
    time.sleep(REQUEST_DELAY_SECONDS)
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response


# Walk backwards through the seasons using a local URL so that `standings_url`
# itself is never overwritten; re-running this cell then starts from the same
# season again instead of continuing from an older one.
# NOTE(review): `year` is only a label taken from `years`; it is assumed that the
# page behind `standings_url` is the season ending in years[0] - confirm.
season_url = standings_url
for year in years:
    # A failure here is allowed to raise: without the standings page neither the
    # team links nor the previous-season link can be found.
    standings_page = _fetch(season_url)
    standings_soup = BeautifulSoup(standings_page.text, "html.parser")

    # The first stats table on the page holds the links to the team pages.
    standings_table = standings_soup.select("table.stats_table")[0]
    hrefs = [anchor.get("href") for anchor in standings_table.find_all("a")]
    # Keep only team links; anchors without an href yield None and are skipped.
    team_urls = [f"{FBREF_BASE_URL}{href}" for href in hrefs if href and "/squads/" in href]

    # Remember the link to the previous season before the team pages are parsed.
    prev_link = standings_soup.select_one("a.prev")
    previous_season = prev_link.get("href") if prev_link is not None else None

    for team_url in team_urls:
        # Derive a readable team name from the last URL segment, e.g. "St-Pauli-Stats".
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            team_page = _fetch(team_url)
            # Wrap the HTML in StringIO: passing literal HTML to read_html is deprecated.
            matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

            # Locate the link to the team's all-competitions shooting match log.
            team_soup = BeautifulSoup(team_page.text, "html.parser")
            team_hrefs = [anchor.get("href") for anchor in team_soup.find_all("a")]
            shooting_links = [href for href in team_hrefs if href and "all_comps/shooting/" in href]
            if not shooting_links:
                print(f"No shooting link found for {team_url}; skipping.")
                continue

            shooting_page = _fetch(f"{FBREF_BASE_URL}{shooting_links[0]}")
            shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
            # The shooting table has a two-level header; keep only the lower level.
            shooting.columns = shooting.columns.droplevel()

            # Combine fixtures and shooting stats per match date; a team whose
            # tables cannot be merged is skipped, but no longer silently.
            try:
                team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
            except ValueError as merge_err:
                print(f"Could not merge shooting stats for {team_url}: {merge_err}")
                continue

            # Keep league matches only. `.copy()` makes the filtered frame
            # independent so the column assignments below do not trigger
            # SettingWithCopyWarning.
            team_data = team_data[team_data["Comp"] == "2. Bundesliga"].copy()
            team_data["Season"] = year      # season label, not present on the page tables
            team_data["Team"] = team_name   # team name, not present on the page tables
            all_matches.append(team_data)
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred for {team_url}: {http_err}")
        except Exception as e:
            # Best effort: report the problem and move on to the next team.
            print(f"An error occurred for {team_url}: {e}")

    if not previous_season:
        print(f"No previous-season link found on {season_url}; stopping after season {year}.")
        break
    season_url = f"{FBREF_BASE_URL}{previous_season}"

HTTP error occurred for https://fbref.com/en/squads/3ce4e72c/2019-2020/Osnabruck-Stats: 429 Client Error: Too Many Requests for url: https://fbref.com/en/squads/3ce4e72c/2019-2020/matchlogs/all_comps/shooting/Osnabruck-Match-Logs-All-Competitions
HTTP error occurred for https://fbref.com/en/squads/54864664/2019-2020/St-Pauli-Stats: 429 Client Error: Too Many Requests for url: https://fbref.com/en/squads/54864664/2019-2020/St-Pauli-Stats
HTTP error occurred for https://fbref.com/en/squads/33ba9d7b/2019-2020/Karlsruher-Stats: 429 Client Error: Too Many Requests for url: https://fbref.com/en/squads/33ba9d7b/2019-2020/Karlsruher-Stats
HTTP error occurred for https://fbref.com/en/squads/6f2c108c/2019-2020/Nurnberg-Stats: 429 Client Error: Too Many Requests for url: https://fbref.com/en/squads/6f2c108c/2019-2020/Nurnberg-Stats
HTTP error occurred for https://fbref.com/en/squads/432f2430/2019-2020/Wehen-Wiesbaden-Stats: 429 Client Error: Too Many Requests for url: https://fbref.com/en/squads/

In [None]:
# Combine the per-team DataFrames into a single DataFrame.
# ignore_index=True gives the result a fresh 0..N-1 index; without it every
# team's frame keeps its own row labels, so the combined index contains
# duplicates and label-based lookups return several rows.
match_df = pd.concat(all_matches, ignore_index=True)

In [None]:
# Normalise every column name to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [None]:
# Display the combined frame as the cell's output.
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-07-29,13:00,2. Bundesliga,Matchweek 1,Sat,Away,W,2,1,Kaiserslautern,...,Match Report,,17,4,20.2,0.0,1,1,2024,St Pauli
1,2023-08-05,13:00,2. Bundesliga,Matchweek 2,Sat,Home,D,0,0,Düsseldorf,...,Match Report,,13,5,18.2,1.0,0,0,2024,St Pauli
3,2023-08-19,13:00,2. Bundesliga,Matchweek 3,Sat,Away,D,0,0,Greuther Fürth,...,Match Report,,7,1,18.2,1.0,0,0,2024,St Pauli
4,2023-08-27,13:30,2. Bundesliga,Matchweek 4,Sun,Home,D,0,0,Magdeburg,...,Match Report,,28,12,17.6,0.0,0,0,2024,St Pauli
5,2023-09-01,18:30,2. Bundesliga,Matchweek 5,Fri,Away,D,1,1,Braunschweig,...,Match Report,,19,4,19.0,0.0,0,0,2024,St Pauli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,2020-06-06,13:00,2. Bundesliga,Matchweek 30,Sat,Home,W,3,0,Darmstadt 98,...,Match Report,,7,3,18.8,0.0,0,0,2020,Jahn Regensburg
31,2020-06-13,13:00,2. Bundesliga,Matchweek 31,Sat,Away,L,1,4,Heidenheim,...,Match Report,,14,4,14.4,2.0,0,0,2020,Jahn Regensburg
32,2020-06-17,18:30,2. Bundesliga,Matchweek 32,Wed,Home,W,2,1,Karlsruher,...,Match Report,,9,6,19.4,0.0,0,0,2020,Jahn Regensburg
33,2020-06-21,15:30,2. Bundesliga,Matchweek 33,Sun,Away,D,1,1,St. Pauli,...,Match Report,,6,2,20.0,1.0,0,0,2020,Jahn Regensburg


In [None]:
# Persist the combined match data (the index is written as the first column).
output_path = "matches.csv"
match_df.to_csv(output_path)