# WEB SCRAPING DATA

In [1]:
#Import the necessary libraries
import json
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Web scraping required data from our website and storing it in a csv file using the get_csv function
def get_csv(url, year, fname, cols, table_idx=0, timeout=30):
    """Scrape one 'btable' HTML table from a page and save it as a CSV file.

    The page at ``url`` is downloaded, every ``<table id="btable">`` element is
    collected, and the table at position ``table_idx`` is turned into a
    DataFrame with the column names given in ``cols``. The frame is written to
    ``../data/raw/{fname}_{year}.csv`` (index column included), displayed in
    the notebook, and returned.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    year : str
        Label appended to the output file name.
    fname : str
        Prefix of the output file name.
    cols : list of str
        Column names for the resulting DataFrame; expected to supply one name
        per cell of a table row.
    table_idx : int, default 0
        Position of the wanted table among the 'btable' tables on the page.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame or None
        The scraped table, or ``None`` when the page could not be retrieved,
        contains no 'btable' table, or ``table_idx`` is out of range (a message
        is printed in each of those cases).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    # Parse the HTML content of the page and collect all tables with ID 'btable'
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', {'id': 'btable'})
    if not tables:
        print("No tables with ID 'btable' found on the page.")
        return None

    # Report a bad index the same way as the other failures instead of
    # crashing with a bare IndexError (negative indices stay valid).
    if not -len(tables) <= table_idx < len(tables):
        print(f"Table index {table_idx} is out of range: the page has {len(tables)} 'btable' table(s).")
        return None

    # The first <tr> is treated as the header row and skipped; names come from `cols`.
    rows = tables[table_idx].find_all('tr')
    records = []
    for row in rows[1:]:
        cells = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
        # Only rows without any cells are dropped; rows whose cells are all
        # blank are kept so the raw file mirrors the page layout.
        if cells:
            records.append(cells)

    df = pd.DataFrame(records, columns=cols)

    # Make sure the target directory exists so a fresh checkout can run the notebook.
    out_path = Path(f'../data/raw/{fname}_{year}.csv')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path)

    display(df)
    return df

In [3]:
# Scrape the match-by-match results page once per season and save each one as
# all_matches_<year>.csv. Each pair is (league suffix used in the URL, year label
# used in the file name); the empty suffix appears to select the season in
# progress, since its saved output still contains unplayed fixtures.
match_cols = ['date', 'home_team', 'score', 'away_team', 'stats', 'half_time', '2.5+', 'total_goals', 'btts']
for suffix, year in [("", '2023'), ("_2023", '2022'), ("_2022", '2021')]:
    results_url = f"https://www.soccerstats.com/results.asp?league=england{suffix}&pmtype=bydate"
    get_csv(results_url, year, "all_matches", cols=match_cols)


Unnamed: 0,date,home_team,score,away_team,stats,half_time,2.5+,total_goals,btts
0,,,,,,,,,
1,Fr 11 Aug,Burnley,0 - 3,Manchester City,stats,(0-2),+,3,-
2,,,,,,,,,
3,Sa 12 Aug,Arsenal,2 - 1,Nottm Forest,stats,(2-0),+,3,+
4,Sa 12 Aug,Bournemouth,1 - 1,West Ham Utd,stats,(0-0),-,2,+
...,...,...,...,...,...,...,...,...,...
460,Su 19 May,Crystal Palace,15:00,Aston Villa,h2h,,,,
461,Su 19 May,Liverpool,15:00,Wolverhampton,h2h,,,,
462,Su 19 May,Luton Town,15:00,Fulham,h2h,,,,
463,Su 19 May,Manchester City,15:00,West Ham Utd,h2h,,,,


Unnamed: 0,date,home_team,score,away_team,stats,half_time,2.5+,total_goals,btts
0,,,,,,,,,
1,Fr 5 Aug,Crystal Palace,0 - 2,Arsenal,stats,(0-1),-,2,-
2,,,,,,,,,
3,Sa 6 Aug,Fulham,2 - 2,Liverpool,stats,(1-0),+,4,+
4,Sa 6 Aug,Bournemouth,2 - 0,Aston Villa,stats,(1-0),-,2,-
...,...,...,...,...,...,...,...,...,...
492,Su 28 May,Everton,1 - 0,Bournemouth,stats,(0-0),-,1,-
493,Su 28 May,Leeds Utd,1 - 4,Tottenham,stats,(0-1),+,5,+
494,Su 28 May,Leicester City,2 - 1,West Ham Utd,stats,(1-0),+,3,+
495,Su 28 May,Manchester Utd,2 - 1,Fulham,stats,(1-1),+,3,+


Unnamed: 0,date,home_team,score,away_team,stats,half_time,2.5+,total_goals,btts
0,,,,,,,,,
1,Fr 13 Aug,Brentford,2 - 0,Arsenal,stats,(1-0),-,2,-
2,,,,,,,,,
3,Sa 14 Aug,Manchester Utd,5 - 1,Leeds Utd,stats,(1-0),+,6,+
4,Sa 14 Aug,Burnley,1 - 2,Brighton,stats,(1-0),+,3,+
...,...,...,...,...,...,...,...,...,...
498,Su 22 May,Crystal Palace,1 - 0,Manchester Utd,stats,(1-0),-,1,-
499,Su 22 May,Leicester City,4 - 1,Southampton,stats,(0-0),+,5,+
500,Su 22 May,Liverpool,3 - 1,Wolverhampton,stats,(1-1),+,4,+
501,Su 22 May,Manchester City,3 - 2,Aston Villa,stats,(0-1),+,5,+


In [4]:
# Scrape the home-form standings once per season and save each one as home_<year>.csv.
# Each pair is (league suffix used in the URL, year label used in the file name).
# NOTE(review): the saved outputs list only ranks 1-10 - confirm the first
# 'btable' really holds the whole standings table.
res = []
home_cols = ["No", "Name", "GP", "W", "D", "L", "GF", "GA", "GD", "Pts"]
for suffix, year in [("", '2023'), ("_2023", '2022'), ("_2022", '2021')]:
    home_url = f"https://www.soccerstats.com/homeaway.asp?league=england{suffix}"
    # `res` is overwritten on every pass, so after the loop it holds only the
    # value returned for the last season in the list.
    res = get_csv(home_url, year, "home", cols=home_cols)



Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Liverpool,7,7,0,0,21,5,16,21
1,2,Newcastle Utd,8,7,0,1,19,4,15,21
2,3,Arsenal,8,6,2,0,20,8,12,20
3,4,Aston Villa,6,6,0,0,23,5,18,18
4,5,Manchester City,7,5,2,0,20,7,13,17
5,6,Brighton,7,3,3,1,15,10,5,12
6,7,Brentford,8,3,3,2,15,12,3,12
7,8,Tottenham,6,4,0,2,10,8,2,12
8,9,Manchester Utd,7,4,0,3,8,10,-2,12
9,10,West Ham Utd,7,3,2,2,12,10,2,11


Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Manchester City,19,17,1,1,60,17,43,52
1,2,Manchester Utd,19,15,3,1,36,10,26,48
2,3,Arsenal,19,14,3,2,53,25,28,45
3,4,Liverpool,19,13,5,1,46,17,29,44
4,5,Newcastle Utd,19,11,6,2,36,14,22,39
5,6,Aston Villa,19,12,2,5,33,21,12,38
6,7,Brentford,19,10,7,2,35,18,17,37
7,8,Tottenham,19,12,1,6,37,25,12,37
8,9,Brighton,19,10,4,5,37,21,16,34
9,10,Nottm Forest,19,8,6,5,27,24,3,30


Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Liverpool,19,15,4,0,49,9,40,49
1,2,Manchester City,19,15,2,2,58,15,43,47
2,3,Arsenal,19,13,2,4,35,17,18,41
3,4,Tottenham,19,13,1,5,38,19,19,40
4,5,Manchester Utd,19,10,5,4,32,22,10,35
5,6,Chelsea,19,9,7,3,37,22,15,34
6,7,Leicester City,19,10,4,5,34,23,11,34
7,8,West Ham Utd,19,9,5,5,33,26,7,32
8,9,Newcastle Utd,19,8,6,5,26,27,-1,30
9,10,Crystal Palace,19,7,8,4,27,17,10,29


In [5]:
# Scrape the away-form standings once per season and save each one as away_<year>.csv.
# Each pair is (league suffix used in the URL, year label used in the file name).
# The second 'btable' on the page (table_idx=1) is the one that gets stored.
away_cols = ["No", "Name", "GP", "W", "D", "L", "GF", "GA", "GD", "Pts"]
for suffix, year in [("", '2023'), ("_2023", '2022'), ("_2022", '2021')]:
    away_url = f"https://www.soccerstats.com/homeaway.asp?league=england{suffix}&stats=8"
    get_csv(away_url, year, "away", cols=away_cols, table_idx=1)

Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Tottenham,8,4,3,1,18,12,6,15
1,2,Manchester City,7,4,1,2,16,9,7,13
2,3,Arsenal,6,4,1,1,9,3,6,13
3,4,Everton,7,4,1,2,10,11,-1,13
4,5,Manchester Utd,7,4,0,3,8,7,1,12
5,6,Crystal Palace,8,3,2,3,8,11,-3,11
6,7,Aston Villa,8,3,2,3,10,15,-5,11
7,8,Chelsea,6,3,1,2,12,9,3,10
8,9,Liverpool,7,2,4,1,11,9,2,10
9,10,Brighton,7,3,1,3,15,16,-1,10


Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Arsenal,19,12,3,4,35,18,17,39
1,2,Manchester City,19,11,4,4,34,16,18,37
2,3,Newcastle Utd,19,8,8,3,32,19,13,32
3,4,Brighton,19,8,4,7,35,32,3,28
4,5,Manchester Utd,19,8,3,8,22,33,-11,27
5,6,Fulham,19,7,2,10,24,24,0,23
6,7,Liverpool,19,6,5,8,29,30,-1,23
7,8,Tottenham,19,6,5,8,33,38,-5,23
8,9,Aston Villa,19,6,5,8,18,25,-7,23
9,10,Brentford,19,5,7,7,23,28,-5,22


Unnamed: 0,No,Name,GP,W,D,L,GF,GA,GD,Pts
0,1,Manchester City,19,14,4,1,41,11,30,46
1,2,Liverpool,19,13,4,2,45,17,28,43
2,3,Chelsea,19,12,4,3,39,11,28,40
3,4,Tottenham,19,9,4,6,31,21,10,31
4,5,Brighton,19,7,8,4,23,21,2,29
5,6,Arsenal,19,9,1,9,26,31,-5,28
6,7,Wolverhampton,19,8,3,8,18,18,0,27
7,8,West Ham Utd,19,7,3,9,27,25,2,24
8,9,Manchester Utd,19,6,5,8,25,35,-10,23
9,10,Aston Villa,19,7,1,11,23,25,-2,22
