In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
BASE_URL = "https://www.worldfootball.net/{feature}/eng-premier-league-{season}/"
SEASONS = ['2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']

In [5]:
def getRound(match_soup):
    round_text = match_soup.find('div', class_='breadcrumb').find('h1').get_text()
    round_info = round_text.split('»')[1].split('.')[0].strip()
    return round_info

def getCoaches(coach_table):
    coaches = [b.get_text() for b in coach_table.find_all('b')]
    home_coach = coaches[0].split(':')[1].strip()
    away_coach = coaches[1].split(':')[1].strip()
    return home_coach, away_coach

def getLineup(team_table):
    players = [a.get_text() for a in team_table.find_all('a')]
    return players

def getTeamName(info_table):
    team_names = [a.get_text() for a in info_table.find_all('a') if 'teams' in a['href']]
    h_name, a_name = team_names[0], team_names[1]
    return h_name, a_name

def writeProgress(url):
    with open('progress.txt', 'a') as f:
        f.write(url + '\n')

In [None]:
def crawl():
    match_data = []
    player_data = []
    
    for season in SEASONS:
        season_url = BASE_URL.format(season=season, feature='all_matches')
        print(f"Crawling season: {season} - URL: {season_url}")
        
        response = requests.get(season_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'class': 'standard_tabelle'})
        
        match_links = [a['href'] for a in table.find_all('a') if 'report' in a['href'] and '(' in a.get_text()]
        match_urls = [BASE_URL.format(season=season, feature='report') + url for url in match_links]
        
        for match_url in match_urls:
            print(f"Crawling match: {match_url}")
            writeProgress(match_url)
            
            match_response = requests.get(match_url)
            match_soup = BeautifulSoup(match_response.text, 'html.parser')
            
            match_round = getRound(match_soup)
            match_tables = match_soup.find_all('table', {'class': 'standard_tabelle'})

            if len(match_tables) == 6:
                info_table, home_table, away_table, coach_table = match_tables[0], match_tables[2], match_tables[3], match_tables[4]
            else:
                info_table, home_table, away_table, coach_table = match_tables[0], match_tables[3], match_tables[4], match_tables[5]

            # Get coach
            home_coach, away_coach = getCoaches(coach_table)

            # Get lineup
            home_lineup = getLineup(home_table)
            away_lineup = getLineup(away_table)
            
            # Get team name
            home_team, away_team = getTeamName(info_table)
            print(home_team, away_team)
            match_id = f"{season}_{len(match_data) + 1}"
            match_data.append({
                "Match ID": match_id,
                "Season": season,
                "Round": match_round,
                "Home Team": home_team,
                "Away Team": away_team,
                "Home Coach": home_coach,
                "Away Coach": away_coach,
            })
            for player in home_lineup:
                player_data.append({
                    "Match ID": match_id,
                    "Team": home_team,
                    "Player Name": player,
                    "Is Home": 1
                })
            for player in away_lineup:
                player_data.append({
                    "Match ID": match_id,
                    "Team": away_team,
                    "Player Name": player,
                    "Is Home": 0
                })
            time.sleep(1)
            print(player_data)
            print(match_data)
            break
    #     print(f"Finished crawling season: {season}")

    matches_df = pd.DataFrame(match_data)
    players_df = pd.DataFrame(player_data)
    matches_df.to_csv('../data/raw/worldfb/worldfb_matches.csv', index=False)
    players_df.to_csv('../data/raw/worldfb/worldfb_players.csv', index=False)
    print("Data saved successfully.")

In [11]:
crawl()

Crawling season: 2021-2022 - URL: https://www.worldfootball.net/all_matches/eng-premier-league-2021-2022/
Crawling match: https://www.worldfootball.net/report/eng-premier-league-2021-2022//report/premier-league-2021-2022-brentford-fc-arsenal-fc/
Brentford FC Arsenal FC
[{'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': ' David Raya', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Rico Henry', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Ethan Pinnock', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Pontus Jansson', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Kristoffer Ajer', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Christian Nørgaard', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team': 'Brentford FC', 'Player Name': 'Frank Onyeka', 'Is Home': 1}, {'Match ID': '2021-2022_1', 'Team'

In [12]:
test = pd.read_csv('../data/merged_gw.csv')
test['starts'].value_counts()

starts
0    21365
1     8360
Name: count, dtype: int64

In [3]:
test.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,Femi Seriki,DEF,Sheffield Utd,0.5,0,0,0,0,0.0,653,...,0,0.0,0,0,0,0,40,True,0,1
1,Jack Hinshelwood,MID,Brighton,1.5,0,0,0,0,0.0,621,...,4,0.0,0,0,0,0,45,True,0,1
2,Jadon Sancho,MID,Man Utd,3.0,0,0,4,0,11.3,397,...,1,8.0,1,0,0,0,70,True,0,1
3,Rhys Norrington-Davies,DEF,Sheffield Utd,0.1,0,0,0,0,0.0,487,...,0,0.0,0,0,0,0,40,True,0,1
4,Vitaly Janelt,MID,Brentford,2.1,0,0,6,0,11.5,105,...,2,17.0,2,0,0,0,55,True,0,1


In [3]:
new_test = pd.read_csv('../data/gw1.csv')
new_test.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards
0,Alex Scott,MID,Bournemouth,1.6,0,0,11,0,12.8,77,...,1,1,0.0,2,0,0,0,50,False,0
1,Carlos Miguel dos Santos Pereira,GK,Nott'm Forest,2.2,0,0,0,0,0.0,427,...,1,1,0.0,0,0,0,0,45,True,0
2,Tomiyasu Takehiro,DEF,Arsenal,0.0,0,0,0,0,0.0,22,...,0,2,0.0,0,0,0,0,50,True,0
3,Malcolm Ebiowei,MID,Crystal Palace,0.0,0,0,0,0,0.0,197,...,1,2,0.0,0,0,0,0,45,False,0
4,Ben Brereton Díaz,MID,Southampton,1.0,0,0,-2,0,14.0,584,...,0,1,16.0,1,0,0,0,55,False,1


In [10]:
new_test[(new_test['minutes'] >=45) & (new_test['starts'] == 0)]

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards
112,Phil Foden,MID,Man City,4.2,0,0,6,0,3.7,348,...,2,0,0.0,1,0,0,0,95,False,0
200,Samuel Edozie,MID,Southampton,0.6,0,0,0,0,18.5,459,...,0,1,16.0,0,0,0,0,50,False,1
273,Odsonne Edouard,FWD,Crystal Palace,1.7,0,0,1,0,10.7,198,...,1,2,5.0,1,0,0,0,55,False,0
310,Ibrahima Konaté,DEF,Liverpool,3.4,0,0,12,0,0.3,326,...,2,0,0.0,1,0,0,0,50,False,0
416,Simon Adingra,MID,Brighton,2.0,0,0,24,0,6.5,113,...,3,0,26.0,6,0,0,0,55,False,0
553,Ryan Yates,MID,Nott'm Forest,1.6,1,2,29,0,16.9,449,...,1,1,36.0,7,0,0,0,50,True,0
600,Emil Krafth,DEF,Newcastle,2.9,0,0,21,1,0.0,405,...,0,1,6.0,6,0,0,0,45,True,0
