In [1]:
import json
import time

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

from pydantic import BaseModel
from typing import List, Optional

from selenium import webdriver

from supabase import create_client, Client

In [2]:
driver = webdriver.Chrome()

In [3]:
class MatchEvent(BaseModel):
    id: int
    event_id: int                       
    minute: int                         
    second: Optional[float] = None                       
    team_id: int                        
    player_id: int                    
    x: float                            
    y: float                            
    end_x: Optional[float] = None                        
    end_y: Optional[float] = None                        
    qualifiers: list[dict]                    
    is_touch: bool                        
    blocked_x: Optional[float] = None                    
    blocked_y: Optional[float] = None                    
    goal_mouth_z: Optional[float] = None                 
    goal_mouth_y: Optional[float] = None                 
    is_shot: bool                        
    card_type: bool                      
    is_goal: bool                        
    type_display_name: str             
    outcome_type_display_name: str     
    period_display_name: str    

In [4]:
def insert_match_events(df, supabase):
    events = [
        MatchEvent(**x).dict()
        for x in df.to_dict(orient='records')
    ]
    execution = supabase.table('match_events').upsert(events).execute()

In [5]:
class Player(BaseModel):
    player_id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int

In [6]:
def insert_players(team_info, supabase):
    players = []
    for team in team_info:
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['team_id'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age'],
        })
    execution = supabase.table('players').upsert(players).execute()

In [7]:
project_url = "https://emqugrtdqwgesnybxyqw.supabase.co"
api_key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImVtcXVncnRkcXdnZXNueWJ4eXF3Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTc4NTk1MzEsImV4cCI6MjA3MzQzNTUzMX0.46Fd9Qs2kM5Qr5930cygp8W-orxceRffL8aIboBdeq4"
supabase_password = 'UgMyWV4M6Wj2feD6'
supabase = create_client(project_url, api_key)

In [8]:
def scrape_match_events(whoscored_url, driver):
    driver.get(whoscored_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    df = df.rename(
    {
        'eventId': 'event_id',
        'expandedMinute': 'expanded_minute',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    },
        axis=1
    )
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    if 'is_goal' not in df.columns:
        print('missing goals')
        df['is_goal'] = False

    if 'is_card' not in df.columns:
        df['is_card'] = False
        df['card_type'] = False
    
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name',
        'period_display_name'
    ]]
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype(int)
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    for column in df.columns:
        if df[column].dtype == np.float64 or df[column].dtype == np.float32:
            df[column] = np.where(
                np.isnan(df[column]),
                None,
                df[column]
            )
            
    insert_match_events(df, supabase)

    team_info = []
    team_info.append({
        'team_id': matchdict['home']['teamId'],
        'name': matchdict['home']['name'],
        'country_name': matchdict['home']['countryName'],
        'manager_name': matchdict['home']['managerName'],
        'players': matchdict['home']['players'],
    })

    team_info.append({
        'team_id': matchdict['away']['teamId'],
        'name': matchdict['away']['name'],
        'country_name': matchdict['away']['countryName'],
        'manager_name': matchdict['away']['managerName'],
        'players': matchdict['away']['players'],
    })
    
    insert_players(team_info, supabase)

    return print('Success')

In [9]:
driver.get('https://www.whoscored.com/teams/65/fixtures/spain-barcelona')

In [10]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [11]:
# parsing through the game urls, *= means "like"
all_urls = soup.select('a[href*="\/live\/"]')

In [12]:
all_urls = list(set([
    'https://www.whoscored.com' + x.attrs['href']
    for x in all_urls
]))

In [13]:
for url in all_urls:
    scrape_match_events(
        whoscored_url=url,
        driver=driver
    )
    time.sleep(2)

C:\Users\szerm\AppData\Local\Temp\ipykernel_20324\3158417592.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  MatchEvent(**x).dict()


Success


C:\Users\szerm\AppData\Local\Temp\ipykernel_20324\3158417592.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  MatchEvent(**x).dict()


Success


C:\Users\szerm\AppData\Local\Temp\ipykernel_20324\3158417592.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  MatchEvent(**x).dict()


Success


C:\Users\szerm\AppData\Local\Temp\ipykernel_20324\3158417592.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  MatchEvent(**x).dict()


Success


In [14]:
all_urls

['https://www.whoscored.com/matches/1913888/live/spain-laliga-2025-2026-levante-barcelona',
 'https://www.whoscored.com/matches/1913918/live/spain-laliga-2025-2026-mallorca-barcelona',
 'https://www.whoscored.com/matches/1913904/live/spain-laliga-2025-2026-rayo-vallecano-barcelona',
 'https://www.whoscored.com/matches/1913922/live/spain-laliga-2025-2026-barcelona-valencia']