In [21]:
# Loading and importing libraries
import json, time, datetime as dt
import csv
from pathlib import Path
import pandas as pd
import numpy as np

from time import sleep
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service  import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


In [22]:
# Where we'll store raw JSON and processed CSV
RAW_DIR = Path("../data/raw/sofascore")
PROC_DIR = Path("../data/processed")
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

print("Ready — raw:", RAW_DIR.resolve(), " processed:", PROC_DIR.resolve())

Ready — raw: C:\Users\TONNY\OneDrive\Desktop\DS\Football-Prediction\data\raw\sofascore  processed: C:\Users\TONNY\OneDrive\Desktop\DS\Football-Prediction\data\processed


In [23]:
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()), options = options)
print("Driver started:", type(driver))
driver.quit()


Driver started: <class 'selenium.webdriver.chrome.webdriver.WebDriver'>


In [24]:
# C. Fetch a single date with Selenium and show a preview
TEST_DATE = "2025-09-17"   # change if you want another date
url = f"https://www.sofascore.com/api/v1/sport/football/scheduled-events/{TEST_DATE}"

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

driver.get(url)
# wait for the JSON <pre> block (up to 8 seconds)
WebDriverWait(driver, 8).until(EC.presence_of_element_located((By.TAG_NAME, "pre")))

raw_json_text = driver.find_element(By.TAG_NAME, "pre").text
print("Preview (first 400 chars):\n", raw_json_text[:400])

# save raw for reproducibility
raw_path = RAW_DIR / f"Test-data-{TEST_DATE}.json"
with open(raw_path, "w", encoding="utf-8") as f:
    f.write(raw_json_text)
print("Saved raw JSON to:", raw_path)

driver.quit()


Preview (first 400 chars):
 {"events":[{"tournament":{"name":"UEFA Champions League","slug":"uefa-champions-league","category":{"id":1465,"country":{},"name":"Europe","slug":"europe","sport":{"name":"Football","slug":"football","id":1},"flag":"europe","fieldTranslations":{"nameTranslation":{"ar":"\u0623\u0648\u0631\u0648\u0628\u0627","hi":"\u092f\u0942\u0930\u094b\u092a","bn":"\u0987\u0989\u09b0\u09cb\u09aa"},"shortNameTrans
Saved raw JSON to: ..\data\raw\sofascore\Test-data-2025-09-17.json


In [25]:
# D. Load the raw JSON and inspect keys
import json
raw_path = RAW_DIR / f"Test-data-{TEST_DATE}.json"
with open(raw_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

data = json.loads(raw_text)
print("Top-level keys:", list(data.keys()))

# How many events and preview the first event keys
events = data.get("events", [])
print("Number of events:", len(events))
if events:
    first = events[0]
    print("First event keys (sample):", list(first.keys())[:50])
    # Drill into one nested mapping to find our fields:
    nested_keys = {
        "tournament": list(first.get("tournament", {}).keys()),
        "homeTeam": list(first.get("homeTeam", {}).keys()),
        "status": list(first.get("status", {}).keys()) if first.get("status") else []
    }
    print("Nested keys sample:", nested_keys)
else:
    print("No events found for that date.")


Top-level keys: ['events']
Number of events: 245
First event keys (sample): ['tournament', 'season', 'roundInfo', 'customId', 'status', 'winnerCode', 'homeTeam', 'awayTeam', 'homeScore', 'awayScore', 'coverage', 'time', 'changes', 'hasGlobalHighlights', 'hasXg', 'hasEventPlayerStatistics', 'hasEventPlayerHeatMap', 'detailId', 'crowdsourcingDataDisplayEnabled', 'id', 'varInProgress', 'slug', 'startTimestamp', 'finalResultOnly', 'feedLocked', 'isEditor']
Nested keys sample: {'tournament': ['name', 'slug', 'category', 'uniqueTournament', 'priority', 'id'], 'homeTeam': ['name', 'slug', 'shortName', 'gender', 'sport', 'userCount', 'nameCode', 'disabled', 'national', 'type', 'id', 'country', 'subTeams', 'teamColors', 'fieldTranslations'], 'status': ['code', 'description', 'type']}


In [26]:
# E. Extract core fields for one event
def safe_get(d, *keys):
    x = d
    for k in keys:
        if x is None: return None
        x = x.get(k)
    return x

def event_to_row(e):
    start_ts = safe_get(e, "startTimestamp")
    dt_utc = pd.to_datetime(start_ts, unit="s", utc=True) if start_ts else None
    kickoff_nairobi = dt_utc.tz_convert("Africa/Nairobi") if dt_utc is not None else None

    return {
        "match_id": safe_get(e, "id"),
        "match_custom_id": safe_get(e, "customId"),
        "date_utc": dt_utc,
        "kickoff_nairobi": kickoff_nairobi,
        "league": safe_get(e, "tournament", "name"),
        "league_id": safe_get(e, "tournament", "id"),
        "home_team": safe_get(e, "homeTeam", "name"),
        "home_team_id": safe_get(e, "homeTeam", "id"),
        "away_team": safe_get(e, "awayTeam", "name"),
        "away_team_id": safe_get(e, "awayTeam", "id"),
        "status_description": safe_get(e, "status", "description"),
        "status_type": safe_get(e, "status", "type"),
        "home_score": safe_get(e, "homeScore", "current"),
        "away_score": safe_get(e, "awayScore", "current"),
    }

# test the function on the first event
if events:
    row = event_to_row(events[0])
    print(row)


{'match_id': 14566909, 'match_custom_id': 'RsAgb', 'date_utc': Timestamp('2025-09-16 16:45:00+0000', tz='UTC'), 'kickoff_nairobi': Timestamp('2025-09-16 19:45:00+0300', tz='Africa/Nairobi'), 'league': 'UEFA Champions League', 'league_id': 138314, 'home_team': 'Athletic Club', 'home_team_id': 2825, 'away_team': 'Arsenal', 'away_team_id': 42, 'status_description': 'Ended', 'status_type': 'finished', 'home_score': 0, 'away_score': 2}


In [27]:
# F. Build DataFrame and derive result
rows = [event_to_row(e) for e in events]
df = pd.DataFrame(rows)

# Convert datetimes for CSV readability
if not df.empty and df["date_utc"].notna().any():
    df["date_utc_iso"] = df["date_utc"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    df["kickoff_nairobi_str"] = df["kickoff_nairobi"].dt.strftime("%Y-%m-%d %H:%M:%S%z")

def derive_result(hs, as_):
    try:
        if hs is None or as_ is None: return None
        hs = int(hs); as_ = int(as_)
        if hs > as_: return "Home"
        if hs == as_: return "Draw"
        return "Away"
    except:
        return None

df["match_result"] = df.apply(lambda r: derive_result(r["home_score"], r["away_score"]), axis=1)

df.head(20)


Unnamed: 0,match_id,match_custom_id,date_utc,kickoff_nairobi,league,league_id,home_team,home_team_id,away_team,away_team_id,status_description,status_type,home_score,away_score,date_utc_iso,kickoff_nairobi_str,match_result
0,14566909,RsAgb,2025-09-16 16:45:00+00:00,2025-09-16 19:45:00+03:00,UEFA Champions League,138314,Athletic Club,2825,Arsenal,42,Ended,finished,0.0,2.0,2025-09-16T16:45:00Z,2025-09-16 19:45:00+0300,Away
1,14566894,cjbskXb,2025-09-16 16:45:00+00:00,2025-09-16 19:45:00+03:00,UEFA Champions League,138314,PSV Eindhoven,2952,Royale Union Saint-Gilloise,4860,Ended,finished,1.0,3.0,2025-09-16T16:45:00Z,2025-09-16 19:45:00+0300,Away
2,14566738,gkbsmuc,2025-09-16 19:00:00+00:00,2025-09-16 22:00:00+03:00,UEFA Champions League,138314,Benfica,3006,Qarabağ,5962,Ended,finished,2.0,3.0,2025-09-16T19:00:00Z,2025-09-16 22:00:00+0300,Away
3,14566764,ydbsMdb,2025-09-16 19:00:00+00:00,2025-09-16 22:00:00+03:00,UEFA Champions League,138314,Juventus,2687,Borussia Dortmund,2673,Ended,finished,4.0,4.0,2025-09-16T19:00:00Z,2025-09-16 22:00:00+0300,Draw
4,14566597,QHsEgb,2025-09-16 19:00:00+00:00,2025-09-16 22:00:00+03:00,UEFA Champions League,138314,Real Madrid,2829,Olympique de Marseille,1641,Ended,finished,2.0,1.0,2025-09-16T19:00:00Z,2025-09-16 22:00:00+0300,Home
5,14566827,Isugb,2025-09-16 19:00:00+00:00,2025-09-16 22:00:00+03:00,UEFA Champions League,138314,Tottenham Hotspur,33,Villarreal,2819,Ended,finished,1.0,0.0,2025-09-16T19:00:00Z,2025-09-16 22:00:00+0300,Home
6,14566868,VobsBHtb,2025-09-17 16:45:00+00:00,2025-09-17 19:45:00+03:00,UEFA Champions League,138314,Olympiacos FC,3245,Pafos FC,171626,Ended,finished,0.0,0.0,2025-09-17T16:45:00Z,2025-09-17 19:45:00+0300,Draw
7,14566884,gnsqU,2025-09-17 16:45:00+00:00,2025-09-17 19:45:00+03:00,UEFA Champions League,138314,SK Slavia Praha,2216,Bodø/Glimt,656,Ended,finished,2.0,2.0,2025-09-17T16:45:00Z,2025-09-17 19:45:00+0300,Draw
8,14566873,Xdbsdjb,2025-09-17 19:00:00+00:00,2025-09-17 22:00:00+03:00,UEFA Champions League,138314,AFC Ajax,2953,Inter,2697,Ended,finished,0.0,2.0,2025-09-17T19:00:00Z,2025-09-17 22:00:00+0300,Away
9,14566570,Nsxdb,2025-09-17 19:00:00+00:00,2025-09-17 22:00:00+03:00,UEFA Champions League,138314,FC Bayern München,2672,Chelsea,38,Ended,finished,3.0,1.0,2025-09-17T19:00:00Z,2025-09-17 22:00:00+0300,Home


In [28]:
# G. Save the processed CSV for this test date
out_csv = RAW_DIR / f"test_data_{TEST_DATE}.csv"
df.to_csv(out_csv, index=False)
print("Saved processed CSV to:", out_csv)


Saved processed CSV to: ..\data\raw\sofascore\test_data_2025-09-17.csv


# Getting all football data

In [29]:
# Setup driver
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# Date range
start = date(2025, 8, 15)
end = date(2025, 9, 23)

all_matches_data = []

# Helper functions
# Getting keys
def safe_get(d, *keys):
    x = d
    for k in keys:
        if x is None:
            return None
        x = x.get(k)
    return x

# 
def event_to_row(e):
    start_ts = safe_get(e, "startTimestamp")
    dt_utc = pd.to_datetime(start_ts, unit="s", utc=True) if start_ts else None
    kickoff_nairobi = dt_utc.tz_convert("Africa/Nairobi") if dt_utc is not None else None

    winner_code = safe_get(e, "winnerCode")
    # Map Sofascore winnerCode to human-readable result
    if winner_code == 1:
        match_result = "Home"
    elif winner_code == 2:
        match_result = "Away"
    elif winner_code == 3:
        match_result = "Draw"
    else:
        match_result = None  # not decided yet / future game

    return {
        "match_id": safe_get(e, "id"),
        "match_custom_id": safe_get(e, "customId"),
        "date_utc": dt_utc,
        "kickoff_nairobi": kickoff_nairobi,
        "league": safe_get(e, "tournament", "name"),
        "league_id": safe_get(e, "tournament", "id"),
        "country": safe_get(e, "tournament", "category", "name"),

        "home_team": safe_get(e, "homeTeam", "name"),
        "home_team_id": safe_get(e, "homeTeam", "id"),
        "away_team": safe_get(e, "awayTeam", "name"),
        "away_team_id": safe_get(e, "awayTeam", "id"),

        "status_description": safe_get(e, "status", "description"),
        "status_type": safe_get(e, "status", "type"),

        "home_score": safe_get(e, "homeScore", "current"),
        "away_score": safe_get(e, "awayScore", "current"),

        "winner_code": winner_code,
        "match_result": match_result,   # Home / Away / Draw

        "season_name": safe_get(e, "season", "name"),
        "round": safe_get(e, "roundInfo", "round"),
        "has_xg": safe_get(e, "hasXg")
    }

# Loop through days
current = start
while current <= end:
    url = f"https://www.sofascore.com/api/v1/sport/football/scheduled-events/{current}"
    driver.get(url)
    sleep(1)
    raw_json = driver.find_element("tag name", "pre").text
    data = json.loads(raw_json)

    for event in data.get("events", []):
        all_matches_data.append(event_to_row(event))

    current += timedelta(days=1)

driver.quit()

# --- Save to CSV ---
if all_matches_data:
    with open("../data/raw/sofascore/matches_base.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=all_matches_data[0].keys())
        writer.writeheader()
        writer.writerows(all_matches_data)

print(f"Saved {len(all_matches_data)} matches")

Saved 14972 matches


In [30]:
df = pd.DataFrame(all_matches_data)
df.head()

Unnamed: 0,match_id,match_custom_id,date_utc,kickoff_nairobi,league,league_id,country,home_team,home_team_id,away_team,away_team_id,status_description,status_type,home_score,away_score,winner_code,match_result,season_name,round,has_xg
0,14025013,Uskb,2025-08-15 19:00:00+00:00,2025-08-15 22:00:00+03:00,Premier League,1,England,Liverpool,44,Bournemouth,60,Ended,finished,4.0,2.0,1.0,Home,Premier League 25/26,1.0,True
1,14025016,OP,2025-08-16 11:30:00+00:00,2025-08-16 14:30:00+03:00,Premier League,1,England,Aston Villa,40,Newcastle United,39,Ended,finished,0.0,0.0,3.0,Draw,Premier League 25/26,1.0,True
2,14082854,tgbsoKj,2025-08-15 17:00:00+00:00,2025-08-15 20:00:00+03:00,LaLiga,36,Spain,Girona FC,24264,Rayo Vallecano,2818,Ended,finished,1.0,3.0,2.0,Away,LaLiga 25/26,1.0,True
3,14082858,ugbsbhb,2025-08-15 19:30:00+00:00,2025-08-15 22:30:00+03:00,LaLiga,36,Spain,Villarreal,2819,Real Oviedo,2851,Ended,finished,2.0,0.0,1.0,Home,LaLiga 25/26,1.0,True
4,14061188,QHsiI,2025-08-15 18:45:00+00:00,2025-08-15 21:45:00+03:00,Ligue 1,4,France,Stade Rennais,1658,Olympique de Marseille,1641,Ended,finished,1.0,0.0,1.0,Home,Ligue 1 25/26,1.0,True


In [31]:
df["league"].value_counts().head(10)

league
Premier League                   562
Ligue 1                          256
Club Friendly Games              254
League Two                       229
League One                       229
National League                  228
Pro League                       208
Championship                     189
Turkiye Kupasi, Qualification    176
Premiership                      167
Name: count, dtype: int64

In [32]:
df.columns

Index(['match_id', 'match_custom_id', 'date_utc', 'kickoff_nairobi', 'league',
       'league_id', 'country', 'home_team', 'home_team_id', 'away_team',
       'away_team_id', 'status_description', 'status_type', 'home_score',
       'away_score', 'winner_code', 'match_result', 'season_name', 'round',
       'has_xg'],
      dtype='object')

So far we have the meta data for our data ,ostly now we look for the statistical data still from sofascore and we will go column for column 

### 1. Deriving home and away team position.

Since ther is no endpoint on sofascore for the position of the team when the game was played we will have to derive it from the data we have and use meta data such as match id and league id then calculate the positions

In [33]:
# 1st we need to convert date into pandas datetime so we can sortthem out easily
df["date_utc"] = pd.to_datetime(df["date_utc"], utc = True)
df["date_utc"].head()

0   2025-08-15 19:00:00+00:00
1   2025-08-16 11:30:00+00:00
2   2025-08-15 17:00:00+00:00
3   2025-08-15 19:30:00+00:00
4   2025-08-15 18:45:00+00:00
Name: date_utc, dtype: datetime64[ns, UTC]