In [1]:
# Environment setup (API-Sports removed)
from pathlib import Path
from typing import Optional

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except Exception:
    DOTENV_AVAILABLE = False

# Feature flags (only Wikipedia + Transfermarkt pipeline)
USE_APISPORTS = False  # deprecated; kept for compatibility but not used

# Helper to find project root
def _find_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

# Resolve project directories consistently
ROOT = _find_root()
DATA_DIR = ROOT / 'data' / 'raw'
INTERIM_DIR = ROOT / 'data' / 'interim'
PROCESSED_DIR = ROOT / 'data' / 'processed'
FIG_DIR = ROOT / 'reports' / 'figures'
for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\nüéØ Environment setup complete")
print(f"   ROOT: {ROOT}")
print(f"   DATA_DIR: {DATA_DIR}")


üéØ Environment setup complete
   ROOT: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks
   DATA_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw


In [2]:
# Helpers to make the notebook resilient across machines (kept)
from typing import Optional
import random
import time
from pathlib import Path
import requests

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
]

def find_repo_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

def ensure_environment():
    global ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR
    if 'ROOT' not in globals() or not isinstance(ROOT, Path) or not (ROOT / 'data').exists():
        root_guess = find_repo_root(Path.cwd())
        if not (root_guess / 'data').exists() and (root_guess.parent / 'data').exists():
            root_guess = root_guess.parent
        ROOT = root_guess
    DATA_DIR = ROOT / 'data' / 'raw'
    INTERIM_DIR = ROOT / 'data' / 'interim'
    PROCESSED_DIR = ROOT / 'data' / 'processed'
    FIG_DIR = ROOT / 'reports' / 'figures'
    for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
        d.mkdir(parents=True, exist_ok=True)
    return ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR


def http_get(url: str, headers: Optional[dict] = None, retries: int = 3, timeout: int = 30) -> str:
    last_err = None
    sess = requests.Session()
    for attempt in range(1, retries + 1):
        ua = random.choice(_USER_AGENTS)
        hdrs = {"User-Agent": ua, "Accept-Language": "en-US,en;q=0.9"}
        if headers:
            hdrs.update(headers)
        try:
            resp = sess.get(url, headers=hdrs, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            last_err = e
            time.sleep(0.8 * attempt)
    raise last_err  # type: ignore


def save_csv(df: 'pd.DataFrame', path: Path, **to_csv_kwargs):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, encoding=to_csv_kwargs.get('encoding', 'utf-8-sig'))
    print(f"Saved: {path}")



In [3]:
# Scrape multiple seasons of Ligat Ha'al from Wikipedia
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
import re
import time
from datetime import datetime

ensure_environment()

def scrape_season(season_year):
    """
    Scrape a single season's matches from Wikipedia.
    season_year: starting year (e.g., 2016 for 2016/17 season)
    """
    season_str = f"{season_year}/{str(season_year+1)[-2:]}"
    url = f"https://en.wikipedia.org/wiki/{season_year}%E2%80%93{str(season_year+1)[-2:]}_Israeli_Premier_League"
    
    print(f"Fetching {season_str}... ", end="", flush=True)
    try:
        html = http_get(url)
        soup = BeautifulSoup(html, "html.parser")

        
        # Find results matrix
        results_table = None
        for table in soup.find_all("table", class_="wikitable"):
            first_row = table.find("tr")
            if first_row:
                first_cell = first_row.find("th")
                if first_cell and ("Home \\ Away" in first_cell.text or "Home / Away" in first_cell.text):
                    results_table = table
                    break
        
        if not results_table:
            print("‚ùå (no results matrix)")
            return None
            
        # Parse teams and build matches
        rows = results_table.find_all("tr")
        team_names = [td.get_text(strip=True) for td in rows[0].find_all("th")][1:]
        
        matches = []
        for i, row in enumerate(rows[1:]):
            cells = row.find_all(["th", "td"])
            home_team = cells[0].get_text(strip=True)
            for j, cell in enumerate(cells[1:]):
                away_team = team_names[j]
                score = cell.get_text(strip=True)
                if re.match(r"^\d+\s*[‚Äì-]\s*\d+$", score):
                    home_goals, away_goals = re.split(r"[‚Äì-]", score)
                    matches.append({
                        "season": season_str,
                        "season_year": season_year,
                        "home_team": home_team,
                        "away_team": away_team,
                        "home_goals": int(home_goals.strip()),
                        "away_goals": int(away_goals.strip())
                    })
        
        if not matches:
            print("‚ùå (no matches found)")
            return None
            
        # Convert to DataFrame and add derived columns
        df = pd.DataFrame(matches)
        df['goal_diff'] = df['home_goals'] - df['away_goals']
        df['result'] = df['goal_diff'].apply(lambda x: "H" if x>0 else ("A" if x<0 else "D"))
        df['home_points'] = df['result'].map({"H":3, "D":1, "A":0}).fillna(0).astype(int)
        df['away_points'] = df['result'].map({"A":3, "D":1, "H":0}).fillna(0).astype(int)
        
        # Select and order columns
        keep_cols = ['season', 'season_year', 'home_team', 'away_team', 'home_goals', 
                     'away_goals', 'goal_diff', 'result', 'home_points', 'away_points']
        df = df[keep_cols]
        
        print(f"‚úì ({len(df)} matches)")
        return df
        
    except Exception as e:
        print(f"‚ùå ({str(e)[:50]}...)")
        return None

# List of seasons to scrape (last 20 seasons)
current_year = datetime.now().year
if datetime.now().month < 8:  # If before August, last season started in previous year
    current_year -= 1
seasons = list(range(current_year - 19, current_year + 1))

print(f"Scraping {len(seasons)} seasons from Wikipedia ({seasons[0]}/{str(seasons[0]+1)[-2:]} to {seasons[-1]}/{str(seasons[-1]+1)[-2:]})...")

# Scrape each season
all_matches = []
for season_year in seasons:
    df = scrape_season(season_year)
    if df is not None:
        # Save individual season
        season_path = DATA_DIR / f"matches_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_wikipedia.csv"
        save_csv(df, season_path)
        all_matches.append(df)
    time.sleep(1)  # Be nice to Wikipedia

if all_matches:
    # Combine all seasons
    combined_df = pd.concat(all_matches, ignore_index=True)
    combined_path = DATA_DIR / "matches_all_seasons_ligat_haal_wikipedia.csv"
    save_csv(combined_df, combined_path)
    
    print("\nSummary:")
    print(f"- Successfully scraped {len(all_matches)} seasons")
    print(f"- Total matches: {len(combined_df)}")
    print(f"\nMatches per season:")
    season_counts = combined_df.groupby('season').size().sort_index()
    for season, count in season_counts.items():
        print(f"  ‚Ä¢ {season}: {count:3d} matches")
    print(f"\nAll matches saved to: {combined_path}")
    display(combined_df.head())

Scraping 20 seasons from Wikipedia (2006/07 to 2025/26)...
Fetching 2006/07... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2006_07_ligat_haal_wikipedia.csv
Fetching 2007/08... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2007_08_ligat_haal_wikipedia.csv
Fetching 2008/09... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2008_09_ligat_haal_wikipedia.csv
Fetching 2009/10... ‚úì (239 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2009_10_ligat_haal_wikipedia.csv
Fetching 2010/11... ‚úì (234 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2010_11_ligat_haal_wikipedia.csv
Fetching 2011/12... ‚úì (240 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\noteb

Unnamed: 0,season,season_year,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2006/07,2006,Beitar Jerusalem,BnY,0,0,0,D,1,1
1,2006/07,2006,Beitar Jerusalem,ASH,2,0,2,H,3,0
2,2006/07,2006,Beitar Jerusalem,HAK,0,0,0,D,1,1
3,2006/07,2006,Beitar Jerusalem,HKS,2,0,2,H,3,0
4,2006/07,2006,Beitar Jerusalem,HPT,2,0,2,H,3,0


In [4]:
def scrape_transfermarkt_attendance(season_year: int) -> 'pd.DataFrame':
    """
    Scrape team attendance data from Transfermarkt for a given season.
    
    Args:
        season_year: Starting year of season (e.g., 2023 for 2023/24)
    
    Returns:
        DataFrame with columns: season, team, stadium, capacity, total_spectators, average_attendance
    """
    import pandas as pd
    from bs4 import BeautifulSoup
    import re
    
    url = f"https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1/saison_id/{season_year}"
    print(f"Scraping attendance from: {url}")
    
    try:
        html = http_get(url)
        soup = BeautifulSoup(html, "html.parser")
        
        # Find the attendance table
        tables = soup.find_all("table", class_="items")
        if not tables:
            print(f"  ‚ö†Ô∏è  No attendance tables found for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        table = tables[0]
        tbody = table.find("tbody")
        if not tbody:
            print(f"  ‚ö†Ô∏è  No tbody found in attendance table for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        rows = tbody.find_all("tr", recursive=False)
        
        attendance_data = []
        season_str = f"{season_year}/{str(season_year+1)[-2:]}"
        
        for row in rows:
            cells = row.find_all("td")
            if len(cells) < 5:
                continue
            
            # First cell is rank (skip "Total" row)
            rank_text = cells[0].get_text(strip=True)
            if not rank_text.isdigit():
                continue
            
            # Second cell contains inline table with stadium and team info
            inline_table = cells[1].find("table", class_="inline-table")
            if not inline_table:
                continue
            
            # Extract stadium name (first link in inline table)
            stadium_link = inline_table.find("a", class_="hauptlink")
            stadium = stadium_link.get_text(strip=True) if stadium_link else "Unknown"
            
            # Extract team name (second row of inline table)
            team_links = inline_table.find_all("a", title=True)
            team = "Unknown"
            for link in team_links:
                title = link.get("title", "")
                if title and "spielplan" in link.get("href", ""):
                    team = title
                    break
            
            # Extract capacity, total spectators, average (last 3 cells)
            # Note: Numbers use European format (dots for thousands)
            capacity_text = cells[-3].get_text(strip=True)
            total_text = cells[-2].get_text(strip=True)
            average_text = cells[-1].get_text(strip=True)
            
            # Convert European number format (remove dots, handle empty values)
            def parse_number(text):
                if not text or text == "-":
                    return None
                return int(text.replace(".", "").replace(",", ""))
            
            capacity = parse_number(capacity_text)
            total_spectators = parse_number(total_text)
            average_attendance = parse_number(average_text)
            
            attendance_data.append({
                "season": season_str,
                "team": team,
                "stadium": stadium,
                "capacity": capacity,
                "total_spectators": total_spectators,
                "average_attendance": average_attendance
            })
        
        if not attendance_data:
            print(f"  ‚ö†Ô∏è  No attendance data extracted for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        df = pd.DataFrame(attendance_data)
        print(f"  ‚úÖ Scraped {len(df)} teams for {season_str}")
        return df
        
    except Exception as e:
        print(f"  ‚ùå Error scraping {season_year}/{str(season_year+1)[-2:]}: {e}")
        return None

# Test the function
ensure_environment()
test_df = scrape_transfermarkt_attendance(2023)
if test_df is not None:
    display(test_df)

Scraping attendance from: https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1/saison_id/2023
  ‚úÖ Scraped 14 teams for 2023/24


Unnamed: 0,season,team,stadium,capacity,total_spectators,average_attendance
0,2023/24,Maccabi Tel Aviv,Unknown,29150,213565,17797
1,2023/24,Maccabi Haifa,Unknown,30780,171948,17195
2,2023/24,Beitar Jerusalem,Unknown,33500,144830,13166
3,2023/24,Hapoel Beer Sheva,Unknown,16126,122024,10169
4,2023/24,Hapoel Tel Aviv,Unknown,29150,101049,9186
5,2023/24,Maccabi Netanya,Unknown,13610,70127,5844
6,2023/24,Hapoel Petah Tikva,Unknown,11500,60759,5524
7,2023/24,Hapoel Haifa,Unknown,30820,42559,3869
8,2023/24,Hapoel Jerusalem,Unknown,33500,40070,3643
9,2023/24,Maccabi Petah Tikva,Unknown,11500,39337,3576


In [5]:
# Quick test: scrape 2023/24 season attendance
ensure_environment()
season_year = 2023
_df_2023 = scrape_transfermarkt_attendance(season_year)
if _df_2023 is not None:
    _csv_2023 = DATA_DIR / f"attendance_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"
    save_csv(_df_2023, _csv_2023)
    display(_df_2023.head(20))
else:
    print("Failed to scrape 2023/24 attendance from Transfermarkt.")

Scraping attendance from: https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1/saison_id/2023
  ‚úÖ Scraped 14 teams for 2023/24
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\attendance_2023_24_ligat_haal_transfermarkt.csv


Unnamed: 0,season,team,stadium,capacity,total_spectators,average_attendance
0,2023/24,Maccabi Tel Aviv,Unknown,29150,213565,17797
1,2023/24,Maccabi Haifa,Unknown,30780,171948,17195
2,2023/24,Beitar Jerusalem,Unknown,33500,144830,13166
3,2023/24,Hapoel Beer Sheva,Unknown,16126,122024,10169
4,2023/24,Hapoel Tel Aviv,Unknown,29150,101049,9186
5,2023/24,Maccabi Netanya,Unknown,13610,70127,5844
6,2023/24,Hapoel Petah Tikva,Unknown,11500,60759,5524
7,2023/24,Hapoel Haifa,Unknown,30820,42559,3869
8,2023/24,Hapoel Jerusalem,Unknown,33500,40070,3643
9,2023/24,Maccabi Petah Tikva,Unknown,11500,39337,3576


In [6]:
# Scrape attendance data for all 20 seasons (2006-2025)
import pandas as pd
import time

ensure_environment()

# Define seasons to scrape
start_year = 2006
end_year = 2025
seasons = list(range(start_year, end_year + 1))

print(f"Scraping attendance data for {len(seasons)} seasons ({start_year}/{start_year+1}-{end_year}/{str(end_year+1)[-2:]})\n")
print("="*80)

all_attendance = []
failed = []

for season_year in seasons:
    season_str = f"{season_year}/{str(season_year+1)[-2:]}"
    print(f"\n[{season_str}]")
    
    # Check if already exists
    csv_path = DATA_DIR / f"attendance_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"
    if csv_path.exists():
        print(f"  ‚ÑπÔ∏è  File already exists: {csv_path.name}")
        try:
            existing_df = pd.read_csv(csv_path)
            all_attendance.append(existing_df)
            print(f"  ‚úÖ Loaded existing data: {len(existing_df)} teams")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading existing file: {e}")
            # Try scraping anyway
            df = scrape_transfermarkt_attendance(season_year)
            if df is not None:
                save_csv(df, csv_path)
                all_attendance.append(df)
            else:
                failed.append(season_str)
    else:
        # Scrape new data
        df = scrape_transfermarkt_attendance(season_year)
        if df is not None:
            save_csv(df, csv_path)
            all_attendance.append(df)
        else:
            failed.append(season_str)
        
        # Be polite to the server
        time.sleep(1.2)

print("\n" + "="*80)
print(f"\n‚úÖ Successfully scraped/loaded: {len(all_attendance)} seasons")
if failed:
    print(f"‚ùå Failed: {len(failed)} seasons: {', '.join(failed)}")

# Combine all data
if all_attendance:
    combined_attendance = pd.concat(all_attendance, ignore_index=True)
    combined_path = DATA_DIR / "attendance_all_seasons_ligat_haal_transfermarkt.csv"
    save_csv(combined_attendance, combined_path)
    
    print(f"\nüìä Combined attendance data:")
    print(f"   Total records: {len(combined_attendance)}")
    print(f"   Seasons: {combined_attendance['season'].nunique()}")
    print(f"   Teams: {combined_attendance['team'].nunique()}")
    print(f"\n   Saved to: {combined_path.name}")
    
    # Show summary by season
    summary = combined_attendance.groupby('season').agg({
        'team': 'count',
        'total_spectators': 'sum',
        'average_attendance': 'mean'
    }).round(0)
    summary.columns = ['Teams', 'Total Spectators', 'Avg Attendance']
    print("\n   Season Summary:")
    display(summary)

Scraping attendance data for 20 seasons (2006/2007-2025/26)


[2006/07]
  ‚ÑπÔ∏è  File already exists: attendance_2006_07_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2007/08]
  ‚ÑπÔ∏è  File already exists: attendance_2007_08_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2008/09]
  ‚ÑπÔ∏è  File already exists: attendance_2008_09_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2009/10]
  ‚ÑπÔ∏è  File already exists: attendance_2009_10_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2010/11]
  ‚ÑπÔ∏è  File already exists: attendance_2010_11_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2011/12]
  ‚ÑπÔ∏è  File already exists: attendance_2011_12_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2012/13]
  ‚ÑπÔ∏è  File already exists: attendance_2012_13_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 14 teams

[2013/14]
  ‚ÑπÔ∏è  File already exists: attendan

Unnamed: 0_level_0,Teams,Total Spectators,Avg Attendance
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006/07,12,119700,3136.0
2007/08,12,362600,5738.0
2008/09,12,0,0.0
2009/10,16,939155,3926.0
2010/11,16,318450,4867.0
2011/12,16,911780,3891.0
2012/13,14,916940,5038.0
2013/14,14,970781,5444.0
2014/15,14,935937,7630.0
2015/16,14,1247497,6854.0


In [7]:
# Team Name Mapping - Normalizes abbreviations and variants to full names
# This mapping consolidates Wikipedia's inconsistent team naming across 20 seasons

TEAM_NAME_MAP = {
    # Abbreviations to full names
    'ASH': 'F.C. Ashdod',
    'BEI': 'Beitar Jerusalem',
    'BnS': 'Bnei Sakhnin',
    'BnY': 'Bnei Yehuda',
    'HAS': 'Hapoel Ashkelon',
    'HBS': "Hapoel Be'er Sheva",
    'HHA': 'Hapoel Haifa',
    'HKS': 'Hapoel Kfar Saba',
    'HRA': "Hapoel Ra'anana",
    'HTA': 'Hapoel Tel Aviv',
    'IKS': 'Ironi Kiryat Shmona',
    'MHA': 'Maccabi Haifa',
    'MPT': 'Maccabi Petah Tikva',
    'MTA': 'Maccabi Tel Aviv',
    'HPT': 'Hapoel Petah Tikva',
    'HRG': 'Hapoel Ramat Gan',
    'HRH': 'Hapoel Ramat HaSharon',
    'HRL': 'Rishon LeZion',
    'MAN': 'Maccabi Ahi Nazareth',
    'MBR': 'Maccabi Bnei Reineh',
    'SNZ': 'Sektzia Ness Ziona',
    'HAK': 'Hapoel Acre',
    'MHE': 'Maccabi Herzliya',
    'MNE': 'Maccabi Netanya',
    'HAR': 'Hapoel Raanana',
    'HAC': 'Hapoel Acre',
    'IRH': 'Ironi Ramat HaSharon',
    'HAH': 'Hapoel Hadera',
    'NES': 'Ness Ziona',
    'HJE': 'Hapoel Jerusalem',
    'HNG': 'Hapoel Nof HaGalil',
    'ITI': 'Ironi Tiberias',
    
    # Name variants to canonical names
    'Ashdod': 'F.C. Ashdod',
    'F.C. Ironi Ashdod': 'F.C. Ashdod',
    'Ness Ziona': 'Sektzia Ness Ziona',
    'Ironi Nir Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hakoah Amidar Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Rishon LeZion': 'Rishon LeZion',
    'Hapoel Raanana': "Hapoel Ra'anana",
    
    # Full names map to themselves
    'F.C. Ashdod': 'F.C. Ashdod',
    'Beitar Jerusalem': 'Beitar Jerusalem',
    'Bnei Sakhnin': 'Bnei Sakhnin',
    'Bnei Yehuda': 'Bnei Yehuda',
    'Hapoel Ashkelon': 'Hapoel Ashkelon',
    "Hapoel Be'er Sheva": "Hapoel Be'er Sheva",
    'Hapoel Haifa': 'Hapoel Haifa',
    'Hapoel Kfar Saba': 'Hapoel Kfar Saba',
    "Hapoel Ra'anana": "Hapoel Ra'anana",
    'Hapoel Tel Aviv': 'Hapoel Tel Aviv',
    'Ironi Kiryat Shmona': 'Ironi Kiryat Shmona',
    'Maccabi Haifa': 'Maccabi Haifa',
    'Maccabi Petah Tikva': 'Maccabi Petah Tikva',
    'Maccabi Tel Aviv': 'Maccabi Tel Aviv',
    'Hapoel Petah Tikva': 'Hapoel Petah Tikva',
    'Hapoel Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Ramat HaSharon': 'Hapoel Ramat HaSharon',
    'Rishon LeZion': 'Rishon LeZion',
    'Maccabi Ahi Nazareth': 'Maccabi Ahi Nazareth',
    'Maccabi Bnei Reineh': 'Maccabi Bnei Reineh',
    'Sektzia Ness Ziona': 'Sektzia Ness Ziona',
    'Hapoel Acre': 'Hapoel Acre',
    'Maccabi Herzliya': 'Maccabi Herzliya',
    'Maccabi Netanya': 'Maccabi Netanya',
    'Ironi Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hapoel Hadera': 'Hapoel Hadera',
    'Hapoel Jerusalem': 'Hapoel Jerusalem',
    'Hapoel Nof HaGalil': 'Hapoel Nof HaGalil',
    'Ironi Tiberias': 'Ironi Tiberias',
}

def normalize_team_names(df, name_map=TEAM_NAME_MAP):
    """
    Normalize team names by converting abbreviations and variants to full names.
    
    Args:
        df: DataFrame with 'home_team' and 'away_team' columns
        name_map: Dictionary mapping abbreviations/variants to standardized names
    
    Returns:
        DataFrame with normalized team names
    """
    df = df.copy()
    df['home_team'] = df['home_team'].map(lambda x: name_map.get(x, x))
    df['away_team'] = df['away_team'].map(lambda x: name_map.get(x, x))
    return df

def apply_season_specific_fixes(df, season):
    """
    Apply season-specific Wikipedia data corrections.
    Wikipedia sometimes uses incorrect team names in their results matrices.
    
    Args:
        df: DataFrame with match data
        season: Season string (e.g., '2006/07')
    
    Returns:
        DataFrame with season-specific fixes applied
    """
    df = df.copy()
    
    if season == '2006/07':
        df.loc[df['home_team'] == 'Hapoel Ramat Gan', 'home_team'] = 'Hapoel Acre'
    elif season == '2008/09':
        df.loc[df['home_team'] == 'Hapoel Ramat Gan', 'home_team'] = "Hapoel Ra'anana"
    
    return df

print("‚úÖ Team Name Mapping Loaded:")
print(f"  ‚Ä¢ {len([k for k in TEAM_NAME_MAP.keys() if len(k) <= 3])} abbreviations")
print(f"  ‚Ä¢ {len(set(TEAM_NAME_MAP.values()))} unique teams")


‚úÖ Team Name Mapping Loaded:
  ‚Ä¢ 32 abbreviations
  ‚Ä¢ 31 unique teams


In [8]:
# Summary: Compare data availability between Transfermarkt and Wikipedia
import pandas as pd
ensure_environment()

print("="*80)
print("DATA SOURCES COMPARISON")
print("="*80)

# Check what files we have
transfermarkt_files = list(DATA_DIR.glob("matches_*_transfermarkt.csv"))
wiki_files = list(DATA_DIR.glob("matches_*_wikipedia.csv"))

print(f"\nüìä Match Data Files:")
print(f"  Transfermarkt: {len(transfermarkt_files)} seasons")
print(f"  Wikipedia: {len(wiki_files)} seasons")

# Sample one season to show the difference
if transfermarkt_files:
    sample_file = transfermarkt_files[0]
    df_transfermarkt = pd.read_csv(sample_file)
    
    print(f"\nüîç Sample Analysis: {sample_file.name}")
    print(f"  Total matches: {len(df_transfermarkt)}")
    print(f"  Columns: {list(df_transfermarkt.columns)}")
    
    # Check if it has round info
    if 'round' in df_transfermarkt.columns:
        print(f"  Rounds: {df_transfermarkt['round'].min()} to {df_transfermarkt['round'].max()}")
    
    # Count teams
    teams_home = set(df_transfermarkt['home'].unique()) if 'home' in df_transfermarkt.columns else set()
    teams_away = set(df_transfermarkt['away'].unique()) if 'away' in df_transfermarkt.columns else set()
    all_teams = teams_home.union(teams_away)
    
    print(f"  Unique teams: {len(all_teams)}")
    
    # Calculate expected matches
    num_teams = len(all_teams)
    expected_regular = (num_teams - 1) * 2 * (num_teams // 2)
    
    print(f"\n  üìù For {num_teams} teams:")
    print(f"     Expected regular season: {expected_regular} matches")
    print(f"     Found in Transfermarkt: {len(df_transfermarkt)} matches")
    
    if len(df_transfermarkt) == expected_regular:
        print(f"     ‚úÖ Confirmed: Regular season only (no playoffs)")
    else:
        print(f"     ‚ö†Ô∏è  Match count doesn't match expected regular season")

print("\n" + "="*80)
print("\nüí° RECOMMENDATION:")
print("   Use WIKIPEDIA for complete match data (regular season + playoffs)")
print("   Use TRANSFERMARKT for attendance data")
print("\n   Your existing Wikipedia data already includes:")
print("   ‚úÖ Regular season matches")
print("   ‚úÖ Championship playoff matches")  
print("   ‚úÖ Relegation playoff matches")
print("="*80)

DATA SOURCES COMPARISON

üìä Match Data Files:
  Transfermarkt: 20 seasons
  Wikipedia: 21 seasons

üîç Sample Analysis: matches_2006_07_ligat_haal_transfermarkt.csv
  Total matches: 198
  Columns: ['round', 'home', 'score', 'away']
  Rounds: 1 to 198
  Unique teams: 12

  üìù For 12 teams:
     Expected regular season: 132 matches
     Found in Transfermarkt: 198 matches
     ‚ö†Ô∏è  Match count doesn't match expected regular season


üí° RECOMMENDATION:
   Use WIKIPEDIA for complete match data (regular season + playoffs)
   Use TRANSFERMARKT for attendance data

   Your existing Wikipedia data already includes:
   ‚úÖ Regular season matches
   ‚úÖ Championship playoff matches
   ‚úÖ Relegation playoff matches


In [9]:
# Calculate league standings after each matchday and track leadership changes
import pandas as pd
import numpy as np

ensure_environment()

def calculate_league_table_by_round(matches_df, season_str="2016/17"):
    """
    Calculate league standings after each round/matchday.
    
    Args:
        matches_df: DataFrame with match results (with normalized team names)
        season_str: Season to analyze (e.g., "2016/17")
    
    Returns:
        - standings_by_round: dict mapping round_num -> DataFrame of standings
        - leadership_changes: list of tuples (round_num, new_leader)
    
    Note: Team names should already be normalized (full names, not abbreviations).
    """
    # Filter for the specific season
    season_matches = matches_df[matches_df['season'] == season_str].copy()
    
    # Get all unique teams - count ONLY home teams (each team has home games)
    # This avoids duplicate counting from abbreviations in away_team column
    teams = sorted(season_matches['home_team'].unique())
    n_teams = len(teams)
    
    print(f"‚Ñπ Processing {season_str}: {len(season_matches)} matches, {n_teams} teams")
    
    # In Ligat Ha'al, 14 teams play 26 rounds in regular season, then split into championship/relegation
    # For the regular season: each team plays 13 opponents √ó 2 (home/away) = 26 matches
    # Total matches in regular season = (14 teams √ó 26 matches) / 2 = 182 matches
    
    # Assign round numbers by ordering matches
    # Since we don't have dates, distribute evenly assuming each round has n_teams/2 matches
    season_matches = season_matches.reset_index(drop=True)
    
    # Each round has 7 matches (14 teams / 2)
    matches_per_round = n_teams // 2 if n_teams % 2 == 0 else (n_teams + 1) // 2
    
    # Assign rounds based on position in dataset
    season_matches['round_num'] = (season_matches.index // matches_per_round) + 1
    max_round = season_matches['round_num'].max()
    
    # Initialize standings tracker
    standings_by_round = {}
    current_leader = None
    leadership_changes = []
    
    # Calculate standings after each round
    for round_num in sorted(season_matches['round_num'].unique()):
        # Get all matches up to and including this round
        matches_so_far = season_matches[season_matches['round_num'] <= round_num]
        
        # Initialize team stats
        stats = {team: {'played': 0, 'won': 0, 'drawn': 0, 'lost': 0, 
                        'gf': 0, 'ga': 0, 'gd': 0, 'points': 0} 
                 for team in teams}
        
        # Calculate stats from matches
        for _, match in matches_so_far.iterrows():
            home = match['home_team']
            away = match['away_team']
            home_goals = match['home_goals']
            away_goals = match['away_goals']
            
            # Update home team
            stats[home]['played'] += 1
            stats[home]['gf'] += home_goals
            stats[home]['ga'] += away_goals
            stats[home]['gd'] = stats[home]['gf'] - stats[home]['ga']
            
            # Update away team
            stats[away]['played'] += 1
            stats[away]['gf'] += away_goals
            stats[away]['ga'] += home_goals
            stats[away]['gd'] = stats[away]['gf'] - stats[away]['ga']
            
            # Update points
            if home_goals > away_goals:  # Home win
                stats[home]['won'] += 1
                stats[home]['points'] += 3
                stats[away]['lost'] += 1
            elif away_goals > home_goals:  # Away win
                stats[away]['won'] += 1
                stats[away]['points'] += 3
                stats[home]['lost'] += 1
            else:  # Draw
                stats[home]['drawn'] += 1
                stats[away]['drawn'] += 1
                stats[home]['points'] += 1
                stats[away]['points'] += 1
        
        # Convert to DataFrame and sort
        standings = pd.DataFrame.from_dict(stats, orient='index')
        standings.index.name = 'team'
        standings = standings.reset_index()
        standings = standings.sort_values(['points', 'gd', 'gf'], ascending=[False, False, False])
        standings['position'] = range(1, len(standings) + 1)
        
        standings_by_round[int(round_num)] = standings
        
        # Track leader
        new_leader = standings.iloc[0]['team']
        if new_leader != current_leader:
            leadership_changes.append((int(round_num), new_leader))
            current_leader = new_leader
    
    return standings_by_round, leadership_changes

# Load the combined matches data
matches_path = DATA_DIR / "matches_all_seasons_ligat_haal_wikipedia.csv"
if not matches_path.exists():
    print(f"‚ùå Combined matches file not found: {matches_path}")
    print("Please run the multi-season Wikipedia scraper first (cell 17)")
else:
    all_matches = pd.read_csv(matches_path)
    
    # Normalize team names (convert abbreviations to full names)
    all_matches = normalize_team_names(all_matches, TEAM_NAME_MAP)
    
    # Apply season-specific fixes
    for season_name in all_matches['season'].unique():
        season_data = all_matches[all_matches['season'] == season_name]
        all_matches.loc[all_matches['season'] == season_name] = apply_season_specific_fixes(season_data, season_name)
    
    # Analyze 2016/17 season
    season = "2016/17"
    standings_by_round, leadership_changes = calculate_league_table_by_round(all_matches, season)
    
    print(f"\nüìä League Leadership Analysis - {season} (REGULAR SEASON)")
    print("=" * 60)
    print(f"\nüèÜ Leadership Changes: {len(leadership_changes) - 1}")
    print(f"   (Initial leader doesn't count as a 'change')\n")
    
    print("Round-by-round first place:")
    for round_num, leader in leadership_changes:
        print(f"  ‚Ä¢ Round {round_num:2d}: {leader}")
    
    # Show final standings
    print(f"\nüìã Final Standings After Round {max(standings_by_round.keys())} (Regular Season):")
    final = standings_by_round[max(standings_by_round.keys())]
    display(final[['position', 'team', 'played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points']].head(10))
    
    # Calculate some interesting stats
    print(f"\nüìà Season Statistics:")
    print(f"  ‚Ä¢ Rounds analyzed: {len(standings_by_round)} (Regular Season only)")
    print(f"  ‚Ä¢ Teams: {len(final)}")
    print(f"  ‚Ä¢ Total matches: {len(all_matches[all_matches['season'] == season])}")
    print(f"  ‚Ä¢ Leader after regular season: {final.iloc[0]['team']} ({final.iloc[0]['points']:.0f} pts, {final.iloc[0]['played']:.0f} games)")
    print(f"  ‚Ä¢ Runner-up: {final.iloc[1]['team']} ({final.iloc[1]['points']:.0f} pts, {final.iloc[1]['played']:.0f} games)")
    print(f"  ‚Ä¢ Points gap: {final.iloc[0]['points'] - final.iloc[1]['points']:.0f} pts")
    
    print(f"\n‚ö†Ô∏è IMPORTANT NOTE:")
    print(f"   Wikipedia results matrix only shows REGULAR SEASON matches (26 rounds).")
    print(f"   Ligat Ha'al has additional Championship/Relegation playoffs (~10 rounds).")
    print(f"   Full season totals: ~36 matches, ~87 points for champion (as you mentioned).")
    print(f"   This analysis tracks leadership changes during the regular season only.")
    print(f"\n‚úÖ All team names are now normalized (full names used throughout).")


‚Ñπ Processing 2016/17: 182 matches, 14 teams

üìä League Leadership Analysis - 2016/17 (REGULAR SEASON)

üèÜ Leadership Changes: 3
   (Initial leader doesn't count as a 'change')

Round-by-round first place:
  ‚Ä¢ Round  1: F.C. Ashdod
  ‚Ä¢ Round  3: Beitar Jerusalem
  ‚Ä¢ Round  8: Bnei Sakhnin
  ‚Ä¢ Round 10: Hapoel Be'er Sheva

üìã Final Standings After Round 26 (Regular Season):


Unnamed: 0,position,team,played,won,drawn,lost,gf,ga,gd,points
5,1,Hapoel Be'er Sheva,26,18,5,3,54,13,41,59
13,2,Maccabi Tel Aviv,26,17,5,4,45,19,26,56
12,3,Maccabi Petah Tikva,26,13,9,4,36,23,13,48
0,4,Beitar Jerusalem,26,10,10,6,34,27,7,40
1,5,Bnei Sakhnin,26,10,9,7,26,26,0,39
11,6,Maccabi Haifa,26,10,8,8,30,25,5,38
10,7,Ironi Kiryat Shmona,26,9,8,9,35,33,2,35
6,8,Hapoel Haifa,26,8,4,14,29,36,-7,28
3,9,F.C. Ashdod,26,6,10,10,15,26,-11,28
8,10,Hapoel Ra'anana,26,7,7,12,14,29,-15,28



üìà Season Statistics:
  ‚Ä¢ Rounds analyzed: 26 (Regular Season only)
  ‚Ä¢ Teams: 14
  ‚Ä¢ Total matches: 182
  ‚Ä¢ Leader after regular season: Hapoel Be'er Sheva (59 pts, 26 games)
  ‚Ä¢ Runner-up: Maccabi Tel Aviv (56 pts, 26 games)
  ‚Ä¢ Points gap: 3 pts

‚ö†Ô∏è IMPORTANT NOTE:
   Wikipedia results matrix only shows REGULAR SEASON matches (26 rounds).
   Ligat Ha'al has additional Championship/Relegation playoffs (~10 rounds).
   Full season totals: ~36 matches, ~87 points for champion (as you mentioned).
   This analysis tracks leadership changes during the regular season only.

‚úÖ All team names are now normalized (full names used throughout).


In [10]:
# Verify scraped data and compare with Wikipedia format
from pathlib import Path
import pandas as pd

# List all Transfermarkt CSVs
DATA_DIR = Path(ROOT) / 'data' / 'raw'
transfermarkt_files = sorted(DATA_DIR.glob('matches_*_ligat_haal_transfermarkt.csv'))

print(f'‚úÖ Found {len(transfermarkt_files)} Transfermarkt CSV files')
print('\nFiles:')
for f in transfermarkt_files:
    print(f'  - {f.name}')

# Load and check format of first file
if transfermarkt_files:
    sample_file = transfermarkt_files[0]
    df_sample = pd.read_csv(sample_file)
    
    print(f'\n‚úÖ Sample file: {sample_file.name}')
    print(f'  Columns: {list(df_sample.columns)}')
    print(f'  Shape: {df_sample.shape}')
    print(f'\nFirst 5 rows:')
    display(df_sample.head())
    
    # Check for any missing data
    print(f'\nData quality check:')
    print(f'  Missing home teams: {df_sample["home"].isna().sum()}')
    print(f'  Missing away teams: {df_sample["away"].isna().sum()}')
    print(f'  Missing scores: {df_sample["score"].isna().sum()}')

# Compare with Wikipedia format
wiki_files = sorted(DATA_DIR.glob('matches_*_ligat_haal_wikipedia.csv'))
if wiki_files:
    wiki_sample = pd.read_csv(wiki_files[0])
    print(f'\n‚úÖ Wikipedia sample: {wiki_files[0].name}')
    print(f'  Columns: {list(wiki_sample.columns)}')
    print(f'\nFirst 3 rows:')
    display(wiki_sample.head(3))
    
    print('\n‚úÖ Format comparison:')
    print(f'  Transfermarkt columns: {list(df_sample.columns)}')
    print(f'  Wikipedia columns: {list(wiki_sample.columns)}')
    print(f'  Match: {list(df_sample.columns) == list(wiki_sample.columns)}')

‚úÖ Found 20 Transfermarkt CSV files

Files:
  - matches_2006_07_ligat_haal_transfermarkt.csv
  - matches_2007_08_ligat_haal_transfermarkt.csv
  - matches_2008_09_ligat_haal_transfermarkt.csv
  - matches_2009_10_ligat_haal_transfermarkt.csv
  - matches_2010_11_ligat_haal_transfermarkt.csv
  - matches_2011_12_ligat_haal_transfermarkt.csv
  - matches_2012_13_ligat_haal_transfermarkt.csv
  - matches_2013_14_ligat_haal_transfermarkt.csv
  - matches_2014_15_ligat_haal_transfermarkt.csv
  - matches_2015_16_ligat_haal_transfermarkt.csv
  - matches_2016_17_ligat_haal_transfermarkt.csv
  - matches_2017_18_ligat_haal_transfermarkt.csv
  - matches_2018_19_ligat_haal_transfermarkt.csv
  - matches_2019_20_ligat_haal_transfermarkt.csv
  - matches_2020_21_ligat_haal_transfermarkt.csv
  - matches_2021_22_ligat_haal_transfermarkt.csv
  - matches_2022_23_ligat_haal_transfermarkt.csv
  - matches_2023_24_ligat_haal_transfermarkt.csv
  - matches_2024_25_ligat_haal_transfermarkt.csv
  - matches_2025_26_liga

Unnamed: 0,round,home,score,away
0,1,H. Kfar Saba,4:1,H. Petah Tikva
1,2,M. Petah Tikva,0:0,Hakoah Amidar
2,3,FC Ashdod,1:0,Maccabi Herzlya
3,4,Maccabi Netanya,3:1,Maccabi Haifa
4,5,M. Tel Aviv,1:2,B. Jerusalem



Data quality check:
  Missing home teams: 0
  Missing away teams: 0
  Missing scores: 0

‚úÖ Wikipedia sample: matches_2006_07_ligat_haal_wikipedia.csv
  Columns: ['season', 'season_year', 'home_team', 'away_team', 'home_goals', 'away_goals', 'goal_diff', 'result', 'home_points', 'away_points']

First 3 rows:


Unnamed: 0,season,season_year,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2006/07,2006,Beitar Jerusalem,BnY,0,0,0,D,1,1
1,2006/07,2006,Beitar Jerusalem,ASH,2,0,2,H,3,0
2,2006/07,2006,Beitar Jerusalem,HAK,0,0,0,D,1,1



‚úÖ Format comparison:
  Transfermarkt columns: ['round', 'home', 'score', 'away']
  Wikipedia columns: ['season', 'season_year', 'home_team', 'away_team', 'home_goals', 'away_goals', 'goal_diff', 'result', 'home_points', 'away_points']
  Match: False


In [11]:
# Final Summary: All 20 Seasons from Transfermarkt
import pandas as pd
from pathlib import Path

DATA_DIR = Path(ROOT) / 'data' / 'raw'
transfermarkt_files = sorted(DATA_DIR.glob('matches_*_ligat_haal_transfermarkt.csv'))

print('‚úÖ TRANSFERMARKT SCRAPING COMPLETE \u2705')
print('=' * 80)
print(f'\nSuccessfully scraped {len(transfermarkt_files)} seasons from Transfermarkt')
print(f'Seasons: 2006/07 to 2025/26')
print(f'Format: round, home, score, away (same as Wikipedia)')

# Load all files and create summary
all_data = []
season_summary = []

for csv_file in transfermarkt_files:
    df = pd.read_csv(csv_file)
    season = csv_file.stem.split('_')[1:3]  # Extract season from filename
    season_str = f"{season[0]}/{season[1]}"
    
    season_summary.append({
        'Season': season_str,
        'Matches': len(df),
        'Rounds': df['round'].max(),
        'Teams': len(set(df['home'].tolist() + df['away'].tolist()))
    })

summary_df = pd.DataFrame(season_summary)

print('\n‚úÖ Season Summary:')
display(summary_df)

print(f'\n‚úÖ Total Statistics:')
print(f'  Total matches: {summary_df["Matches"].sum()}')
print(f'  Average matches per season: {summary_df["Matches"].mean():.0f}')
print(f'  Max rounds in a season: {summary_df["Rounds"].max()}')
print(f'  Min rounds in a season: {summary_df["Rounds"].min()}')

print('\n‚úÖ Data Location:')
print(f'  Directory: {DATA_DIR}')
print(f'  Files: matches_YYYY_YY_ligat_haal_transfermarkt.csv')

print('\n‚úÖ Next Steps:')
print('  - Data is ready for analysis')
print('  - Same format as Wikipedia data (round, home, score, away)')
print('  - Can be combined or analyzed separately')
print('  - Playoff data available in gesamtspielplan pages (Championship/Relegation rounds)')

‚úÖ TRANSFERMARKT SCRAPING COMPLETE ‚úÖ

Successfully scraped 20 seasons from Transfermarkt
Seasons: 2006/07 to 2025/26
Format: round, home, score, away (same as Wikipedia)

‚úÖ Season Summary:


Unnamed: 0,Season,Matches,Rounds,Teams
0,2006/07,198,198,12
1,2007/08,198,198,12
2,2008/09,198,198,12
3,2009/10,240,240,16
4,2010/11,240,240,16
5,2011/12,240,240,16
6,2012/13,182,182,14
7,2013/14,182,182,14
8,2014/15,182,182,14
9,2015/16,182,182,14



‚úÖ Total Statistics:
  Total matches: 3749
  Average matches per season: 187
  Max rounds in a season: 240
  Min rounds in a season: 69

‚úÖ Data Location:
  Directory: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw
  Files: matches_YYYY_YY_ligat_haal_transfermarkt.csv

‚úÖ Next Steps:
  - Data is ready for analysis
  - Same format as Wikipedia data (round, home, score, away)
  - Can be combined or analyzed separately
  - Playoff data available in gesamtspielplan pages (Championship/Relegation rounds)


In [12]:
# Transfermarkt Playoff Scraper (Restored) - outputs round, home, score, away
import re, time, requests
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd

# Ensure ROOT and DATA_DIR exist
try:
    ROOT
except NameError:
    ROOT = Path.cwd()
DATA_DIR = Path(ROOT) / 'data' / 'raw'
DATA_DIR.mkdir(parents=True, exist_ok=True)

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def http_get(url, retries=3, sleep=1.5):
    for attempt in range(1, retries+1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
            if resp.status_code == 200:
                return resp.text
            else:
                print(f"HTTP {resp.status_code} for {url}")
        except Exception as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
        time.sleep(sleep)
    return ''

def scrape_transfermarkt_playoffs(season_year):
    season_tag = f"{season_year}_{str(season_year+1)[-2:]}"
    out_csv = DATA_DIR / f"matches_{season_tag}_ligat_haal_transfermarkt_playoffs.csv"
    base_url = f"https://www.transfermarkt.com/ligat-haal/gesamtspielplan/wettbewerb/ISR1?saison_id={season_year}"
    # Note: League playoffs are included in gesamtspielplan as separate sections (e.g., Championship Round)
    html = http_get(base_url)
    if not html:
        print(f"‚ùå No HTML for playoffs {season_year}")
        return None
    soup = BeautifulSoup(html,'html.parser')
    rows_out = []
    playoff_round = 0
    for box in soup.select('div.box'):
        h2 = box.select_one('h2, h3')
        if not h2:
            continue
        title = h2.get_text(strip=True)
        # Identify playoff sections by keywords
        if not re.search(r'Championship|Relegation|Play-?off|Upper|Lower', title, re.IGNORECASE):
            continue
        table = box.select_one('table.items') or box.select_one('table')
        if not table:
            continue
        for tr in table.select('tbody tr'):
            tds = tr.find_all('td')
            if len(tds) < 5:
                continue
            home_a = tr.select_one('td.verein-heim a, td.heim a, td:nth-of-type(2) a[href*="/verein/"]')
            away_a = tr.select_one('td.verein-gast a, td.gast a, td:nth-of-type(6) a[href*="/verein/"]')
            if not home_a or not away_a:
                team_links = [a for a in tr.select('a[href*="/verein/"]') if a.get_text(strip=True)]
                if len(team_links) >= 2:
                    home_a, away_a = team_links[0], team_links[1]
                else:
                    continue
            home = home_a.get_text(strip=True)
            away = away_a.get_text(strip=True)
            score_cell = tr.select_one('td.ergebnis a, td.ergebnis, td:nth-of-type(5)')
            score_txt = score_cell.get_text(" ", strip=True) if score_cell else ''
            mscore = re.search(r'(\d+\s*:\s*\d+)', score_txt)
            score = mscore.group(1).replace(' ','') if mscore else ''
            if not score:
                continue
            playoff_round += 1
            rows_out.append({'round': playoff_round, 'home': home, 'score': score, 'away': away})
    if not rows_out:
        print(f"‚ö†Ô∏è No playoff matches parsed for {season_year}")
        return None
    df = pd.DataFrame(rows_out)
    df.to_csv(out_csv, index=False)
    print(f"‚úÖ Saved {len(df)} playoff matches -> {out_csv.name}")
    return df

In [13]:
# Transfermarkt Regular Season Scraper (Fixed) - outputs Wikipedia-style columns: round, home, score, away
import re, time, requests
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd

# Ensure ROOT and DATA_DIR exist
try:
    ROOT
except NameError:
    ROOT = Path.cwd()
DATA_DIR = Path(ROOT) / 'data' / 'raw'
DATA_DIR.mkdir(parents=True, exist_ok=True)

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def http_get(url, retries=3, sleep=1.5):
    for attempt in range(1, retries+1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
            if resp.status_code == 200:
                return resp.text
        except Exception as e:
            if attempt == retries:
                print(f"Failed after {retries} attempts: {e}")
        if attempt < retries:
            time.sleep(sleep)
    return ''

def scrape_transfermarkt_regular(season_year):
    """Scrape regular season matches from Transfermarkt gesamtspielplan page."""
    season_tag = f"{season_year}_{str(season_year+1)[-2:]}"
    out_csv = DATA_DIR / f"matches_{season_tag}_ligat_haal_transfermarkt.csv"
    
    url = f"https://www.transfermarkt.com/ligat-haal/gesamtspielplan/wettbewerb/ISR1?saison_id={season_year}"
    html = http_get(url)
    if not html:
        print(f"‚ùå No HTML for season {season_year}")
        return None
    
    soup = BeautifulSoup(html, 'html.parser')
    rows_out = []
    round_num = 0
    
    # Find all tables on the page
    tables = soup.find_all('table')
    
    for table in tables:
        # Look for match rows (rows with 2 team links)
        for tr in table.find_all('tr'):
            # Find all cells
            cells = tr.find_all('td')
            if len(cells) < 5:
                continue
            
            # Find score first to confirm this is a match row
            score_link = tr.find('a', class_='ergebnis-link')
            if not score_link:
                continue
            
            score_text = score_link.get_text(strip=True)
            # Validate score format (d:d)
            if not re.match(r'^\d+:\d+$', score_text):
                continue
            
            # Now find team links - typically in cells before and after score
            all_team_links = []
            for cell in cells:
                team_link = cell.find('a', href=re.compile(r'/verein/'))
                if team_link:
                    team_name = team_link.get_text(strip=True)
                    if team_name and team_name not in [link.get_text(strip=True) for link in all_team_links]:
                        all_team_links.append(team_link)
            
            if len(all_team_links) < 2:
                continue
            
            home = all_team_links[0].get_text(strip=True)
            away = all_team_links[1].get_text(strip=True)
            
            # Increment round for each match found
            round_num += 1
            
            rows_out.append({
                'round': round_num,
                'home': home,
                'score': score_text,
                'away': away
            })
    
    if not rows_out:
        print(f"‚ö†Ô∏è No matches parsed for {season_year}")
        return None
    
    df = pd.DataFrame(rows_out)
    df.to_csv(out_csv, index=False)
    print(f"‚úÖ Saved {len(df)} matches -> {out_csv.name}")
    return df

print('Regular season scraper updated with fixed team extraction.')

Regular season scraper updated with fixed team extraction.


In [14]:
# Run restored Transfermarkt scrapers for all seasons and validate coverage
seasons = list(range(2006, 2026))
regular_counts = {}
playoff_counts = {}

for sy in seasons:
    r = scrape_transfermarkt_regular(sy)
    if r is not None:
        regular_counts[sy] = len(r)
    
    p = scrape_transfermarkt_playoffs(sy)
    if p is not None:
        playoff_counts[sy] = len(p)

print('\n' + '='*80)
print('VALIDATION SUMMARY')
print('='*80)
print(f'Regular seasons scraped: {len(regular_counts)}')
print(f'Playoff seasons scraped: {len(playoff_counts)}')

import pandas as pd
summary_df = pd.DataFrame({
    'season_year': list(regular_counts.keys()), 
    'regular_matches': list(regular_counts.values())
}).sort_values('season_year')

print('\nDetailed breakdown:')
display(summary_df)

‚úÖ Saved 198 matches -> matches_2006_07_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2006
‚úÖ Saved 198 matches -> matches_2007_08_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2007
‚úÖ Saved 198 matches -> matches_2008_09_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2008
‚úÖ Saved 240 matches -> matches_2009_10_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2009
‚úÖ Saved 240 matches -> matches_2010_11_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2010
‚úÖ Saved 240 matches -> matches_2011_12_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2011
‚úÖ Saved 182 matches -> matches_2012_13_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2012
‚úÖ Saved 182 matches -> matches_2013_14_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed for 2013
‚úÖ Saved 182 matches -> matches_2014_15_ligat_haal_transfermarkt.csv
‚ö†Ô∏è No playoff matches parsed f

Unnamed: 0,season_year,regular_matches
0,2006,198
1,2007,198
2,2008,198
3,2009,240
4,2010,240
5,2011,240
6,2012,182
7,2013,182
8,2014,182
9,2015,182
