## 1. Environment Setup

In [27]:
# Environment setup
from pathlib import Path
from typing import Optional

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except Exception:
    DOTENV_AVAILABLE = False

# Helper to find project root
def _find_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

# Resolve project directories consistently
ROOT = _find_root()
DATA_DIR = ROOT / 'data' / 'raw'
INTERIM_DIR = ROOT / 'data' / 'interim'
PROCESSED_DIR = ROOT / 'data' / 'processed'
FIG_DIR = ROOT / 'reports' / 'figures'
for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\nüéØ Environment setup complete")
print(f"   ROOT: {ROOT}")
print(f"   DATA_DIR: {DATA_DIR}")


üéØ Environment setup complete
   ROOT: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks
   DATA_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw


## 2. Helper Functions

In [28]:
# Helper functions for scraping
from typing import Optional
import random
import time
from pathlib import Path
import requests

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
]

def find_repo_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

def ensure_environment():
    global ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR
    if 'ROOT' not in globals() or not isinstance(ROOT, Path) or not (ROOT / 'data').exists():
        root_guess = find_repo_root(Path.cwd())
        if not (root_guess / 'data').exists() and (root_guess.parent / 'data').exists():
            root_guess = root_guess.parent
        ROOT = root_guess
    DATA_DIR = ROOT / 'data' / 'raw'
    INTERIM_DIR = ROOT / 'data' / 'interim'
    PROCESSED_DIR = ROOT / 'data' / 'processed'
    FIG_DIR = ROOT / 'reports' / 'figures'
    for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
        d.mkdir(parents=True, exist_ok=True)
    return ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR


def http_get(url: str, headers: Optional[dict] = None, retries: int = 3, timeout: int = 30) -> str:
    last_err = None
    sess = requests.Session()
    for attempt in range(1, retries + 1):
        ua = random.choice(_USER_AGENTS)
        hdrs = {"User-Agent": ua, "Accept-Language": "en-US,en;q=0.9"}
        if headers:
            hdrs.update(headers)
        try:
            resp = sess.get(url, headers=hdrs, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            last_err = e
            time.sleep(0.8 * attempt)
    raise last_err  # type: ignore


def save_csv(df: 'pd.DataFrame', path: Path, **to_csv_kwargs):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, encoding=to_csv_kwargs.get('encoding', 'utf-8-sig'))
    print(f"Saved: {path}")

print("‚úÖ Helper functions loaded")

‚úÖ Helper functions loaded


## 3. Wikipedia Playoffs Scraper Functions

Scrapes playoff results from Wikipedia. The Israeli Premier League splits into two playoff groups:
- **Championship round**: Top 6 teams compete for title
- **Relegation round**: Bottom teams fight to avoid relegation

Both use results matrix format similar to regular season.

In [35]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from datetime import datetime

def _find_round_header(soup: BeautifulSoup, round_type: str):
    # Prefer stable ids used by Wikipedia sections
    target_id = "Championship_round_results" if round_type == "championship" else "Relegation_round_results"
    node = soup.find(id=target_id)
    if node:
        # Section id can be on span inside h2/h3; climb to heading
        heading = node if node.name in ("h2","h3","h4") else node.find_parent(["h2","h3","h4"])
        if heading:
            return heading
    # Fallback to exact/contains text
    wanted = ("Championship round results" if round_type=="championship" else "Relegation round results").lower()
    for hdr in soup.find_all(["h2","h3","h4"]):
        text = hdr.get_text(" ", strip=True).lower()
        if text == wanted:
            return hdr
    for hdr in soup.find_all(["h2","h3","h4"]):
        text = hdr.get_text(" ", strip=True).lower()
        if (round_type=="championship" and ("championship" in text and "results" in text)) or \
           (round_type!="championship" and ("relegation" in text and "results" in text)):
            return hdr
    return None

def _find_matrix_after(header_node):
    # Wikipedia wraps headings in div.mw-heading; move to wrapper if present
    start = header_node
    parent = getattr(header_node, 'parent', None)
    parent_classes = set((parent.get('class') or [])) if parent else set()
    if parent and ('mw-heading' in parent_classes or any(c.startswith('mw-heading') for c in parent_classes)):
        start = parent
    # Walk forward until next heading; search both direct siblings and nested tables
    current = start
    while True:
        current = current.find_next_sibling()
        if current is None:
            break
        # If encountering another heading wrapper, stop
        if current.name in ("h2","h3","h4"):
            break
        if 'mw-heading' in set(current.get('class') or []) or any(
            c.startswith('mw-heading') for c in (current.get('class') or [])
        ):
            break
        # Check if this node is a wikitable matrix
        if current.name == "table" and "wikitable" in (current.get("class") or []):
            first_row = current.find("tr")
            first_cell = first_row.find("th") if first_row else None
            header_text = first_cell.get_text(" ", strip=True) if first_cell else ""
            if re.search(r"Home\s*[\\/]\s*Away|Home.*Away", header_text, re.IGNORECASE):
                return current
        # Also search nested tables inside wrappers (divs, etc.)
        nested = current.find("table", class_=re.compile(r"\bwikitable\b"))
        while nested:
            first_row = nested.find("tr")
            first_cell = first_row.find("th") if first_row else None
            header_text = first_cell.get_text(" ", strip=True) if first_cell else ""
            if re.search(r"Home\s*[\\/]\s*Away|Home.*Away", header_text, re.IGNORECASE):
                return nested
            nested = nested.find_next("table", class_=re.compile(r"\bwikitable\b"))
    return None

def scrape_playoff_round(season_year: int, round_type: str):
    """
    Scrape a single playoff round from Wikipedia by locating the specific
    "... round results" header and parsing the next Home \ Away matrix.
    """
    season_str = f"{season_year}/{str(season_year+1)[-2:]}"
    url = f"https://en.wikipedia.org/wiki/{season_year}%E2%80%93{str(season_year+1)[-2:]}_Israeli_Premier_League"

    round_name = "Championship" if round_type == "championship" else "Relegation"
    print(f"Fetching {season_str} {round_name} round... ", end="", flush=True)

    try:
        html = http_get(url)
        if not html:
            print("‚ùå (empty HTML)")
            return None
        soup = BeautifulSoup(html, "html.parser")

        header = _find_round_header(soup, round_type)
        if not header:
            print(f"‚ùå (no {round_name} round results header)")
            return None

        results_table = _find_matrix_after(header)
        if not results_table:
            print(f"‚ùå (no results matrix after {round_name} header)")
            return None

        rows = results_table.find_all("tr")
        if len(rows) < 2:
            print("‚ùå (matrix has no data rows)")
            return None

        team_names = [th.get_text(strip=True) for th in rows[0].find_all("th")][1:]
        if not team_names:
            print("‚ùå (no team headers)")
            return None

        matches = []
        for row in rows[1:]:
            cells = row.find_all(["th", "td"])
            if len(cells) < len(team_names) + 1:
                continue
            home_team = cells[0].get_text(strip=True)
            for idx, cell in enumerate(cells[1:]):
                if idx >= len(team_names):
                    break
                away_team = team_names[idx]
                score_text = cell.get_text(strip=True)
                if re.match(r"^\d+\s*[‚Äì-]\s*\d+$", score_text):
                    home_goals, away_goals = re.split(r"[‚Äì-]", score_text)
                    matches.append({
                        "season": season_str,
                        "season_year": season_year,
                        "playoff_type": round_type,
                        "home_team": home_team,
                        "away_team": away_team,
                        "home_goals": int(home_goals.strip()),
                        "away_goals": int(away_goals.strip())
                    })

        if not matches:
            print("‚ùå (no matches found)")
            return None

        df = pd.DataFrame(matches)
        df['goal_diff'] = df['home_goals'] - df['away_goals']
        df['result'] = df['goal_diff'].apply(lambda x: 'H' if x > 0 else ('A' if x < 0 else 'D'))
        df['home_points'] = df['result'].map({'H': 3, 'D': 1, 'A': 0}).astype(int)
        df['away_points'] = df['result'].map({'A': 3, 'D': 1, 'H': 0}).astype(int)

        keep_cols = ['season','season_year','playoff_type','home_team','away_team','home_goals','away_goals','goal_diff','result','home_points','away_points']
        df = df[keep_cols]
        print(f"‚úì ({len(df)} matches)")
        return df
    except Exception as e:
        print(f"‚ùå ({str(e)[:60]}...")
        return None

def scrape_season_playoffs(season_year: int):
    """Scrape both championship and relegation playoffs for a season."""
    championship_df = scrape_playoff_round(season_year, 'championship')
    time.sleep(0.5)
    relegation_df = scrape_playoff_round(season_year, 'relegation')
    return championship_df, relegation_df

print("‚úÖ Playoff scraper functions ready")

‚úÖ Playoff scraper functions ready


  """


## 4. Multi-Season Playoff Collection

Scrapes playoff data for multiple seasons from Wikipedia.

In [37]:
# Scrape multiple seasons of playoffs from Wikipedia
import pandas as pd
import time
from datetime import datetime

ensure_environment()

# Define season range (playoffs typically started around 2006-2007)
current_year = datetime.now().year
if datetime.now().month < 8:  # If before August, last season started in previous year
    current_year -= 1

# Start from 2006/07 season when playoffs format was introduced
seasons = list(range(2006, current_year + 1))

print(f"Scraping playoffs for {len(seasons)} seasons ({seasons[0]}/{str(seasons[0]+1)[-2:]} to {seasons[-1]}/{str(seasons[-1]+1)[-2:]})...")
print("="*80)

# Storage for all playoff matches
all_championship = []
all_relegation = []
failed_seasons = []

for season_year in seasons:
    champ_df, releg_df = scrape_season_playoffs(season_year)
    
    # Save championship round
    if champ_df is not None:
        season_path = DATA_DIR / f"playoffs_championship_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_wikipedia.csv"
        save_csv(champ_df, season_path)
        all_championship.append(champ_df)
    
    # Save relegation round
    if releg_df is not None:
        season_path = DATA_DIR / f"playoffs_relegation_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_wikipedia.csv"
        save_csv(releg_df, season_path)
        all_relegation.append(releg_df)
    
    # Track failed seasons
    if champ_df is None and releg_df is None:
        failed_seasons.append(f"{season_year}/{str(season_year+1)[-2:]}")
    
    time.sleep(1)  # Be nice to Wikipedia

print("\n" + "="*80)

# Combine and save championship rounds
if all_championship:
    combined_champ = pd.concat(all_championship, ignore_index=True)
    combined_path = DATA_DIR / "playoffs_championship_all_seasons_ligat_haal_wikipedia.csv"
    save_csv(combined_champ, combined_path)
    
    print(f"\nüìä Championship Round Summary:")
    print(f"   Successfully scraped: {len(all_championship)} seasons")
    print(f"   Total matches: {len(combined_champ)}")
    print(f"\n   Matches per season:")
    champ_counts = combined_champ.groupby('season').size().sort_index()
    for season, count in champ_counts.items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
else:
    print("\n‚ùå No championship playoff matches were successfully scraped")

print("\n" + "-"*80)

# Combine and save relegation rounds
if all_relegation:
    combined_releg = pd.concat(all_relegation, ignore_index=True)
    combined_path = DATA_DIR / "playoffs_relegation_all_seasons_ligat_haal_wikipedia.csv"
    save_csv(combined_releg, combined_path)
    
    print(f"\nüìä Relegation Round Summary:")
    print(f"   Successfully scraped: {len(all_relegation)} seasons")
    print(f"   Total matches: {len(combined_releg)}")
    print(f"\n   Matches per season:")
    releg_counts = combined_releg.groupby('season').size().sort_index()
    for season, count in releg_counts.items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
else:
    print("\n‚ùå No relegation playoff matches were successfully scraped")

if failed_seasons:
    print(f"\n‚ö†Ô∏è  Seasons with no playoff data: {', '.join(failed_seasons)}")

print("\n" + "="*80)

# Display sample data
if all_championship:
    print(f"\n   Championship Round Sample:")
    display(combined_champ.head(10))

if all_relegation:
    print(f"\n   Relegation Round Sample:")
    display(combined_releg.head(10))

Scraping playoffs for 20 seasons (2006/07 to 2025/26)...
Fetching 2006/07 Championship round... ‚ùå (no Championship round results header)
‚ùå (no Championship round results header)
Fetching 2006/07 Relegation round... Fetching 2006/07 Relegation round... ‚ùå (no Relegation round results header)
‚ùå (no Relegation round results header)
Fetching 2007/08 Championship round... Fetching 2007/08 Championship round... ‚ùå (no Championship round results header)
‚ùå (no Championship round results header)
Fetching 2007/08 Relegation round... Fetching 2007/08 Relegation round... ‚ùå (no Relegation round results header)
‚ùå (no Relegation round results header)
Fetching 2008/09 Championship round... Fetching 2008/09 Championship round... ‚ùå (no Championship round results header)
‚ùå (no Championship round results header)
Fetching 2008/09 Relegation round... Fetching 2008/09 Relegation round... ‚ùå (no Relegation round results header)
‚ùå (no Relegation round results header)
Fetching 2009/10 Champ

Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2015/16,2015,championship,Beitar Jerusalem,BnS,0,3,-3,A,0,3
1,2015/16,2015,championship,Beitar Jerusalem,HBS,0,2,-2,A,0,3
2,2015/16,2015,championship,Beitar Jerusalem,HRA,1,0,1,H,3,0
3,2015/16,2015,championship,Beitar Jerusalem,MHA,3,2,1,H,3,0
4,2015/16,2015,championship,Beitar Jerusalem,MTA,0,2,-2,A,0,3
5,2015/16,2015,championship,Bnei Sakhnin,BEI,2,0,2,H,3,0
6,2015/16,2015,championship,Bnei Sakhnin,HBS,1,4,-3,A,0,3
7,2015/16,2015,championship,Bnei Sakhnin,HRA,1,1,0,D,1,1
8,2015/16,2015,championship,Bnei Sakhnin,MHA,0,1,-1,A,0,3
9,2015/16,2015,championship,Bnei Sakhnin,MTA,0,0,0,D,1,1



   Relegation Round Sample:


Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2015/16,2015,relegation,Bnei Yehuda,HAC,3,1,2,H,3,0
1,2015/16,2015,relegation,Bnei Yehuda,HTA,2,4,-2,A,0,3
2,2015/16,2015,relegation,Bnei Yehuda,IKS,1,0,1,H,3,0
3,2015/16,2015,relegation,Bnei Yehuda,MPT,1,2,-1,A,0,3
4,2015/16,2015,relegation,Hapoel Acre,HHA,2,3,-1,A,0,3
5,2015/16,2015,relegation,Hapoel Acre,IKS,1,1,0,D,1,1
6,2015/16,2015,relegation,Hapoel Acre,MNE,4,1,3,H,3,0
7,2015/16,2015,relegation,Hapoel Acre,MPT,1,3,-2,A,0,3
8,2015/16,2015,relegation,Hapoel Haifa,BnY,1,1,0,D,1,1
9,2015/16,2015,relegation,Hapoel Haifa,HKS,0,0,0,D,1,1


## 5. Team Name Normalization

Use the same normalization as regular season to ensure consistency.

In [None]:
# Team Name Mapping - Same as regular season
# This ensures consistency across all data files

TEAM_NAME_MAP = {
    # Abbreviations to full names
    'ASH': 'F.C. Ashdod',
    'BEI': 'Beitar Jerusalem',
    'BnS': 'Bnei Sakhnin',
    'BnY': 'Bnei Yehuda',
    'HAS': 'Hapoel Ashkelon',
    'HBS': "Hapoel Be'er Sheva",
    'HHA': 'Hapoel Haifa',
    'HKS': 'Hapoel Kfar Saba',
    'HRA': "Hapoel Ra'anana",
    'HTA': 'Hapoel Tel Aviv',
    'IKS': 'Ironi Kiryat Shmona',
    'MHA': 'Maccabi Haifa',
    'MPT': 'Maccabi Petah Tikva',
    'MTA': 'Maccabi Tel Aviv',
    'HPT': 'Hapoel Petah Tikva',
    'HRG': 'Hapoel Ramat Gan',
    'HRH': 'Hapoel Ramat HaSharon',
    'HRL': 'Rishon LeZion',
    'MAN': 'Maccabi Ahi Nazareth',
    'MBR': 'Maccabi Bnei Reineh',
    'SNZ': 'Sektzia Ness Ziona',
    'HAK': 'Hapoel Acre',
    'MHE': 'Maccabi Herzliya',
    'MNE': 'Maccabi Netanya',
    'HAR': 'Hapoel Raanana',
    'HAC': 'Hapoel Acre',
    'IRH': 'Ironi Ramat HaSharon',
    'HAH': 'Hapoel Hadera',
    'NES': 'Ness Ziona',
    'HJE': 'Hapoel Jerusalem',
    'HNG': 'Hapoel Nof HaGalil',
    'ITI': 'Ironi Tiberias',
    
    # Name variants to canonical names
    'Ashdod': 'F.C. Ashdod',
    'F.C. Ironi Ashdod': 'F.C. Ashdod',
    'Ness Ziona': 'Sektzia Ness Ziona',
    'Ironi Nir Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hakoah Amidar Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Rishon LeZion': 'Rishon LeZion',
    'Hapoel Raanana': "Hapoel Ra'anana",
    
    # Full names map to themselves
    'F.C. Ashdod': 'F.C. Ashdod',
    'Beitar Jerusalem': 'Beitar Jerusalem',
    'Bnei Sakhnin': 'Bnei Sakhnin',
    'Bnei Yehuda': 'Bnei Yehuda',
    'Hapoel Ashkelon': 'Hapoel Ashkelon',
    "Hapoel Be'er Sheva": "Hapoel Be'er Sheva",
    'Hapoel Haifa': 'Hapoel Haifa',
    'Hapoel Kfar Saba': 'Hapoel Kfar Saba',
    "Hapoel Ra'anana": "Hapoel Ra'anana",
    'Hapoel Tel Aviv': 'Hapoel Tel Aviv',
    'Ironi Kiryat Shmona': 'Ironi Kiryat Shmona',
    'Maccabi Haifa': 'Maccabi Haifa',
    'Maccabi Petah Tikva': 'Maccabi Petah Tikva',
    'Maccabi Tel Aviv': 'Maccabi Tel Aviv',
    'Hapoel Petah Tikva': 'Hapoel Petah Tikva',
    'Hapoel Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Ramat HaSharon': 'Hapoel Ramat HaSharon',
    'Rishon LeZion': 'Rishon LeZion',
    'Maccabi Ahi Nazareth': 'Maccabi Ahi Nazareth',
    'Maccabi Bnei Reineh': 'Maccabi Bnei Reineh',
    'Sektzia Ness Ziona': 'Sektzia Ness Ziona',
    'Hapoel Acre': 'Hapoel Acre',
    'Maccabi Herzliya': 'Maccabi Herzliya',
    'Maccabi Netanya': 'Maccabi Netanya',
    'Ironi Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hapoel Hadera': 'Hapoel Hadera',
    'Hapoel Jerusalem': 'Hapoel Jerusalem',
    'Hapoel Nof HaGalil': 'Hapoel Nof HaGalil',
    'Ironi Tiberias': 'Ironi Tiberias',
}

def normalize_team_names(df, name_map=TEAM_NAME_MAP):
    """
    Normalize team names by converting abbreviations and variants to full names.
    
    Args:
        df: DataFrame with 'home_team' and 'away_team' columns
        name_map: Dictionary mapping abbreviations/variants to standardized names
    
    Returns:
        DataFrame with normalized team names
    """
    df = df.copy()
    df['home_team'] = df['home_team'].map(lambda x: name_map.get(x, x))
    df['away_team'] = df['away_team'].map(lambda x: name_map.get(x, x))
    return df

print("‚úÖ Team Name Mapping Loaded (same as regular season)")
print(f"  ‚Ä¢ {len([k for k in TEAM_NAME_MAP.keys() if len(k) <= 3])} abbreviations")
print(f"  ‚Ä¢ {len(set(TEAM_NAME_MAP.values()))} unique teams")

‚úÖ Team Name Mapping Loaded (same as regular season)
  ‚Ä¢ 32 abbreviations
  ‚Ä¢ 31 unique teams


## 6. Normalize Playoff Data

Apply team name normalization to playoff files.

In [38]:
# Normalize team names in playoff files
import pandas as pd

ensure_environment()

print("="*80)
print("üîÑ NORMALIZING PLAYOFF DATA")
print("="*80)

# Process championship playoffs
champ_file = DATA_DIR / "playoffs_championship_all_seasons_ligat_haal_wikipedia.csv"
if champ_file.exists():
    print(f"\nüìÅ Processing: {champ_file.name}")
    df_champ = pd.read_csv(champ_file)
    
    # Show teams before normalization
    teams_before = set(df_champ['home_team'].unique()) | set(df_champ['away_team'].unique())
    print(f"   Teams before normalization: {len(teams_before)}")
    
    # Apply normalization
    df_champ_normalized = normalize_team_names(df_champ, TEAM_NAME_MAP)
    
    # Show teams after normalization
    teams_after = set(df_champ_normalized['home_team'].unique()) | set(df_champ_normalized['away_team'].unique())
    print(f"   Teams after normalization: {len(teams_after)}")
    
    # Save normalized version
    normalized_path = INTERIM_DIR / "playoffs_championship_all_seasons_normalized.csv"
    save_csv(df_champ_normalized, normalized_path)
    print(f"   ‚úì Normalized data saved")

# Process relegation playoffs
releg_file = DATA_DIR / "playoffs_relegation_all_seasons_ligat_haal_wikipedia.csv"
if releg_file.exists():
    print(f"\nüìÅ Processing: {releg_file.name}")
    df_releg = pd.read_csv(releg_file)
    
    # Show teams before normalization
    teams_before = set(df_releg['home_team'].unique()) | set(df_releg['away_team'].unique())
    print(f"   Teams before normalization: {len(teams_before)}")
    
    # Apply normalization
    df_releg_normalized = normalize_team_names(df_releg, TEAM_NAME_MAP)
    
    # Show teams after normalization
    teams_after = set(df_releg_normalized['home_team'].unique()) | set(df_releg_normalized['away_team'].unique())
    print(f"   Teams after normalization: {len(teams_after)}")
    
    # Save normalized version
    normalized_path = INTERIM_DIR / "playoffs_relegation_all_seasons_normalized.csv"
    save_csv(df_releg_normalized, normalized_path)
    print(f"   ‚úì Normalized data saved")

print("\n" + "="*80)
print("‚úÖ PLAYOFF DATA NORMALIZATION COMPLETE")
print("="*80)

üîÑ NORMALIZING PLAYOFF DATA

üìÅ Processing: playoffs_championship_all_seasons_ligat_haal_wikipedia.csv
   Teams before normalization: 24
   Teams after normalization: 12
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\playoffs_championship_all_seasons_normalized.csv
   ‚úì Normalized data saved

üìÅ Processing: playoffs_relegation_all_seasons_ligat_haal_wikipedia.csv
   Teams before normalization: 32
   Teams after normalization: 17
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\playoffs_relegation_all_seasons_normalized.csv
   ‚úì Normalized data saved

‚úÖ PLAYOFF DATA NORMALIZATION COMPLETE


## 7. Final Summary & Statistics

Statistics for all playoff data collected and normalized.

In [39]:
# Final Summary: Playoff data statistics
import pandas as pd

ensure_environment()

print("="*80)
print("üìä PLAYOFF DATA COLLECTION SUMMARY")
print("="*80)

# Check normalized files
champ_norm = INTERIM_DIR / "playoffs_championship_all_seasons_normalized.csv"
releg_norm = INTERIM_DIR / "playoffs_relegation_all_seasons_normalized.csv"

if champ_norm.exists():
    df_champ = pd.read_csv(champ_norm)
    
    print(f"\nüèÜ Championship Round:")
    print(f"   Total matches: {len(df_champ)}")
    print(f"   Seasons: {df_champ['season'].nunique()}")
    print(f"   Season range: {df_champ['season'].min()} to {df_champ['season'].max()}")
    
    teams_champ = set(df_champ['home_team'].unique()) | set(df_champ['away_team'].unique())
    print(f"   Unique teams: {len(teams_champ)}")
    
    print(f"\n   Matches per season:")
    for season, count in df_champ.groupby('season').size().sort_index().items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
    
    print(f"\n   Sample data:")
    display(df_champ.head(10))

if releg_norm.exists():
    df_releg = pd.read_csv(releg_norm)
    
    print(f"\n‚¨áÔ∏è  Relegation Round:")
    print(f"   Total matches: {len(df_releg)}")
    print(f"   Seasons: {df_releg['season'].nunique()}")
    print(f"   Season range: {df_releg['season'].min()} to {df_releg['season'].max()}")
    
    teams_releg = set(df_releg['home_team'].unique()) | set(df_releg['away_team'].unique())
    print(f"   Unique teams: {len(teams_releg)}")
    
    print(f"\n   Matches per season:")
    for season, count in df_releg.groupby('season').size().sort_index().items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
    
    print(f"\n   Sample data:")
    display(df_releg.head(10))

print("\n" + "="*80)
print("‚úÖ PLAYOFF DATA COLLECTION COMPLETE")
print("="*80)
print(f"\nüí° Next Steps:")
print(f"   1. Combine playoff data with regular season for full season analysis")
print(f"   2. Calculate playoff standings and rankings")
print(f"   3. Analyze championship vs relegation performance patterns")
print(f"   4. Create visualizations comparing regular season vs playoff results")
print(f"\nüìÇ Raw data: {DATA_DIR}")
print(f"üìÇ Normalized data: {INTERIM_DIR}")
print("="*80)

üìä PLAYOFF DATA COLLECTION SUMMARY

üèÜ Championship Round:
   Total matches: 150
   Seasons: 5
   Season range: 2015/16 to 2019/20
   Unique teams: 12

   Matches per season:
      ‚Ä¢ 2015/16:  30 matches
      ‚Ä¢ 2016/17:  30 matches
      ‚Ä¢ 2017/18:  30 matches
      ‚Ä¢ 2018/19:  30 matches
      ‚Ä¢ 2019/20:  30 matches

   Sample data:


Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2015/16,2015,championship,Beitar Jerusalem,Bnei Sakhnin,0,3,-3,A,0,3
1,2015/16,2015,championship,Beitar Jerusalem,Hapoel Be'er Sheva,0,2,-2,A,0,3
2,2015/16,2015,championship,Beitar Jerusalem,Hapoel Ra'anana,1,0,1,H,3,0
3,2015/16,2015,championship,Beitar Jerusalem,Maccabi Haifa,3,2,1,H,3,0
4,2015/16,2015,championship,Beitar Jerusalem,Maccabi Tel Aviv,0,2,-2,A,0,3
5,2015/16,2015,championship,Bnei Sakhnin,Beitar Jerusalem,2,0,2,H,3,0
6,2015/16,2015,championship,Bnei Sakhnin,Hapoel Be'er Sheva,1,4,-3,A,0,3
7,2015/16,2015,championship,Bnei Sakhnin,Hapoel Ra'anana,1,1,0,D,1,1
8,2015/16,2015,championship,Bnei Sakhnin,Maccabi Haifa,0,1,-1,A,0,3
9,2015/16,2015,championship,Bnei Sakhnin,Maccabi Tel Aviv,0,0,0,D,1,1



‚¨áÔ∏è  Relegation Round:
   Total matches: 140
   Seasons: 5
   Season range: 2015/16 to 2019/20
   Unique teams: 17

   Matches per season:
      ‚Ä¢ 2015/16:  28 matches
      ‚Ä¢ 2016/17:  28 matches
      ‚Ä¢ 2017/18:  28 matches
      ‚Ä¢ 2018/19:  28 matches
      ‚Ä¢ 2019/20:  28 matches

   Sample data:


Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2015/16,2015,relegation,Bnei Yehuda,Hapoel Acre,3,1,2,H,3,0
1,2015/16,2015,relegation,Bnei Yehuda,Hapoel Tel Aviv,2,4,-2,A,0,3
2,2015/16,2015,relegation,Bnei Yehuda,Ironi Kiryat Shmona,1,0,1,H,3,0
3,2015/16,2015,relegation,Bnei Yehuda,Maccabi Petah Tikva,1,2,-1,A,0,3
4,2015/16,2015,relegation,Hapoel Acre,Hapoel Haifa,2,3,-1,A,0,3
5,2015/16,2015,relegation,Hapoel Acre,Ironi Kiryat Shmona,1,1,0,D,1,1
6,2015/16,2015,relegation,Hapoel Acre,Maccabi Netanya,4,1,3,H,3,0
7,2015/16,2015,relegation,Hapoel Acre,Maccabi Petah Tikva,1,3,-2,A,0,3
8,2015/16,2015,relegation,Hapoel Haifa,Bnei Yehuda,1,1,0,D,1,1
9,2015/16,2015,relegation,Hapoel Haifa,Hapoel Kfar Saba,0,0,0,D,1,1



‚úÖ PLAYOFF DATA COLLECTION COMPLETE

üí° Next Steps:
   1. Combine playoff data with regular season for full season analysis
   2. Calculate playoff standings and rankings
   3. Analyze championship vs relegation performance patterns
   4. Create visualizations comparing regular season vs playoff results

üìÇ Raw data: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw
üìÇ Normalized data: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim


In [36]:
# Quick validation for selected seasons
seasons_to_test = [2010, 2017]
for y in seasons_to_test:
    champ, releg = scrape_season_playoffs(y)
    if champ is not None:
        print(f"\n{y}/{str(y+1)[-2:]} Championship: {len(champ)} matches; teams={sorted(set(champ.home_team.unique())|set(champ.away_team.unique()))}")
        display(champ.head(8))
    else:
        print(f"\n{y}/{str(y+1)[-2:]} Championship: None")
    if releg is not None:
        print(f"{y}/{str(y+1)[-2:]} Relegation: {len(releg)} matches; teams={sorted(set(releg.home_team.unique())|set(releg.away_team.unique()))}")
        display(releg.head(8))
    else:
        print(f"{y}/{str(y+1)[-2:]} Relegation: None")

Fetching 2010/11 Championship round... ‚ùå (no Championship round results header)
‚ùå (no Championship round results header)
Fetching 2010/11 Relegation round... Fetching 2010/11 Relegation round... ‚ùå (no Relegation round results header)

2010/11 Championship: None
2010/11 Relegation: None
Fetching 2017/18 Championship round... ‚ùå (no Relegation round results header)

2010/11 Championship: None
2010/11 Relegation: None
Fetching 2017/18 Championship round... ‚úì (30 matches)
‚úì (30 matches)
Fetching 2017/18 Relegation round... Fetching 2017/18 Relegation round... ‚úì (28 matches)

2017/18 Championship: 30 matches; teams=['BEI', 'Beitar Jerusalem', 'BnY', 'Bnei Yehuda', 'HBS', 'HHA', "Hapoel Be'er Sheva", 'Hapoel Haifa', 'MNE', 'MTA', 'Maccabi Netanya', 'Maccabi Tel Aviv']
‚úì (28 matches)

2017/18 Championship: 30 matches; teams=['BEI', 'Beitar Jerusalem', 'BnY', 'Bnei Yehuda', 'HBS', 'HHA', "Hapoel Be'er Sheva", 'Hapoel Haifa', 'MNE', 'MTA', 'Maccabi Netanya', 'Maccabi Tel Aviv']


Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2017/18,2017,championship,Beitar Jerusalem,BnY,1,1,0,D,1,1
1,2017/18,2017,championship,Beitar Jerusalem,HBS,1,4,-3,A,0,3
2,2017/18,2017,championship,Beitar Jerusalem,HHA,1,1,0,D,1,1
3,2017/18,2017,championship,Beitar Jerusalem,MNE,2,0,2,H,3,0
4,2017/18,2017,championship,Beitar Jerusalem,MTA,3,2,1,H,3,0
5,2017/18,2017,championship,Bnei Yehuda,BEI,3,3,0,D,1,1
6,2017/18,2017,championship,Bnei Yehuda,HBS,1,1,0,D,1,1
7,2017/18,2017,championship,Bnei Yehuda,HHA,3,0,3,H,3,0


2017/18 Relegation: 28 matches; teams=['ASH', 'BnS', 'Bnei Sakhnin', 'F.C. Ashdod', 'HAC', 'HAS', 'HRA', 'Hapoel Acre', 'Hapoel Ashkelon', "Hapoel Ra'anana", 'IKS', 'Ironi Kiryat Shmona', 'MHA', 'MPT', 'Maccabi Haifa', 'Maccabi Petah Tikva']


Unnamed: 0,season,season_year,playoff_type,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2017/18,2017,relegation,Bnei Sakhnin,ASH,2,1,1,H,3,0
1,2017/18,2017,relegation,Bnei Sakhnin,HRA,1,3,-2,A,0,3
2,2017/18,2017,relegation,Bnei Sakhnin,MHA,1,1,0,D,1,1
3,2017/18,2017,relegation,Bnei Sakhnin,MPT,2,2,0,D,1,1
4,2017/18,2017,relegation,F.C. Ashdod,HAS,1,0,1,H,3,0
5,2017/18,2017,relegation,F.C. Ashdod,IKS,1,2,-1,A,0,3
6,2017/18,2017,relegation,F.C. Ashdod,MHA,2,1,1,H,3,0
7,2017/18,2017,relegation,Hapoel Ashkelon,BnS,2,0,2,H,3,0


In [31]:
# Debug: inspect tables after the Championship header for 2017/18
from bs4 import BeautifulSoup
import re

year = 2017
season_str = f"{year}/{str(year+1)[-2:]}"
url = f"https://en.wikipedia.org/wiki/{year}%E2%80%93{str(year+1)[-2:]}_Israeli_Premier_League"
html = http_get(url)
soup = BeautifulSoup(html, 'html.parser')
header = _find_round_header(soup, 'championship')
print('Header found:', bool(header), '| tag:', header.name if header else None, '| text:', header.get_text(' ', strip=True)[:80] if header else None)

node = header
idx = 0
while True:
    node = node.find_next_sibling()
    if node is None or node.name in ('h2','h3','h4'):
        print('Stop at:', node.name if node else None)
        break
    if node.name == 'table':
        idx += 1
        first_row = node.find('tr')
        first_cell = first_row.find('th') if first_row else None
        header_text = first_cell.get_text(' ', strip=True) if first_cell else ''
        print(f"Table {idx}: class={node.get('class')} header='{header_text}'")
    else:
        # look for nested tables immediately inside
        for t in node.find_all('table', recursive=True):
            idx += 1
            first_row = t.find('tr')
            first_cell = first_row.find('th') if first_row else None
            header_text = first_cell.get_text(' ', strip=True) if first_cell else ''
            print(f"Nested Table {idx}: class={t.get('class')} header='{header_text[:60]}'")

Header found: True | tag: h3 | text: Championship round results
Stop at: None


In [33]:
# Debug 2: follow mw-heading wrapper and list siblings for 2017/18
from bs4 import BeautifulSoup
import re

year = 2017
url = f"https://en.wikipedia.org/wiki/{year}%E2%80%93{str(year+1)[-2:]}_Israeli_Premier_League"
soup = BeautifulSoup(http_get(url), 'html.parser')
header = _find_round_header(soup, 'championship')
print('Header:', header.name if header else None, '| parent:', header.parent.name if header and header.parent else None, '| parent classes:', header.parent.get('class') if header and header.parent else None)

start = header
parent = getattr(header, 'parent', None)
parent_classes = set((parent.get('class') or [])) if parent else set()
if parent and ('mw-heading' in parent_classes or any(c.startswith('mw-heading') for c in parent_classes)):
    start = parent

node = start
count = 0
while True:
    node = node.find_next_sibling()
    if node is None:
        print('Reached end of siblings (None)')
        break
    if node.name in ('h2','h3','h4') or 'mw-heading' in set(node.get('class') or []) or any(c.startswith('mw-heading') for c in (node.get('class') or [])):
        print('Encountered next heading wrapper or heading:', node.name, node.get('class'))
        break
    count += 1
    print(f"Sibling {count}: tag={node.name}, classes={node.get('class')}")
    # print nested tables headers
    for t in node.find_all('table'):
        fr = t.find('tr')
        fc = fr.find('th') if fr else None
        ht = fc.get_text(' ', strip=True) if fc else ''
        print('  - table:', t.get('class'), '| first th:', ht[:80])

Header: h3 | parent: div | parent classes: ['mw-heading', 'mw-heading3']
Sibling 1: tag=div, classes=None
  - table: ['wikitable', 'plainrowheaders'] | first th: Home \ Away
Encountered next heading wrapper or heading: div ['mw-heading', 'mw-heading3']
