## 1. Environment Setup

In [29]:
# Environment setup
from pathlib import Path
from typing import Optional

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except Exception:
    DOTENV_AVAILABLE = False

# Helper to find project root
def _find_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

# Resolve project directories consistently
ROOT = _find_root()
DATA_DIR = ROOT / 'data' / 'raw'
INTERIM_DIR = ROOT / 'data' / 'interim'
PROCESSED_DIR = ROOT / 'data' / 'processed'
FIG_DIR = ROOT / 'reports' / 'figures'
for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\nüéØ Environment setup complete")
print(f"   ROOT: {ROOT}")
print(f"   DATA_DIR: {DATA_DIR}")


üéØ Environment setup complete
   ROOT: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks
   DATA_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw


## 2. Helper Functions

In [30]:
# Helper functions for scraping
from typing import Optional
import random
import time
from pathlib import Path
import requests

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
]

def find_repo_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

def ensure_environment():
    global ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR
    if 'ROOT' not in globals() or not isinstance(ROOT, Path) or not (ROOT / 'data').exists():
        root_guess = find_repo_root(Path.cwd())
        if not (root_guess / 'data').exists() and (root_guess.parent / 'data').exists():
            root_guess = root_guess.parent
        ROOT = root_guess
    DATA_DIR = ROOT / 'data' / 'raw'
    INTERIM_DIR = ROOT / 'data' / 'interim'
    PROCESSED_DIR = ROOT / 'data' / 'processed'
    FIG_DIR = ROOT / 'reports' / 'figures'
    for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
        d.mkdir(parents=True, exist_ok=True)
    return ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR


def http_get(url: str, headers: Optional[dict] = None, retries: int = 3, timeout: int = 30) -> str:
    last_err = None
    sess = requests.Session()
    for attempt in range(1, retries + 1):
        ua = random.choice(_USER_AGENTS)
        hdrs = {"User-Agent": ua, "Accept-Language": "en-US,en;q=0.9"}
        if headers:
            hdrs.update(headers)
        try:
            resp = sess.get(url, headers=hdrs, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            last_err = e
            time.sleep(0.8 * attempt)
    raise last_err  # type: ignore


def save_csv(df: 'pd.DataFrame', path: Path, **to_csv_kwargs):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, encoding=to_csv_kwargs.get('encoding', 'utf-8-sig'))
    print(f"Saved: {path}")

print("‚úÖ Helper functions loaded")

‚úÖ Helper functions loaded


## 3. Wikipedia Match Results Scraper

Scrapes match results from Wikipedia results matrix ("Home \\ Away" table).

**Method:**
- Finds results matrix on season page
- Extracts team names from headers
- Parses scores from each cell (format: "X‚ÄìY" or "X-Y")
- Calculates points and result for each match

**Note:** Wikipedia uses results matrix format, not match-by-match list.

In [31]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from datetime import datetime
import urllib.parse

# Robust Wikipedia season scraper
# Differences vs original: improved table detection using regex (handles variations like 'Home \\ Away', 'Home / Away')

def _season_title(season_year: int) -> str:
    # Use Israeli Premier League season page naming
    end_two = str(season_year + 1)[-2:]
    return f"{season_year}\u2013{end_two} Israeli Premier League"

def _build_wiki_url(season_year: int) -> str:
    title = _season_title(season_year)
    encoded = urllib.parse.quote(title, safe='')
    return f"https://en.wikipedia.org/wiki/{encoded}"


def scrape_season(season_year: int):
    """
    Scrape a single season's matches from Wikipedia results matrix.

    Args:
        season_year: Starting year (e.g., 2016 for 2016/17 season)

    Returns:
        DataFrame with match results (season, teams, goals, points) or None on failure.
    """
    season_str = f"{season_year}/{str(season_year+1)[-2:]}"
    url = _build_wiki_url(season_year)

    print(f"Fetching {season_str}... ", end="", flush=True)
    try:
        html = http_get(url)
        if not html:
            print("‚ùå (empty HTML)")
            return None
        soup = BeautifulSoup(html, "html.parser")

        # Find results matrix (first wikitable whose first header cell mentions Home & Away)
        results_table = None
        for table in soup.find_all("table", class_="wikitable"):
            first_row = table.find("tr")
            if not first_row:
                continue
            first_cell = first_row.find("th")
            if not first_cell:
                continue
            header_text = first_cell.get_text(" ", strip=True)
            if re.search(r"Home.*Away", header_text, re.IGNORECASE):
                results_table = table
                break

        if not results_table:
            print("‚ùå (no results matrix)")
            return None

        rows = results_table.find_all("tr")
        if len(rows) < 2:
            print("‚ùå (matrix has no data rows)")
            return None
        # First header row: team names (skip first corner cell)
        team_names = [th.get_text(strip=True) for th in rows[0].find_all("th")][1:]
        if not team_names:
            print("‚ùå (no team headers)")
            return None

        matches = []
        for row in rows[1:]:
            cells = row.find_all(["th", "td"])
            if len(cells) < len(team_names) + 1:
                # Probably a separator or malformed row
                continue
            home_team = cells[0].get_text(strip=True)
            # Iterate over away teams
            for idx, cell in enumerate(cells[1:]):
                if idx >= len(team_names):
                    break
                away_team = team_names[idx]
                score_text = cell.get_text(strip=True)
                # Accept formats like '1‚Äì0', '2-1' (different dash characters)
                if re.match(r"^\d+\s*[‚Äì-]\s*\d+$", score_text):
                    home_goals, away_goals = re.split(r"[‚Äì-]", score_text)
                    matches.append({
                        "season": season_str,
                        "season_year": season_year,
                        "home_team": home_team,
                        "away_team": away_team,
                        "home_goals": int(home_goals.strip()),
                        "away_goals": int(away_goals.strip())
                    })

        if not matches:
            print("‚ùå (no matches found)")
            return None

        df = pd.DataFrame(matches)
        # Derived columns
        df['goal_diff'] = df['home_goals'] - df['away_goals']
        df['result'] = df['goal_diff'].apply(lambda x: 'H' if x > 0 else ('A' if x < 0 else 'D'))
        df['home_points'] = df['result'].map({'H': 3, 'D': 1, 'A': 0}).astype(int)
        df['away_points'] = df['result'].map({'A': 3, 'D': 1, 'H': 0}).astype(int)

        keep_cols = ['season', 'season_year', 'home_team', 'away_team', 'home_goals', 'away_goals', 'goal_diff', 'result', 'home_points', 'away_points']
        df = df[keep_cols]
        print(f"‚úì ({len(df)} matches)")
        return df

    except Exception as e:
        print(f"‚ùå ({str(e)[:60]}...)")
        return None

print("‚úÖ Wikipedia scraper function (robust) ready")

‚úÖ Wikipedia scraper function (robust) ready


## 4. Multi-Season Collection (Wikipedia)

Scrapes match results for last 20 seasons from Wikipedia.

In [32]:
# Scrape multiple seasons of Ligat Ha'al from Wikipedia
import pandas as pd
import time
from datetime import datetime

ensure_environment()

# List of seasons to scrape (last 20 seasons)
current_year = datetime.now().year
if datetime.now().month < 8:  # If before August, last season started in previous year
    current_year -= 1
seasons = list(range(current_year - 19, current_year + 1))

print(f"Scraping {len(seasons)} seasons from Wikipedia ({seasons[0]}/{str(seasons[0]+1)[-2:]} to {seasons[-1]}/{str(seasons[-1]+1)[-2:]})...")
print("="*80)

# Scrape each season
all_matches = []
for season_year in seasons:
    df = scrape_season(season_year)
    if df is not None:
        # Save individual season
        season_path = DATA_DIR / f"matches_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_wikipedia.csv"
        save_csv(df, season_path)
        all_matches.append(df)
    time.sleep(1)  # Be nice to Wikipedia

print("\n" + "="*80)

if all_matches:
    # Combine all seasons
    combined_df = pd.concat(all_matches, ignore_index=True)
    combined_path = DATA_DIR / "matches_all_seasons_ligat_haal_wikipedia.csv"
    save_csv(combined_df, combined_path)
    
    print(f"\nüìä Summary:")
    print(f"   Successfully scraped: {len(all_matches)} seasons")
    print(f"   Total matches: {len(combined_df)}")
    print(f"\n   Matches per season:")
    season_counts = combined_df.groupby('season').size().sort_index()
    for season, count in season_counts.items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
    print(f"\n   All matches saved to: {combined_path.name}")
    
    # Display sample
    print(f"\n   Sample data:")
    display(combined_df.head(10))
else:
    print("\n‚ùå No matches were successfully scraped")

Scraping 20 seasons from Wikipedia (2006/07 to 2025/26)...
Fetching 2006/07... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2006_07_ligat_haal_wikipedia.csv
‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2006_07_ligat_haal_wikipedia.csv
Fetching 2007/08... Fetching 2007/08... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2007_08_ligat_haal_wikipedia.csv
‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2007_08_ligat_haal_wikipedia.csv
Fetching 2008/09... Fetching 2008/09... ‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2008_09_ligat_haal_wikipedia.csv
‚úì (132 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matche

Unnamed: 0,season,season_year,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2006/07,2006,Beitar Jerusalem,BnY,0,0,0,D,1,1
1,2006/07,2006,Beitar Jerusalem,ASH,2,0,2,H,3,0
2,2006/07,2006,Beitar Jerusalem,HAK,0,0,0,D,1,1
3,2006/07,2006,Beitar Jerusalem,HKS,2,0,2,H,3,0
4,2006/07,2006,Beitar Jerusalem,HPT,2,0,2,H,3,0
5,2006/07,2006,Beitar Jerusalem,HTA,2,1,1,H,3,0
6,2006/07,2006,Beitar Jerusalem,MHA,1,1,0,D,1,1
7,2006/07,2006,Beitar Jerusalem,MHE,3,0,3,H,3,0
8,2006/07,2006,Beitar Jerusalem,MNE,0,0,0,D,1,1
9,2006/07,2006,Beitar Jerusalem,MPT,0,0,0,D,1,1


In [33]:
import re
from bs4 import BeautifulSoup
import pandas as pd

def scrape_transfermarkt_regular(season_year):
    """
    Scrape regular season matches from Transfermarkt gesamtspielplan page.
    
    Args:
        season_year: Starting year (e.g., 2023 for 2023/24)
    
    Returns:
        DataFrame with columns: round, home, score, away
    """
    url = f"https://www.transfermarkt.com/ligat-haal/gesamtspielplan/wettbewerb/ISR1?saison_id={season_year}"
    print(f"Scraping Transfermarkt {season_year}/{str(season_year+1)[-2:]}... ", end="", flush=True)
    
    try:
        html = http_get(url)
        if not html:
            print("‚ùå (no HTML)")
            return None
        
        soup = BeautifulSoup(html, 'html.parser')
        rows_out = []
        round_num = 0
        
        # Find all tables on the page
        tables = soup.find_all('table')
        
        for table in tables:
            # Look for match rows (rows with 2 team links)
            for tr in table.find_all('tr'):
                # Find all cells
                cells = tr.find_all('td')
                if len(cells) < 5:
                    continue
                
                # Find score first to confirm this is a match row
                score_link = tr.find('a', class_='ergebnis-link')
                if not score_link:
                    continue
                
                score_text = score_link.get_text(strip=True)
                # Validate score format (d:d)
                if not re.match(r'^\d+:\d+$', score_text):
                    continue
                
                # Now find team links - typically in cells before and after score
                all_team_links = []
                for cell in cells:
                    team_link = cell.find('a', href=re.compile(r'/verein/'))
                    if team_link:
                        team_name = team_link.get_text(strip=True)
                        if team_name and team_name not in [link.get_text(strip=True) for link in all_team_links]:
                            all_team_links.append(team_link)
                
                if len(all_team_links) < 2:
                    continue
                
                home = all_team_links[0].get_text(strip=True)
                away = all_team_links[1].get_text(strip=True)
                
                # Increment round for each match found
                round_num += 1
                
                rows_out.append({
                    'round': round_num,
                    'home': home,
                    'score': score_text,
                    'away': away
                })
        
        if not rows_out:
            print("‚ö†Ô∏è (no matches)")
            return None
        
        df = pd.DataFrame(rows_out)
        print(f"‚úì ({len(df)} matches)")
        return df
        
    except Exception as e:
        print(f"‚ùå ({str(e)[:50]}...)")
        return None

print("‚úÖ Transfermarkt scraper function defined")

‚úÖ Transfermarkt scraper function defined


## 5. Transfermarkt Match Results Scraper

Scrapes match results from Transfermarkt with round numbers.

In [34]:
# Scrape multiple seasons from Transfermarkt
import pandas as pd
import time

ensure_environment()

# Use same season range as Wikipedia
print(f"Scraping {len(seasons)} seasons from Transfermarkt ({seasons[0]}/{str(seasons[0]+1)[-2:]} to {seasons[-1]}/{str(seasons[-1]+1)[-2:]})...")
print("="*80)

# Scrape each season
all_transfermarkt = []
failed_seasons = []

for season_year in seasons:
    df = scrape_transfermarkt_regular(season_year)
    if df is not None:
        # Add season info
        df['season'] = f"{season_year}/{str(season_year+1)[-2:]}"
        df['season_year'] = season_year
        
        # Save individual season
        season_path = DATA_DIR / f"matches_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"
        save_csv(df, season_path)
        all_transfermarkt.append(df)
    else:
        failed_seasons.append(f"{season_year}/{str(season_year+1)[-2:]}")
    
    time.sleep(1.2)  # Be polite to Transfermarkt

print("\n" + "="*80)

if all_transfermarkt:
    # Combine all seasons
    combined_tm = pd.concat(all_transfermarkt, ignore_index=True)
    combined_path_tm = DATA_DIR / "matches_all_seasons_ligat_haal_transfermarkt.csv"
    save_csv(combined_tm, combined_path_tm)
    
    print(f"\nüìä Transfermarkt Summary:")
    print(f"   Successfully scraped: {len(all_transfermarkt)} seasons")
    print(f"   Total matches: {len(combined_tm)}")
    
    if failed_seasons:
        print(f"   ‚ö†Ô∏è  Failed seasons: {', '.join(failed_seasons)}")
    
    print(f"\n   Matches per season:")
    tm_counts = combined_tm.groupby('season').size().sort_index()
    for season, count in tm_counts.items():
        print(f"      ‚Ä¢ {season}: {count:3d} matches")
    print(f"\n   All matches saved to: {combined_path_tm.name}")
    
    # Display sample
    print(f"\n   Sample data:")
    display(combined_tm.head(10))
else:
    print("\n‚ùå No matches were successfully scraped from Transfermarkt")

Scraping 20 seasons from Transfermarkt (2006/07 to 2025/26)...
Scraping Transfermarkt 2006/07... ‚úì (198 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2006_07_ligat_haal_transfermarkt.csv
‚úì (198 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2006_07_ligat_haal_transfermarkt.csv
Scraping Transfermarkt 2007/08... Scraping Transfermarkt 2007/08... ‚úì (198 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2007_08_ligat_haal_transfermarkt.csv
‚úì (198 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2007_08_ligat_haal_transfermarkt.csv
Scraping Transfermarkt 2008/09... Scraping Transfermarkt 2008/09... ‚úì (198 matches)
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\matches_2008_09_ligat_haal_transfermarkt.csv
‚úì (198 matches)

Unnamed: 0,round,home,score,away,season,season_year
0,1,H. Kfar Saba,4:1,H. Petah Tikva,2006/07,2006
1,2,M. Petah Tikva,0:0,Hakoah Amidar,2006/07,2006
2,3,FC Ashdod,1:0,Maccabi Herzlya,2006/07,2006
3,4,Maccabi Netanya,3:1,Maccabi Haifa,2006/07,2006
4,5,M. Tel Aviv,1:2,B. Jerusalem,2006/07,2006
5,6,Bnei Yehuda,1:1,Hapoel Tel Aviv,2006/07,2006
6,7,Hakoah Amidar,3:2,Bnei Yehuda,2006/07,2006
7,8,Maccabi Herzlya,0:3,M. Petah Tikva,2006/07,2006
8,9,Maccabi Haifa,1:0,FC Ashdod,2006/07,2006
9,10,H. Petah Tikva,0:0,Maccabi Netanya,2006/07,2006


## 6. Multi-Season Collection (Transfermarkt)

Scrapes all seasons from Transfermarkt with round information.

## 7. Team Name Normalization

Wikipedia uses inconsistent team names across seasons (abbreviations, variants).
This mapping consolidates all variations to standardized full names.

In [35]:
# Team Name Mapping - Normalizes abbreviations and variants to full names
# This mapping consolidates Wikipedia's inconsistent team naming across 20 seasons

TEAM_NAME_MAP = {
    # Abbreviations to full names
    'ASH': 'F.C. Ashdod',
    'BEI': 'Beitar Jerusalem',
    'BnS': 'Bnei Sakhnin',
    'BnY': 'Bnei Yehuda',
    'HAS': 'Hapoel Ashkelon',
    'HBS': "Hapoel Be'er Sheva",
    'HHA': 'Hapoel Haifa',
    'HKS': 'Hapoel Kfar Saba',
    'HRA': "Hapoel Ra'anana",
    'HTA': 'Hapoel Tel Aviv',
    'IKS': 'Ironi Kiryat Shmona',
    'MHA': 'Maccabi Haifa',
    'MPT': 'Maccabi Petah Tikva',
    'MTA': 'Maccabi Tel Aviv',
    'HPT': 'Hapoel Petah Tikva',
    'HRG': 'Hapoel Ramat Gan',
    'HRH': 'Hapoel Ramat HaSharon',
    'HRL': 'Rishon LeZion',
    'MAN': 'Maccabi Ahi Nazareth',
    'MBR': 'Maccabi Bnei Reineh',
    'SNZ': 'Sektzia Ness Ziona',
    'HAK': 'Hapoel Acre',
    'MHE': 'Maccabi Herzliya',
    'MNE': 'Maccabi Netanya',
    'HAR': 'Hapoel Raanana',
    'HAC': 'Hapoel Acre',
    'IRH': 'Ironi Ramat HaSharon',
    'HAH': 'Hapoel Hadera',
    'NES': 'Ness Ziona',
    'HJE': 'Hapoel Jerusalem',
    'HNG': 'Hapoel Nof HaGalil',
    'ITI': 'Ironi Tiberias',
    
    # Name variants to canonical names
    'Ashdod': 'F.C. Ashdod',
    'F.C. Ironi Ashdod': 'F.C. Ashdod',
    'Ness Ziona': 'Sektzia Ness Ziona',
    'Ironi Nir Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hakoah Amidar Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Rishon LeZion': 'Rishon LeZion',
    'Hapoel Raanana': "Hapoel Ra'anana",
    
    # Full names map to themselves
    'F.C. Ashdod': 'F.C. Ashdod',
    'Beitar Jerusalem': 'Beitar Jerusalem',
    'Bnei Sakhnin': 'Bnei Sakhnin',
    'Bnei Yehuda': 'Bnei Yehuda',
    'Hapoel Ashkelon': 'Hapoel Ashkelon',
    "Hapoel Be'er Sheva": "Hapoel Be'er Sheva",
    'Hapoel Haifa': 'Hapoel Haifa',
    'Hapoel Kfar Saba': 'Hapoel Kfar Saba',
    "Hapoel Ra'anana": "Hapoel Ra'anana",
    'Hapoel Tel Aviv': 'Hapoel Tel Aviv',
    'Ironi Kiryat Shmona': 'Ironi Kiryat Shmona',
    'Maccabi Haifa': 'Maccabi Haifa',
    'Maccabi Petah Tikva': 'Maccabi Petah Tikva',
    'Maccabi Tel Aviv': 'Maccabi Tel Aviv',
    'Hapoel Petah Tikva': 'Hapoel Petah Tikva',
    'Hapoel Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Ramat HaSharon': 'Hapoel Ramat HaSharon',
    'Rishon LeZion': 'Rishon LeZion',
    'Maccabi Ahi Nazareth': 'Maccabi Ahi Nazareth',
    'Maccabi Bnei Reineh': 'Maccabi Bnei Reineh',
    'Sektzia Ness Ziona': 'Sektzia Ness Ziona',
    'Hapoel Acre': 'Hapoel Acre',
    'Maccabi Herzliya': 'Maccabi Herzliya',
    'Maccabi Netanya': 'Maccabi Netanya',
    'Ironi Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hapoel Hadera': 'Hapoel Hadera',
    'Hapoel Jerusalem': 'Hapoel Jerusalem',
    'Hapoel Nof HaGalil': 'Hapoel Nof HaGalil',
    'Ironi Tiberias': 'Ironi Tiberias',
}

def normalize_team_names(df, name_map=TEAM_NAME_MAP):
    """
    Normalize team names by converting abbreviations and variants to full names.
    
    Args:
        df: DataFrame with 'home_team' and 'away_team' columns
        name_map: Dictionary mapping abbreviations/variants to standardized names
    
    Returns:
        DataFrame with normalized team names
    """
    df = df.copy()
    df['home_team'] = df['home_team'].map(lambda x: name_map.get(x, x))
    df['away_team'] = df['away_team'].map(lambda x: name_map.get(x, x))
    return df

def apply_season_specific_fixes(df, season):
    """
    Apply season-specific Wikipedia data corrections.
    Wikipedia sometimes uses incorrect team names in their results matrices.
    
    Args:
        df: DataFrame with match data
        season: Season string (e.g., '2006/07')
    
    Returns:
        DataFrame with season-specific fixes applied
    """
    df = df.copy()
    
    if season == '2006/07':
        df.loc[df['home_team'] == 'Hapoel Ramat Gan', 'home_team'] = 'Hapoel Acre'
    elif season == '2008/09':
        df.loc[df['home_team'] == 'Hapoel Ramat Gan', 'home_team'] = "Hapoel Ra'anana"
    
    return df

print("‚úÖ Team Name Mapping Loaded:")
print(f"  ‚Ä¢ {len([k for k in TEAM_NAME_MAP.keys() if len(k) <= 3])} abbreviations")
print(f"  ‚Ä¢ {len(set(TEAM_NAME_MAP.values()))} unique teams")

‚úÖ Team Name Mapping Loaded:
  ‚Ä¢ 32 abbreviations
  ‚Ä¢ 31 unique teams


## 8. Data Summary & Statistics

Final validation and statistics for all scraped data.

In [36]:
# Final Summary: Combined statistics across all data sources
import pandas as pd
from pathlib import Path

ensure_environment()

print("="*80)
print("üìä DATA COLLECTION SUMMARY - REGULAR SEASON")
print("="*80)

# Check all collected files
wiki_files = sorted(DATA_DIR.glob("matches_*_ligat_haal_wikipedia.csv"))
transfermarkt_files = sorted(DATA_DIR.glob("matches_*_ligat_haal_transfermarkt.csv"))

print(f"\nüìÅ Files Collected:")
print(f"   Wikipedia matches: {len(wiki_files)} seasons")
print(f"   Transfermarkt matches: {len(transfermarkt_files)} seasons")

# Load combined files if available
wiki_combined = DATA_DIR / "matches_all_seasons_ligat_haal_wikipedia.csv"
tm_combined = DATA_DIR / "matches_all_seasons_ligat_haal_transfermarkt.csv"

stats_summary = []

if wiki_combined.exists():
    df_wiki = pd.read_csv(wiki_combined)
    print(f"\n‚úÖ Wikipedia Data:")
    print(f"   Total matches: {len(df_wiki)}")
    print(f"   Seasons: {df_wiki['season'].nunique()}")
    print(f"   Season range: {df_wiki['season'].min()} to {df_wiki['season'].max()}")
    
    # Count unique teams
    teams_wiki = set(df_wiki['home_team'].unique()) | set(df_wiki['away_team'].unique())
    print(f"   Unique teams (before normalization): {len(teams_wiki)}")
    
    # Apply normalization and count again
    df_wiki_normalized = normalize_team_names(df_wiki, TEAM_NAME_MAP)
    teams_normalized = set(df_wiki_normalized['home_team'].unique()) | set(df_wiki_normalized['away_team'].unique())
    print(f"   Unique teams (after normalization): {len(teams_normalized)}")
    
    stats_summary.append({
        'Source': 'Wikipedia',
        'Total Matches': len(df_wiki),
        'Seasons': df_wiki['season'].nunique(),
        'Teams (normalized)': len(teams_normalized)
    })

if tm_combined.exists():
    df_tm = pd.read_csv(tm_combined)
    print(f"\n‚úÖ Transfermarkt Data:")
    print(f"   Total matches: {len(df_tm)}")
    print(f"   Seasons: {df_tm['season'].nunique()}")
    print(f"   Season range: {df_tm['season'].min()} to {df_tm['season'].max()}")
    print(f"   Rounds: min={df_tm['round'].min()}, max={df_tm['round'].max()}")
    
    # Count unique teams
    teams_tm = set(df_tm['home'].unique()) | set(df_tm['away'].unique())
    print(f"   Unique teams: {len(teams_tm)}")
    
    stats_summary.append({
        'Source': 'Transfermarkt',
        'Total Matches': len(df_tm),
        'Seasons': df_tm['season'].nunique(),
        'Teams (normalized)': len(teams_tm)
    })

if stats_summary:
    print(f"\nüìã Summary Table:")
    summary_df = pd.DataFrame(stats_summary)
    display(summary_df)

print(f"\n" + "="*80)
print(f"‚úÖ DATA COLLECTION COMPLETE")
print(f"="*80)
print(f"\nüí° Next Steps:")
print(f"   1. Use team normalization functions for consistent analysis")
print(f"   2. Merge with attendance data (already collected)")
print(f"   3. Calculate league standings and rankings")
print(f"   4. Perform statistical analysis")
print(f"   5. Create visualizations")
print(f"\nüìÇ All data saved to: {DATA_DIR}")
print("="*80)

üìä DATA COLLECTION SUMMARY - REGULAR SEASON

üìÅ Files Collected:
   Wikipedia matches: 21 seasons
   Transfermarkt matches: 21 seasons

‚úÖ Wikipedia Data:
   Total matches: 3533
   Seasons: 20
   Season range: 2006/07 to 2025/26
   Unique teams (before normalization): 64
   Unique teams (after normalization): 31

‚úÖ Transfermarkt Data:
   Total matches: 3754
   Seasons: 20
   Season range: 2006/07 to 2025/26
   Rounds: min=1, max=240
   Unique teams: 29

üìã Summary Table:


Unnamed: 0,Source,Total Matches,Seasons,Teams (normalized)
0,Wikipedia,3533,20,31
1,Transfermarkt,3754,20,29



‚úÖ DATA COLLECTION COMPLETE

üí° Next Steps:
   1. Use team normalization functions for consistent analysis
   2. Merge with attendance data (already collected)
   3. Calculate league standings and rankings
   4. Perform statistical analysis
   5. Create visualizations

üìÇ All data saved to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw


In [37]:
## 9. Early Seasons TM ‚Üí Wikipedia Format

# Convert Transfermarkt CSVs (2006/07‚Äì2008/09) to Wikipedia-like columns
import pandas as pd
import re
from pathlib import Path

SEASONS_TM_EARLY = [2006, 2007, 2008]
EXPECTED_MATCHES = 198

_score_re = re.compile(r"(\d+)\s*[:‚Äì-]\s*(\d+)")

def _tm_path(season_year:int) -> Path:
    return DATA_DIR / f"matches_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"

def _load_tm(season_year:int) -> pd.DataFrame:
    p = _tm_path(season_year)
    if not p.exists():
        raise FileNotFoundError(str(p))
    return pd.read_csv(p)

def _tm_to_wiki_like(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    home_col = next((c for c in ['home_team','home','HomeTeam','homeTeam'] if c in df.columns), None)
    away_col = next((c for c in ['away_team','away','AwayTeam','awayTeam'] if c in df.columns), None)
    score_col = next((c for c in ['score','result','Score','Result'] if c in df.columns), None)
    if not home_col or not away_col or not score_col:
        raise KeyError('TM CSV missing expected columns')
    out = pd.DataFrame({
        'home_team': df[home_col].map(lambda x: TEAM_NAME_MAP.get(x, x)),
        'away_team': df[away_col].map(lambda x: TEAM_NAME_MAP.get(x, x))
    })
    def _split(s):
        m = _score_re.search(str(s))
        if not m:
            return pd.NA, pd.NA
        return int(m.group(1)), int(m.group(2))
    goals = df[score_col].apply(_split)
    out['home_goals'] = goals.apply(lambda t: t[0])
    out['away_goals'] = goals.apply(lambda t: t[1])
    out['goal_diff'] = out['home_goals'] - out['away_goals']
    out['result'] = out['goal_diff'].apply(lambda x: 'H' if x>0 else ('A' if x<0 else 'D'))
    out['home_points'] = out['result'].map({'H':3,'D':1,'A':0}).astype('Int64')
    out['away_points'] = out['result'].map({'A':3,'D':1,'H':0}).astype('Int64')
    return out

summary_rows = []
outputs = []
for sy in SEASONS_TM_EARLY:
    season_tag = f"{sy}/{str(sy+1)[-2:]}"
    try:
        tm_df = _load_tm(sy)
        wiki_like = _tm_to_wiki_like(tm_df)
        wiki_like['season'] = season_tag
        wiki_like['season_year'] = sy
        cols = ['season','season_year','home_team','away_team','home_goals','away_goals','goal_diff','result','home_points','away_points']
        wiki_like = wiki_like[cols]
        out_path = INTERIM_DIR / f"matches_{sy}_{str(sy+1)[-2:]}_ligat_haal_regular_corrected.csv"
        wiki_like.to_csv(out_path, index=False)
        status = 'ok' if len(wiki_like)==EXPECTED_MATCHES else f"count={len(wiki_like)}"
        summary_rows.append({'season':season_tag,'matches':len(wiki_like),'status':status})
        outputs.append(wiki_like)
        print(f"Saved: {out_path}")
    except Exception as e:
        summary_rows.append({'season':season_tag,'matches':None,'status':str(e)})

summary_df = pd.DataFrame(summary_rows)
print('\nEarly Seasons Conversion Summary:')
print(summary_df)

if outputs:
    combined = pd.concat(outputs, ignore_index=True)
    combined_out = INTERIM_DIR / 'matches_2006_07_2008_09_regular_tm_corrected_combined.csv'
    combined.to_csv(combined_out, index=False)
    print(f"Combined saved: {combined_out}")

Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\matches_2006_07_ligat_haal_regular_corrected.csv
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\matches_2007_08_ligat_haal_regular_corrected.csv
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\matches_2008_09_ligat_haal_regular_corrected.csv

Early Seasons Conversion Summary:
    season  matches status
0  2006/07      198     ok
1  2007/08      198     ok
2  2008/09      198     ok
Combined saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\matches_2006_07_2008_09_regular_tm_corrected_combined.csv
