## 1. Environment Setup

In [1]:
# Environment setup
from pathlib import Path
from typing import Optional

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except Exception:
    DOTENV_AVAILABLE = False

# Helper to find project root
def _find_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

# Resolve project directories consistently
ROOT = _find_root()
DATA_DIR = ROOT / 'data' / 'raw'
INTERIM_DIR = ROOT / 'data' / 'interim'
PROCESSED_DIR = ROOT / 'data' / 'processed'
FIG_DIR = ROOT / 'reports' / 'figures'
for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\nüéØ Environment setup complete")
print(f"   ROOT: {ROOT}")
print(f"   DATA_DIR: {DATA_DIR}")


üéØ Environment setup complete
   ROOT: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks
   DATA_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw


## 2. Helper Functions

In [2]:
# Helper functions for scraping
from typing import Optional
import random
import time
from pathlib import Path
import requests

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
]

def find_repo_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

def ensure_environment():
    global ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR
    if 'ROOT' not in globals() or not isinstance(ROOT, Path) or not (ROOT / 'data').exists():
        root_guess = find_repo_root(Path.cwd())
        if not (root_guess / 'data').exists() and (root_guess.parent / 'data').exists():
            root_guess = root_guess.parent
        ROOT = root_guess
    DATA_DIR = ROOT / 'data' / 'raw'
    INTERIM_DIR = ROOT / 'data' / 'interim'
    PROCESSED_DIR = ROOT / 'data' / 'processed'
    FIG_DIR = ROOT / 'reports' / 'figures'
    for d in [DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR]:
        d.mkdir(parents=True, exist_ok=True)
    return ROOT, DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR


def http_get(url: str, headers: Optional[dict] = None, retries: int = 3, timeout: int = 30) -> str:
    last_err = None
    sess = requests.Session()
    for attempt in range(1, retries + 1):
        ua = random.choice(_USER_AGENTS)
        hdrs = {"User-Agent": ua, "Accept-Language": "en-US,en;q=0.9"}
        if headers:
            hdrs.update(headers)
        try:
            resp = sess.get(url, headers=hdrs, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            last_err = e
            time.sleep(0.8 * attempt)
    raise last_err  # type: ignore


def save_csv(df: 'pd.DataFrame', path: Path, **to_csv_kwargs):
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, encoding=to_csv_kwargs.get('encoding', 'utf-8-sig'))
    print(f"Saved: {path}")

print("‚úÖ Helper functions loaded")

‚úÖ Helper functions loaded




## 3. Attendance Scraper Function

Scrapes attendance data from Transfermarkt for a single season.

**Data collected:**
- Team name
- Stadium name
- Stadium capacity
- Total spectators (season)
- Average attendance per match

In [3]:
def scrape_transfermarkt_attendance(season_year: int) -> 'pd.DataFrame':
    """
    Scrape team attendance data from Transfermarkt for a given season.
    
    Args:
        season_year: Starting year of season (e.g., 2023 for 2023/24)
    
    Returns:
        DataFrame with columns: season, team, stadium, capacity, total_spectators, average_attendance
    """
    import pandas as pd
    from bs4 import BeautifulSoup
    import re
    
    url = f"https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1/saison_id/{season_year}"
    print(f"Scraping attendance from: {url}")
    
    try:
        html = http_get(url)
        soup = BeautifulSoup(html, "html.parser")
        
        # Find the attendance table
        tables = soup.find_all("table", class_="items")
        if not tables:
            print(f"  ‚ö†Ô∏è  No attendance tables found for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        table = tables[0]
        tbody = table.find("tbody")
        if not tbody:
            print(f"  ‚ö†Ô∏è  No tbody found in attendance table for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        rows = tbody.find_all("tr", recursive=False)
        
        attendance_data = []
        season_str = f"{season_year}/{str(season_year+1)[-2:]}"
        
        for row in rows:
            cells = row.find_all("td")
            if len(cells) < 5:
                continue
            
            # First cell is rank (skip "Total" row)
            rank_text = cells[0].get_text(strip=True)
            if not rank_text.isdigit():
                continue
            
            # Second cell contains inline table with stadium and team info
            inline_table = cells[1].find("table", class_="inline-table")
            if not inline_table:
                continue
            
            # Extract stadium name (first link in inline table)
            stadium_link = inline_table.find("a", class_="hauptlink")
            stadium = stadium_link.get_text(strip=True) if stadium_link else "Unknown"
            
            # Extract team name (second row of inline table)
            team_links = inline_table.find_all("a", title=True)
            team = "Unknown"
            for link in team_links:
                title = link.get("title", "")
                if title and "spielplan" in link.get("href", ""):
                    team = title
                    break
            
            # Extract capacity, total spectators, average (last 3 cells)
            # Note: Numbers use European format (dots for thousands)
            capacity_text = cells[-3].get_text(strip=True)
            total_text = cells[-2].get_text(strip=True)
            average_text = cells[-1].get_text(strip=True)
            
            # Convert European number format (remove dots, handle empty values)
            def parse_number(text):
                if not text or text == "-":
                    return None
                return int(text.replace(".", "").replace(",", ""))
            
            capacity = parse_number(capacity_text)
            total_spectators = parse_number(total_text)
            average_attendance = parse_number(average_text)
            
            attendance_data.append({
                "season": season_str,
                "team": team,
                "stadium": stadium,
                "capacity": capacity,
                "total_spectators": total_spectators,
                "average_attendance": average_attendance
            })
        
        if not attendance_data:
            print(f"  ‚ö†Ô∏è  No attendance data extracted for {season_year}/{str(season_year+1)[-2:]}")
            return None
        
        df = pd.DataFrame(attendance_data)
        print(f"  ‚úÖ Scraped {len(df)} teams for {season_str}")
        return df
        
    except Exception as e:
        print(f"  ‚ùå Error scraping {season_year}/{str(season_year+1)[-2:]}: {e}")
        return None

print("‚úÖ Attendance scraper function defined")

‚úÖ Attendance scraper function defined


## 4. Test Scraper (Optional)

Test on 2023/24 season before running full collection.

In [4]:
# Quick test: scrape 2023/24 season attendance
ensure_environment()
season_year = 2023
_df_2023 = scrape_transfermarkt_attendance(season_year)
if _df_2023 is not None:
    _csv_2023 = DATA_DIR / f"attendance_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"
    save_csv(_df_2023, _csv_2023)
    display(_df_2023.head(20))
else:
    print("Failed to scrape 2023/24 attendance from Transfermarkt.")

Scraping attendance from: https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1/saison_id/2023
  ‚úÖ Scraped 14 teams for 2023/24
Saved: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw\attendance_2023_24_ligat_haal_transfermarkt.csv


Unnamed: 0,season,team,stadium,capacity,total_spectators,average_attendance
0,2023/24,Maccabi Tel Aviv,Unknown,29150,213565,17797
1,2023/24,Maccabi Haifa,Unknown,30780,171948,17195
2,2023/24,Beitar Jerusalem,Unknown,33500,144830,13166
3,2023/24,Hapoel Beer Sheva,Unknown,16126,122024,10169
4,2023/24,Hapoel Tel Aviv,Unknown,29150,101049,9186
5,2023/24,Maccabi Netanya,Unknown,13610,70127,5844
6,2023/24,Hapoel Petah Tikva,Unknown,11500,60759,5524
7,2023/24,Hapoel Haifa,Unknown,30820,42559,3869
8,2023/24,Hapoel Jerusalem,Unknown,33500,40070,3643
9,2023/24,Maccabi Petah Tikva,Unknown,11500,39337,3576


## 5. Multi-Season Collection

**Main cell** - Scrapes attendance for all 20 seasons (2006-2025).

**Features:**
- Checks for existing files (skip if already scraped)
- Polite scraping with delays
- Error handling and reporting
- Combines all seasons into one master file
- Displays summary statistics

In [5]:
# Scrape attendance data for all 20 seasons (2006-2025)
import pandas as pd
import time

ensure_environment()

# Define seasons to scrape
start_year = 2006
end_year = 2025
seasons = list(range(start_year, end_year + 1))

print(f"Scraping attendance data for {len(seasons)} seasons ({start_year}/{start_year+1}-{end_year}/{str(end_year+1)[-2:]})\n")
print("="*80)

all_attendance = []
failed = []

for season_year in seasons:
    season_str = f"{season_year}/{str(season_year+1)[-2:]}"
    print(f"\n[{season_str}]")
    
    # Check if already exists
    csv_path = DATA_DIR / f"attendance_{season_year}_{str(season_year+1)[-2:]}_ligat_haal_transfermarkt.csv"
    if csv_path.exists():
        print(f"  ‚ÑπÔ∏è  File already exists: {csv_path.name}")
        try:
            existing_df = pd.read_csv(csv_path)
            all_attendance.append(existing_df)
            print(f"  ‚úÖ Loaded existing data: {len(existing_df)} teams")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading existing file: {e}")
            # Try scraping anyway
            df = scrape_transfermarkt_attendance(season_year)
            if df is not None:
                save_csv(df, csv_path)
                all_attendance.append(df)
            else:
                failed.append(season_str)
    else:
        # Scrape new data
        df = scrape_transfermarkt_attendance(season_year)
        if df is not None:
            save_csv(df, csv_path)
            all_attendance.append(df)
        else:
            failed.append(season_str)
        
        # Be polite to the server
        time.sleep(1.2)

print("\n" + "="*80)
print(f"\n‚úÖ Successfully scraped/loaded: {len(all_attendance)} seasons")
if failed:
    print(f"‚ùå Failed: {len(failed)} seasons: {', '.join(failed)}")

# Combine all data
if all_attendance:
    combined_attendance = pd.concat(all_attendance, ignore_index=True)
    combined_path = DATA_DIR / "attendance_all_seasons_ligat_haal_transfermarkt.csv"
    save_csv(combined_attendance, combined_path)
    
    print(f"\nüìä Combined attendance data:")
    print(f"   Total records: {len(combined_attendance)}")
    print(f"   Seasons: {combined_attendance['season'].nunique()}")
    print(f"   Teams: {combined_attendance['team'].nunique()}")
    print(f"\n   Saved to: {combined_path.name}")
    
    # Show summary by season
    summary = combined_attendance.groupby('season').agg({
        'team': 'count',
        'total_spectators': 'sum',
        'average_attendance': 'mean'
    }).round(0)
    summary.columns = ['Teams', 'Total Spectators', 'Avg Attendance']
    print("\n   Season Summary:")
    display(summary)

Scraping attendance data for 20 seasons (2006/2007-2025/26)


[2006/07]
  ‚ÑπÔ∏è  File already exists: attendance_2006_07_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2007/08]
  ‚ÑπÔ∏è  File already exists: attendance_2007_08_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2008/09]
  ‚ÑπÔ∏è  File already exists: attendance_2008_09_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 12 teams

[2009/10]
  ‚ÑπÔ∏è  File already exists: attendance_2009_10_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2010/11]
  ‚ÑπÔ∏è  File already exists: attendance_2010_11_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2011/12]
  ‚ÑπÔ∏è  File already exists: attendance_2011_12_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 16 teams

[2012/13]
  ‚ÑπÔ∏è  File already exists: attendance_2012_13_ligat_haal_transfermarkt.csv
  ‚úÖ Loaded existing data: 14 teams

[2013/14]
  ‚ÑπÔ∏è  File already exists: attendan

Unnamed: 0_level_0,Teams,Total Spectators,Avg Attendance
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006/07,12,119700,3136.0
2007/08,12,362600,5738.0
2008/09,12,0,0.0
2009/10,16,939155,3926.0
2010/11,16,318450,4867.0
2011/12,16,911780,3891.0
2012/13,14,916940,5038.0
2013/14,14,970781,5444.0
2014/15,14,935937,7630.0
2015/16,14,1247497,6854.0


---

## Summary

This notebook collects attendance data from Transfermarkt for all Ligat Ha'al seasons (2006-2025).

**Output files:**
- Individual: `data/raw/attendance_YYYY_YY_ligat_haal_transfermarkt.csv`
- Combined: `data/raw/attendance_all_seasons_ligat_haal_transfermarkt.csv`

**Next steps:**
1. Data cleaning and validation
2. Merge with match results
3. Analysis and visualization