## 1. Environment Setup

In [32]:
# Environment setup
from pathlib import Path
from typing import Optional
import pandas as pd
import numpy as np

try:
    from dotenv import load_dotenv
    DOTENV_AVAILABLE = True
except Exception:
    DOTENV_AVAILABLE = False

# Helper to find project root
def _find_root(start: Optional[Path] = None) -> Path:
    p = start or Path.cwd()
    for _ in range(6):
        if (p / 'data').exists() or (p / '.git').exists() or (p / 'notebooks').exists():
            return p
        p = p.parent
    return Path.cwd()

# Resolve project directories consistently
ROOT = _find_root()
DATA_DIR = ROOT.joinpath('data', 'raw')
INTERIM_DIR = ROOT.joinpath('data', 'interim')
PROCESSED_DIR = ROOT.joinpath('data', 'processed')
FIG_DIR = ROOT.joinpath('reports', 'figures')

# Create whatever is missing (parents included); existing directories are left alone
for d in (DATA_DIR, INTERIM_DIR, PROCESSED_DIR, FIG_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong root is noticed immediately
print("\nüéØ Environment setup complete")
print(f"   ROOT: {ROOT}")
print(f"   DATA_DIR: {DATA_DIR}")
print(f"   INTERIM_DIR: {INTERIM_DIR}")


üéØ Environment setup complete
   ROOT: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks
   DATA_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\raw
   INTERIM_DIR: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim


## 2. Load Team Name Normalization

We need the team name mapping to ensure consistent team names across all seasons.
These functions were defined in the regular season collection notebook.

In [33]:
# Team Name Mapping - same as in regular season notebook
# Lookup table used to canonicalise team names. Lookups are single-pass
# (one `.get` per value), so every value in this table must already be a
# canonical name -- i.e. it must map to itself or not appear as a key.
# NOTE(review): the regular-season notebook is said to carry the same table;
# confirm it has the same 'HAR' / 'NES' targets.
TEAM_NAME_MAP = {
    # Abbreviations to full names
    'ASH': 'F.C. Ashdod',
    'BEI': 'Beitar Jerusalem',
    'BnS': 'Bnei Sakhnin',
    'BnY': 'Bnei Yehuda',
    'HAS': 'Hapoel Ashkelon',
    'HBS': "Hapoel Be'er Sheva",
    'HHA': 'Hapoel Haifa',
    'HKS': 'Hapoel Kfar Saba',
    'HRA': "Hapoel Ra'anana",
    'HTA': 'Hapoel Tel Aviv',
    'IKS': 'Ironi Kiryat Shmona',
    'MHA': 'Maccabi Haifa',
    'MPT': 'Maccabi Petah Tikva',
    'MTA': 'Maccabi Tel Aviv',
    'HPT': 'Hapoel Petah Tikva',
    'HRG': 'Hapoel Ramat Gan',
    'HRH': 'Hapoel Ramat HaSharon',
    'HRL': 'Rishon LeZion',
    'MAN': 'Maccabi Ahi Nazareth',
    'MBR': 'Maccabi Bnei Reineh',
    'SNZ': 'Sektzia Ness Ziona',
    'HAK': 'Hapoel Acre',
    'MHE': 'Maccabi Herzliya',
    'MNE': 'Maccabi Netanya',
    # Points at the canonical spelling directly: the un-apostrophised
    # 'Hapoel Raanana' is itself listed as a variant below.
    'HAR': "Hapoel Ra'anana",
    'HAC': 'Hapoel Acre',
    'IRH': 'Ironi Ramat HaSharon',
    'HAH': 'Hapoel Hadera',
    # Points at the canonical name directly: plain 'Ness Ziona' is itself
    # listed as a variant below.
    'NES': 'Sektzia Ness Ziona',
    'HJE': 'Hapoel Jerusalem',
    'HNG': 'Hapoel Nof HaGalil',
    'ITI': 'Ironi Tiberias',

    # Name variants to canonical names
    'Ashdod': 'F.C. Ashdod',
    'F.C. Ironi Ashdod': 'F.C. Ashdod',
    'Ness Ziona': 'Sektzia Ness Ziona',
    'Ironi Nir Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hakoah Amidar Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Rishon LeZion': 'Rishon LeZion',
    'Hapoel Raanana': "Hapoel Ra'anana",

    # Full names map to themselves
    'F.C. Ashdod': 'F.C. Ashdod',
    'Beitar Jerusalem': 'Beitar Jerusalem',
    'Bnei Sakhnin': 'Bnei Sakhnin',
    'Bnei Yehuda': 'Bnei Yehuda',
    'Hapoel Ashkelon': 'Hapoel Ashkelon',
    "Hapoel Be'er Sheva": "Hapoel Be'er Sheva",
    'Hapoel Haifa': 'Hapoel Haifa',
    'Hapoel Kfar Saba': 'Hapoel Kfar Saba',
    "Hapoel Ra'anana": "Hapoel Ra'anana",
    'Hapoel Tel Aviv': 'Hapoel Tel Aviv',
    'Ironi Kiryat Shmona': 'Ironi Kiryat Shmona',
    'Maccabi Haifa': 'Maccabi Haifa',
    'Maccabi Petah Tikva': 'Maccabi Petah Tikva',
    'Maccabi Tel Aviv': 'Maccabi Tel Aviv',
    'Hapoel Petah Tikva': 'Hapoel Petah Tikva',
    'Hapoel Ramat Gan': 'Hapoel Ramat Gan',
    'Hapoel Ramat HaSharon': 'Hapoel Ramat HaSharon',
    'Rishon LeZion': 'Rishon LeZion',
    'Maccabi Ahi Nazareth': 'Maccabi Ahi Nazareth',
    'Maccabi Bnei Reineh': 'Maccabi Bnei Reineh',
    'Sektzia Ness Ziona': 'Sektzia Ness Ziona',
    'Hapoel Acre': 'Hapoel Acre',
    'Maccabi Herzliya': 'Maccabi Herzliya',
    'Maccabi Netanya': 'Maccabi Netanya',
    'Ironi Ramat HaSharon': 'Ironi Ramat HaSharon',
    'Hapoel Hadera': 'Hapoel Hadera',
    'Hapoel Jerusalem': 'Hapoel Jerusalem',
    'Hapoel Nof HaGalil': 'Hapoel Nof HaGalil',
    'Ironi Tiberias': 'Ironi Tiberias',
}

def normalize_team_names(df, name_map=TEAM_NAME_MAP):
    """Return a copy of ``df`` with ``home_team`` / ``away_team`` canonicalised.

    Every value in the two columns is looked up once in ``name_map``; values
    without an entry are kept as they are. The input frame is left untouched.
    """
    normalized = df.copy()

    def to_canonical(name):
        # Unknown names pass through unchanged
        return name_map.get(name, name)

    for column in ('home_team', 'away_team'):
        normalized[column] = normalized[column].map(to_canonical)
    return normalized

def apply_season_specific_fixes(df, season):
    """Return a copy of ``df`` with the known per-season team corrections applied.

    In both corrected seasons the rows labelled 'Hapoel Ramat Gan' are
    relabelled: to 'Hapoel Acre' for 2006/07 and to "Hapoel Ra'anana" for
    2008/09. Any other season yields an unchanged copy.
    """
    fixed = df.copy()

    if season == '2006/07':
        replacement = 'Hapoel Acre'
    elif season == '2008/09':
        replacement = "Hapoel Ra'anana"
    else:
        return fixed

    # The same relabelling applies whether the club appears as host or visitor
    for side in ('home_team', 'away_team'):
        mislabelled = fixed[side] == 'Hapoel Ramat Gan'
        fixed.loc[mislabelled, side] = replacement
    return fixed

# Status line emitted after the mapping and both helper functions above are defined
print("‚úÖ Team name normalization functions loaded")

‚úÖ Team name normalization functions loaded


## 3. Load Match Data

Load the combined Wikipedia match data that was collected in the regular season notebook.

In [34]:
# Load all match data from Wikipedia
matches_path = DATA_DIR.joinpath("matches_all_seasons_ligat_haal_wikipedia.csv")

# Stop early, with guidance, when the collection notebook has not produced the file
if not matches_path.exists():
    print(f"‚ùå Match data not found: {matches_path}")
    print("\n‚ö†Ô∏è  Please run the '01_regular_season_collection.ipynb' notebook first to collect match data.")
    raise FileNotFoundError(f"Required file not found: {matches_path}")

# Read the combined match list and report its extent
all_matches = pd.read_csv(matches_path)
print(f"‚úÖ Loaded {len(all_matches)} matches from {all_matches['season'].nunique()} seasons")
print(f"   Season range: {all_matches['season'].min()} to {all_matches['season'].max()}")

# Abbreviations and name variants become canonical full names
all_matches = normalize_team_names(all_matches, name_map=TEAM_NAME_MAP)

# Run the per-season corrections on each season's rows and write them back in place
for season_name in all_matches['season'].unique():
    season_mask = all_matches['season'].eq(season_name)
    season_data = all_matches.loc[season_mask].copy()
    all_matches.loc[season_mask] = apply_season_specific_fixes(season_data, season_name)

print("‚úÖ Team names normalized and season-specific fixes applied")
print("\nüìä Data sample:")
display(all_matches.head())

‚úÖ Loaded 3533 matches from 20 seasons
   Season range: 2006/07 to 2025/26
‚úÖ Team names normalized and season-specific fixes applied

üìä Data sample:


Unnamed: 0,season,season_year,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points
0,2006/07,2006,Beitar Jerusalem,Bnei Yehuda,0,0,0,D,1,1
1,2006/07,2006,Beitar Jerusalem,F.C. Ashdod,2,0,2,H,3,0
2,2006/07,2006,Beitar Jerusalem,Hapoel Acre,0,0,0,D,1,1
3,2006/07,2006,Beitar Jerusalem,Hapoel Kfar Saba,2,0,2,H,3,0
4,2006/07,2006,Beitar Jerusalem,Hapoel Petah Tikva,2,0,2,H,3,0


## 4. Title Race Analysis Function

Calculate league standings after each round and track leadership changes.
This is the core function for analyzing the "title race" throughout a season.

In [35]:
def _record_match(stats, home, away, home_goals, away_goals):
    """Add one played match to the running per-team totals held in ``stats``."""
    home_row, away_row = stats[home], stats[away]

    home_row['played'] += 1
    home_row['gf'] += home_goals
    home_row['ga'] += away_goals
    home_row['gd'] = home_row['gf'] - home_row['ga']

    away_row['played'] += 1
    away_row['gf'] += away_goals
    away_row['ga'] += home_goals
    away_row['gd'] = away_row['gf'] - away_row['ga']

    if home_goals > away_goals:  # Home win
        home_row['won'] += 1
        home_row['points'] += 3
        away_row['lost'] += 1
    elif away_goals > home_goals:  # Away win
        away_row['won'] += 1
        away_row['points'] += 3
        home_row['lost'] += 1
    else:  # Draw
        home_row['drawn'] += 1
        away_row['drawn'] += 1
        home_row['points'] += 1
        away_row['points'] += 1


def _standings_table(stats):
    """Snapshot ``stats`` as a table ranked by points, goal difference, goals scored."""
    table = pd.DataFrame.from_dict(stats, orient='index')
    table.index.name = 'team'
    table = table.reset_index()
    table = table.sort_values(['points', 'gd', 'gf'], ascending=[False, False, False])
    table['position'] = range(1, len(table) + 1)
    return table


def calculate_league_table_by_round(matches_df, season_str="2016/17"):
    """
    Calculate league standings after each round/matchday and track leadership changes.

    Round numbers are estimated from row position: the season's rows are taken
    in their existing order and cut into consecutive chunks of
    ceil(n_teams / 2) matches, numbered from 1.

    Rows whose home or away goals are missing are not counted as played
    matches (they still occupy a slot when round numbers are assigned).

    Args:
        matches_df: DataFrame with columns 'season', 'home_team', 'away_team',
            'home_goals', 'away_goals' (team names already normalized).
        season_str: Season to analyze (e.g., "2016/17")

    Returns:
        - standings_by_round: dict mapping round_num -> DataFrame of standings
          (columns: team, played, won, drawn, lost, gf, ga, gd, points, position)
        - leadership_changes: list of tuples (round_num, new_leader); the first
          entry is the initial leader
        Both are None when the season has no rows in ``matches_df``.
    """
    season_matches = matches_df[matches_df['season'] == season_str].reset_index(drop=True)

    if season_matches.empty:
        print(f"‚ùå No matches found for season {season_str}")
        return None, None

    # Every club that appears as host or visitor takes part in the table
    home_teams = set(season_matches['home_team'].unique())
    away_teams = set(season_matches['away_team'].unique())
    teams = sorted(home_teams | away_teams)
    n_teams = len(teams)

    print(f"üìä Processing {season_str}: {len(season_matches)} matches, {n_teams} teams")

    # No match dates are available, so rounds are estimated from row position:
    # ceil(n_teams / 2) consecutive rows form one round.
    matches_per_round = (n_teams + 1) // 2
    season_matches = season_matches.assign(
        round_num=season_matches.index // matches_per_round + 1
    )

    # Running totals, updated round after round instead of being rebuilt each time
    stat_fields = ('played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points')
    stats = {team: dict.fromkeys(stat_fields, 0) for team in teams}

    standings_by_round = {}
    leadership_changes = []
    current_leader = None

    for round_num, round_matches in season_matches.groupby('round_num'):
        fixtures = zip(
            round_matches['home_team'], round_matches['away_team'],
            round_matches['home_goals'], round_matches['away_goals'],
        )
        for home, away, home_goals, away_goals in fixtures:
            # A missing score is not a result; counting it would register a
            # phantom draw and turn the goal totals into NaN.
            if pd.isna(home_goals) or pd.isna(away_goals):
                continue
            _record_match(stats, home, away, int(home_goals), int(away_goals))

        standings = _standings_table(stats)
        standings_by_round[int(round_num)] = standings

        # Track leadership changes
        new_leader = standings.iloc[0]['team']
        if new_leader != current_leader:
            leadership_changes.append((int(round_num), new_leader))
            current_leader = new_leader

    return standings_by_round, leadership_changes

# Status line emitted once calculate_league_table_by_round has been defined
print("‚úÖ Title race analysis function defined")

‚úÖ Title race analysis function defined


## 5. Analyze Single Season

Analyze the title race for a specific season.

In [36]:
# Analyze specific season
season = "2016/17"
print(f"üèÜ Analyzing Title Race: {season}")
print("=" * 80)

standings_by_round, leadership_changes = calculate_league_table_by_round(all_matches, season)

if not (standings_by_round and leadership_changes):
    print(f"‚ùå Failed to analyze season {season}")
else:
    print(f"\nüìä League Leadership Analysis - {season}")
    print("=" * 80)
    # The list starts with the initial leader, hence the "- 1"
    print(f"\nüîÑ Leadership Changes: {len(leadership_changes) - 1}")
    print("   (First leader doesn't count as a 'change')\n")

    print("Round-by-round first place:")
    for round_num, leader in leadership_changes:
        print(f"  ‚Ä¢ Round {round_num:2d}: {leader}")

    # Table after the last estimated round, top ten rows only
    final_round = max(standings_by_round.keys())
    print(f"\nüìã Final Standings After Round {final_round}:")
    final = standings_by_round[final_round]
    table_columns = ['position', 'team', 'played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points']
    display(final[table_columns].head(10))

    # Headline numbers for the top two
    print("\nüìà Season Statistics:")
    print(f"  ‚Ä¢ Total rounds: {len(standings_by_round)}")
    print(f"  ‚Ä¢ Teams: {len(final)}")
    leader_row = final.iloc[0]
    print(f"  ‚Ä¢ Leader: {leader_row['team']} ({leader_row['points']:.0f} pts, {leader_row['played']:.0f} games)")
    runner_up_row = final.iloc[1]
    print(f"  ‚Ä¢ Runner-up: {runner_up_row['team']} ({runner_up_row['points']:.0f} pts, {runner_up_row['played']:.0f} games)")
    print(f"  ‚Ä¢ Points gap: {leader_row['points'] - runner_up_row['points']:.0f} pts")

    print("\n‚ö†Ô∏è  Note: Round numbers are estimated (Wikipedia doesn't include dates)")
    print("   Actual match sequencing may differ slightly.")

üèÜ Analyzing Title Race: 2016/17
üìä Processing 2016/17: 182 matches, 14 teams

üìä League Leadership Analysis - 2016/17

üîÑ Leadership Changes: 3
   (First leader doesn't count as a 'change')

Round-by-round first place:
  ‚Ä¢ Round  1: F.C. Ashdod
  ‚Ä¢ Round  3: Beitar Jerusalem
  ‚Ä¢ Round  8: Bnei Sakhnin
  ‚Ä¢ Round 10: Hapoel Be'er Sheva

üìã Final Standings After Round 26:

üìä League Leadership Analysis - 2016/17

üîÑ Leadership Changes: 3
   (First leader doesn't count as a 'change')

Round-by-round first place:
  ‚Ä¢ Round  1: F.C. Ashdod
  ‚Ä¢ Round  3: Beitar Jerusalem
  ‚Ä¢ Round  8: Bnei Sakhnin
  ‚Ä¢ Round 10: Hapoel Be'er Sheva

üìã Final Standings After Round 26:


Unnamed: 0,position,team,played,won,drawn,lost,gf,ga,gd,points
5,1,Hapoel Be'er Sheva,26,18,5,3,54,13,41,59
13,2,Maccabi Tel Aviv,26,17,5,4,45,19,26,56
12,3,Maccabi Petah Tikva,26,13,9,4,36,23,13,48
0,4,Beitar Jerusalem,26,10,10,6,34,27,7,40
1,5,Bnei Sakhnin,26,10,9,7,26,26,0,39
11,6,Maccabi Haifa,26,10,8,8,30,25,5,38
10,7,Ironi Kiryat Shmona,26,9,8,9,35,33,2,35
6,8,Hapoel Haifa,26,8,4,14,29,36,-7,28
3,9,F.C. Ashdod,26,6,10,10,15,26,-11,28
8,10,Hapoel Ra'anana,26,7,7,12,14,29,-15,28



üìà Season Statistics:
  ‚Ä¢ Total rounds: 26
  ‚Ä¢ Teams: 14
  ‚Ä¢ Leader: Hapoel Be'er Sheva (59 pts, 26 games)
  ‚Ä¢ Runner-up: Maccabi Tel Aviv (56 pts, 26 games)
  ‚Ä¢ Points gap: 3 pts

‚ö†Ô∏è  Note: Round numbers are estimated (Wikipedia doesn't include dates)
   Actual match sequencing may differ slightly.


## 6. Multi-Season Title Race Analysis

Analyze leadership changes across all available seasons.

In [37]:
# Analyze all seasons
print("üèÜ Multi-Season Title Race Analysis")
print("="*80)

all_seasons = sorted(all_matches['season'].unique())
season_summary = []

for season_str in all_seasons:
    standings_by_round, leadership_changes = calculate_league_table_by_round(all_matches, season_str)
    
    if standings_by_round and leadership_changes:
        final_round = max(standings_by_round.keys())
        final_standings = standings_by_round[final_round]
        champion = final_standings.iloc[0]
        runner_up = final_standings.iloc[1]
        
        season_summary.append({
            'season': season_str,
            'champion': champion['team'],
            'champion_points': int(champion['points']),
            'runner_up': runner_up['team'],
            'runner_up_points': int(runner_up['points']),
            'points_gap': int(champion['points'] - runner_up['points']),
            'leadership_changes': len(leadership_changes) - 1,  # First doesn't count as change
            'total_rounds': len(standings_by_round)
        })

print()  # Newline after processing messages

# Create summary DataFrame
summary_df = pd.DataFrame(season_summary)

print(f"\nüìä Title Race Summary ({len(summary_df)} seasons):")
print("="*80)
display(summary_df)

# Save to interim folder for later use
summary_path = INTERIM_DIR / "title_race_summary.csv"
summary_df.to_csv(summary_path, index=False, encoding='utf-8-sig')
print(f"\n‚úÖ Summary saved to: {summary_path}")

# Some interesting statistics
print(f"\nüìà Interesting Statistics:")
print(f"  ‚Ä¢ Most competitive season: {summary_df.loc[summary_df['leadership_changes'].idxmax(), 'season']} ({summary_df['leadership_changes'].max()} leadership changes)")
print(f"  ‚Ä¢ Most dominant win: {summary_df.loc[summary_df['points_gap'].idxmax(), 'season']} ({summary_df['points_gap'].max()} points gap)")
print(f"  ‚Ä¢ Closest finish: {summary_df.loc[summary_df['points_gap'].idxmin(), 'season']} ({summary_df['points_gap'].min()} points gap)")
print(f"  ‚Ä¢ Average leadership changes per season: {summary_df['leadership_changes'].mean():.1f}")
print(f"  ‚Ä¢ Average points gap: {summary_df['points_gap'].mean():.1f} points")

# Champion frequency
print(f"\nüèÜ Most Successful Teams (Championships):")
champion_counts = summary_df['champion'].value_counts()
for team, count in champion_counts.head(5).items():
    print(f"  ‚Ä¢ {team}: {count} titles")

üèÜ Multi-Season Title Race Analysis
üìä Processing 2006/07: 132 matches, 12 teams
üìä Processing 2007/08: 132 matches, 12 teams
üìä Processing 2008/09: 132 matches, 13 teams
üìä Processing 2009/10: 239 matches, 16 teams
üìä Processing 2010/11: 234 matches, 16 teams
üìä Processing 2009/10: 239 matches, 16 teams
üìä Processing 2010/11: 234 matches, 16 teams
üìä Processing 2011/12: 240 matches, 16 teams
üìä Processing 2012/13: 182 matches, 14 teams
üìä Processing 2011/12: 240 matches, 16 teams
üìä Processing 2012/13: 182 matches, 14 teams
üìä Processing 2013/14: 182 matches, 14 teams
üìä Processing 2014/15: 181 matches, 14 teams
üìä Processing 2015/16: 182 matches, 14 teams
üìä Processing 2013/14: 182 matches, 14 teams
üìä Processing 2014/15: 181 matches, 14 teams
üìä Processing 2015/16: 182 matches, 14 teams
üìä Processing 2016/17: 182 matches, 14 teams
üìä Processing 2017/18: 181 matches, 14 teams
üìä Processing 2018/19: 182 matches, 14 teams
üìä Processing 2016/1

Unnamed: 0,season,champion,champion_points,runner_up,runner_up_points,points_gap,leadership_changes,total_rounds
0,2006/07,Beitar Jerusalem,43,Hapoel Tel Aviv,39,4,0,22
1,2007/08,Beitar Jerusalem,51,Maccabi Netanya,40,11,0,22
2,2008/09,Maccabi Haifa,42,Hapoel Tel Aviv,41,1,4,19
3,2009/10,Maccabi Haifa,77,Hapoel Tel Aviv,71,6,2,30
4,2010/11,Maccabi Haifa,70,Hapoel Tel Aviv,62,8,3,30
5,2011/12,Ironi Kiryat Shmona,66,Hapoel Tel Aviv,53,13,6,30
6,2012/13,Maccabi Tel Aviv,59,Maccabi Haifa,49,10,4,26
7,2013/14,Maccabi Tel Aviv,66,Hapoel Be'er Sheva,59,7,3,26
8,2014/15,Maccabi Tel Aviv,55,Hapoel Be'er Sheva,49,6,5,26
9,2015/16,Hapoel Be'er Sheva,64,Maccabi Tel Aviv,61,3,1,26



‚úÖ Summary saved to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\title_race_summary.csv

üìà Interesting Statistics:
  ‚Ä¢ Most competitive season: 2011/12 (6 leadership changes)
  ‚Ä¢ Most dominant win: 2018/19 (22 points gap)
  ‚Ä¢ Closest finish: 2008/09 (1 points gap)
  ‚Ä¢ Average leadership changes per season: 2.4
  ‚Ä¢ Average points gap: 6.0 points

üèÜ Most Successful Teams (Championships):
  ‚Ä¢ Maccabi Haifa: 6 titles
  ‚Ä¢ Maccabi Tel Aviv: 6 titles
  ‚Ä¢ Hapoel Be'er Sheva: 5 titles
  ‚Ä¢ Beitar Jerusalem: 2 titles
  ‚Ä¢ Ironi Kiryat Shmona: 1 titles


## 7. Detailed Season Export

Export detailed round-by-round standings for each season to interim folder for further analysis.

In [38]:
# Export detailed standings for each season
print("üíæ Exporting detailed round-by-round standings...")
print("="*80)

export_count = 0
for season_str in all_seasons:
    standings_by_round, leadership_changes = calculate_league_table_by_round(all_matches, season_str)
    
    if standings_by_round:
        # Combine all rounds into single DataFrame
        all_rounds = []
        for round_num, standings in standings_by_round.items():
            standings_copy = standings.copy()
            standings_copy['round'] = round_num
            standings_copy['season'] = season_str
            all_rounds.append(standings_copy)
        
        combined_standings = pd.concat(all_rounds, ignore_index=True)
        
        # Reorder columns for clarity
        cols = ['season', 'round', 'position', 'team', 'played', 'won', 'drawn', 'lost', 
                'gf', 'ga', 'gd', 'points']
        combined_standings = combined_standings[cols]
        
        # Save to file
        season_file = INTERIM_DIR / f"standings_by_round_{season_str.replace('/', '_')}.csv"
        combined_standings.to_csv(season_file, index=False, encoding='utf-8-sig')
        export_count += 1

print(f"\n‚úÖ Exported {export_count} season files to: {INTERIM_DIR}")
print(f"   Files named: standings_by_round_YYYY_YY.csv")
print("\nüí° These files contain complete round-by-round standings for each season.")
print("   Use them for detailed analysis, visualizations, or further processing.")

üíæ Exporting detailed round-by-round standings...
üìä Processing 2006/07: 132 matches, 12 teams
üìä Processing 2007/08: 132 matches, 12 teams
üìä Processing 2007/08: 132 matches, 12 teams
üìä Processing 2008/09: 132 matches, 13 teams
üìä Processing 2009/10: 239 matches, 16 teams
üìä Processing 2008/09: 132 matches, 13 teams
üìä Processing 2009/10: 239 matches, 16 teams
üìä Processing 2010/11: 234 matches, 16 teams
üìä Processing 2011/12: 240 matches, 16 teams
üìä Processing 2010/11: 234 matches, 16 teams
üìä Processing 2011/12: 240 matches, 16 teams
üìä Processing 2012/13: 182 matches, 14 teams
üìä Processing 2013/14: 182 matches, 14 teams
üìä Processing 2012/13: 182 matches, 14 teams
üìä Processing 2013/14: 182 matches, 14 teams
üìä Processing 2014/15: 181 matches, 14 teams
üìä Processing 2015/16: 182 matches, 14 teams
üìä Processing 2014/15: 181 matches, 14 teams
üìä Processing 2015/16: 182 matches, 14 teams
üìä Processing 2016/17: 182 matches, 14 teams
üìä Pro

---

## Summary

This notebook analyzes the **title race** in Ligat Ha'al across all available seasons.

### Key Outputs:

1. **Season-by-season leadership analysis** - Who led after each round
2. **Leadership change tracking** - How many times the leader changed
3. **Champion statistics** - Points totals, gaps to runner-up
4. **Historical comparison** - Most competitive vs. most dominant seasons

### Data Files Created:

- `data/interim/title_race_summary.csv` - Summary statistics for all seasons
- `data/interim/standings_by_round_YYYY_YY.csv` - Detailed round-by-round standings per season

### Important Notes:

- Round numbers are **estimated** since Wikipedia doesn't include match dates
- Analysis based on regular season matches from Wikipedia results matrix
- Team names are normalized for consistency across all seasons
- Standings calculated using standard rules: 3 pts win, 1 pt draw, 0 pts loss
- Tiebreakers: points → goal difference → goals scored

### Next Steps:

1. Visualize leadership changes over time
2. Identify patterns in title races (comebacks, dominant leads, etc.)
3. Correlate with attendance data
4. Analyze home/away performance of title contenders
5. Statistical modeling of championship probability by round

In [41]:
from pathlib import Path
import pandas as pd

def compute_title_race_from_csv(csv_path: Path, season_label: str, matches_per_round: int = 6):
    """Replay a season round by round from an ordered match CSV and report the title race.

    The rows of the CSV are consumed strictly in file order,
    ``matches_per_round`` rows per round; rows left over after the last
    complete round are ignored. Team names are normalised through
    ``TEAM_NAME_MAP`` when that global exists and is a dict.

    Side effects: prints a summary and writes the final table to
    ``INTERIM_DIR / "standings_by_round_<season>_final.csv"`` (with "/" in
    ``season_label`` replaced by "_").

    Args:
        csv_path: CSV with columns home_team, away_team, home_goals, away_goals.
        season_label: Season label such as '2006/07'; used in output and file name.
        matches_per_round: Number of consecutive rows that form one round.

    Returns:
        dict with keys 'final_table' (DataFrame indexed by team, sorted by
        Pts, GD, GF, then shortened name), 'leader_changes' (list of
        (round, previous_leader, new_leader)), 'different_leaders_count',
        'different_leaders', 'rounds' and 'leaders_sequence' (leader after
        each round).

    Raises:
        ValueError: if the file does not contain at least one complete round.
    """
    df = pd.read_csv(csv_path)
    # Enforce strict ordering: matches_per_round rows per round in given order
    total_matches = len(df)
    total_rounds = total_matches // matches_per_round
    if total_rounds < 1:
        # Without this guard the final-table lookup below fails with an opaque KeyError
        raise ValueError(
            f"{csv_path} holds {total_matches} matches, fewer than one complete "
            f"round of {matches_per_round}"
        )

    # Normalize team names using TEAM_NAME_MAP if present
    if 'TEAM_NAME_MAP' in globals() and isinstance(TEAM_NAME_MAP, dict):
        df['home_team'] = df['home_team'].map(lambda x: TEAM_NAME_MAP.get(x, x))
        df['away_team'] = df['away_team'].map(lambda x: TEAM_NAME_MAP.get(x, x))

    teams = sorted(set(df['home_team']).union(set(df['away_team'])))

    def shorten(name: str) -> str:
        """Abbreviate the club prefix: 'Maccabi X' -> 'M. X', 'Hapoel X' -> 'H. X', 'Beitar X' -> 'B. X'."""
        for prefix, short in (('Maccabi ', 'M. '), ('Hapoel ', 'H. '), ('Beitar ', 'B. ')):
            if name.startswith(prefix):
                return short + name.split(' ', 1)[1]
        return name

    # Running totals, one row per team
    standings = pd.DataFrame({
        'team': teams,
        'Pts': 0,
        'W': 0,
        'D': 0,
        'L': 0,
        'GF': 0,
        'GA': 0,
    }).set_index('team')

    def update_match(h, a, hg, ag):
        """Add one result (home h, away a, goals hg-ag) to the running totals."""
        standings.at[h, 'GF'] += hg
        standings.at[h, 'GA'] += ag
        standings.at[a, 'GF'] += ag
        standings.at[a, 'GA'] += hg
        if hg > ag:
            standings.at[h, 'Pts'] += 3
            standings.at[h, 'W'] += 1
            standings.at[a, 'L'] += 1
        elif hg < ag:
            standings.at[a, 'Pts'] += 3
            standings.at[a, 'W'] += 1
            standings.at[h, 'L'] += 1
        else:
            standings.at[h, 'Pts'] += 1
            standings.at[a, 'Pts'] += 1
            standings.at[h, 'D'] += 1
            standings.at[a, 'D'] += 1

    leader_changes = []
    leaders_sequence = []
    leader = None
    standings_by_round = {}

    for rnd in range(1, total_rounds + 1):
        start = (rnd - 1) * matches_per_round
        rnd_matches = df.iloc[start:start + matches_per_round]
        # Explicit loop: the update mutates shared totals, so each row must be
        # processed exactly once (DataFrame.apply is not meant for side effects).
        for h, a, hg, ag in zip(rnd_matches['home_team'], rnd_matches['away_team'],
                                rnd_matches['home_goals'], rnd_matches['away_goals']):
            update_match(h, a, int(hg), int(ag))

        standings_copy = standings.copy()
        standings_copy['GD'] = standings_copy['GF'] - standings_copy['GA']
        standings_copy['Team'] = standings_copy.index.map(shorten)
        # Ranking: points, goal difference, goals scored, then shortened name A-Z
        standings_copy = standings_copy.sort_values(
            by=['Pts', 'GD', 'GF', 'Team'], ascending=[False, False, False, True]
        )
        standings_by_round[rnd] = standings_copy

        new_leader = standings_copy.index[0]
        leaders_sequence.append(new_leader)
        if leader is None:
            leader = new_leader
        elif new_leader != leader:
            leader_changes.append((rnd, leader, new_leader))
            leader = new_leader

    final_table = standings_by_round[total_rounds].copy()

    # Export final table
    out_path = INTERIM_DIR / f"standings_by_round_{season_label.replace('/', '_')}_final.csv"
    final_table.to_csv(out_path, index=True)

    # Summary: distinct leaders in order of first appearance
    different_leaders = pd.Index(leaders_sequence).unique().tolist()

    print(f"Season {season_label}: rounds={total_rounds}, matches={total_matches}")
    print(f"Leader changes: {len(leader_changes)}")
    for rnd, prev, nxt in leader_changes:
        print(f"  Round {rnd}: {shorten(prev)} -> {shorten(nxt)}")
    print(f"Different leader teams: {len(different_leaders)} ({', '.join(shorten(t) for t in different_leaders)})")
    print(f"Final table saved to: {out_path}")

    return {
        'final_table': final_table,
        'leader_changes': leader_changes,
        'different_leaders_count': len(different_leaders),
        'different_leaders': different_leaders,
        'rounds': total_rounds,
        'leaders_sequence': leaders_sequence,
    }

# Run for 2006/07
csv_2006 = INTERIM_DIR.joinpath('matches_2006_07_ligat_haal_regular_corrected.csv')
res_2006 = compute_title_race_from_csv(csv_path=csv_2006, season_label='2006/07')

# Same replay for 2007/08
csv_2007 = INTERIM_DIR.joinpath('matches_2007_08_ligat_haal_regular_corrected.csv')
res_2007 = compute_title_race_from_csv(csv_path=csv_2007, season_label='2007/08')

# Same replay for 2008/09
csv_2008 = INTERIM_DIR.joinpath('matches_2008_09_ligat_haal_regular_corrected.csv')
res_2008 = compute_title_race_from_csv(csv_path=csv_2008, season_label='2008/09')

Season 2006/07: rounds=33, matches=198
Leader changes: 5
  Round 3: H. Kfar Saba -> B. Jerusalem
  Round 5: B. Jerusalem -> M. Petah Tikva
  Round 6: M. Petah Tikva -> B. Jerusalem
  Round 9: B. Jerusalem -> H. Tel Aviv
  Round 10: H. Tel Aviv -> B. Jerusalem
Different leader teams: 4 (H. Kfar Saba, B. Jerusalem, M. Petah Tikva, H. Tel Aviv)
Final table saved to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\standings_by_round_2006_07_final.csv
Season 2007/08: rounds=33, matches=198
Leader changes: 0
Different leader teams: 1 (B. Jerusalem)
Final table saved to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\standings_by_round_2007_08_final.csv
Season 2008/09: rounds=33, matches=198
Leader changes: 8
  Round 2: M. Haifa -> M. Netanya
  Round 6: M. Netanya -> M. Haifa
  Round 9: M. Haifa -> M. Netanya
  Round 10: M. Netanya -> M. Haifa
  Round 27: M. Haifa -> H. Tel Aviv
  Round 28: H. Tel Aviv -> M. Haifa
  Roun

In [42]:
# Re-run Positions by Round scraping with proper User-Agent to bypass 403
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from pathlib import Path

# Browser-like request headers passed to the first requests.get call in
# fetch_positions_table (see the cell comment above: sent to avoid HTTP 403).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive'
}

def season_title(start_year: int) -> str:
    """Build the article title for a season, e.g. 2015 -> '2015–16 Israeli Premier League'.

    The two years are joined by an en dash (U+2013), written as an escape so
    the title survives any re-encoding of this file. The end year is the last
    two digits of ``start_year + 1``, zero-padded.
    """
    end_two = str((start_year + 1) % 100).zfill(2)
    return f"{start_year}\u2013{end_two} Israeli Premier League"

def _parse_position_cell(text: str):
    """Return the integer a table cell starts with, or None if it holds no position."""
    # Empty cells and en/em dash placeholders (U+2013 / U+2014) carry no position
    if text in {'', '\u2013', '\u2014'}:
        return None
    # Leading digits only, so a cell with trailing non-digit text still parses
    leading_digits = re.match(r'\d+', text)
    return int(leading_digits.group()) if leading_digits else None


def fetch_positions_table(start_year: int):
    """Download a season's Wikipedia page and extract its "Positions by round" table.

    The page title comes from ``season_title(start_year)``. The table is the
    first ``wikitable`` whose caption contains 'Positions by round', or
    failing that the first one whose first header cell mentions both 'Team'
    and 'Round'. The first table row is treated as the header and skipped.

    Args:
        start_year: Year in which the season starts.

    Returns:
        DataFrame with a 'team' column followed by 'R1'..'Rn' position
        columns (missing positions are None/NaN), or None when the request
        fails, the page status is not 200, the table is not found, or the
        table yields no team rows. A message is printed for the first three.
    """
    title = season_title(start_year)
    url = 'https://en.wikipedia.org/wiki/' + title.replace(' ', '_')
    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        if resp.status_code == 403:
            print(f"üîí 403 blocked for {title}, retrying without headers...")
            resp = requests.get(url, timeout=15)
    except requests.RequestException as exc:
        # A timeout or connection error on one season must not abort the whole run
        print(f"Request failed for {title}: {exc}")
        return None
    if resp.status_code != 200:
        print(f"‚ùå {title} page status {resp.status_code}")
        return None

    soup = BeautifulSoup(resp.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})

    # Preferred: identify the table by its caption
    target = None
    for tbl in tables:
        caption = tbl.find('caption')
        if caption and 'Positions by round' in caption.get_text():
            target = tbl
            break
    # Fallback: identify it by the text of its first header cell
    if target is None:
        for tbl in tables:
            first_th = tbl.find('th')
            if first_th and ('Team' in first_th.get_text() and 'Round' in first_th.get_text()):
                target = tbl
                break
    if target is None:
        print(f"‚ùå Positions table not found for {title}")
        return None

    # One (team, positions) pair per data row; the first row is the header
    data = []
    for row in target.find_all('tr')[1:]:
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue
        team = cells[0].get_text(strip=True)
        if not team or team.lower().startswith('notes'):
            continue
        positions = [_parse_position_cell(cell.get_text(strip=True)) for cell in cells[1:]]
        data.append((team, positions))
    if not data:
        return None

    # Short rows are padded with None so every team has a value for every round
    max_rounds = max(len(positions) for _, positions in data)
    cols = ['team'] + [f'R{r}' for r in range(1, max_rounds + 1)]
    records = []
    for team, positions in data:
        padded = positions + [None] * (max_rounds - len(positions))
        rec = {'team': team}
        for i, pos in enumerate(padded, start=1):
            rec[f'R{i}'] = pos
        records.append(rec)
    return pd.DataFrame(records, columns=cols)

def extract_leader_sequence(df_positions: pd.DataFrame):
    """
    Derive the first-place history from a positions-by-round table.

    Parameters
    ----------
    df_positions : DataFrame with a 'team' column plus one column per round
        whose name starts with 'R'; a cell value of 1 marks the leader.

    Returns
    -------
    (leader_seq, changes, distinct)
        leader_seq : one entry per round column, the 'team' of the first row
            holding position 1, or None when no row holds it.
        changes    : list of (round_number, previous_leader, new_leader),
            with round_number counted from 1 in column order. Rounds without
            a leader are skipped and do not reset the previous leader.
        distinct   : leaders in order of first appearance.
    """
    round_cols = [name for name in df_positions.columns if name.startswith('R')]

    leader_seq = []
    for col in round_cols:
        top_rows = df_positions[df_positions[col] == 1]
        leader_seq.append(None if top_rows.empty else top_rows.iloc[0]['team'])

    changes = []
    current = None
    for round_no, leader in enumerate(leader_seq, start=1):
        if leader is None:
            continue
        if current is not None and leader != current:
            changes.append((round_no, current, leader))
        current = leader

    # dict keys keep insertion order, giving first-appearance ordering.
    distinct = list(dict.fromkeys(name for name in leader_seq if name))
    return leader_seq, changes, distinct

# Scrape the positions-by-round table for every season starting in 2015..2023,
# print how first place changed hands, save each season's table as CSV and
# collect one summary row per season.
positions_summary = []
for start_year in range(2015, 2024):
    df_pos = fetch_positions_table(start_year)
    # A None result means no table could be obtained for this season; skip it.
    if df_pos is None:
        continue
    # Season label in "YYYY/YY" form, e.g. 2015 -> "2015/16".
    season_label = f"{start_year}/{str((start_year+1)%100).zfill(2)}"
    leader_seq, changes, distinct = extract_leader_sequence(df_pos)
    print(f"\nSeason {season_label} Positions by Round (scraped):")
    # Round columns are the ones whose name starts with 'R'.
    print(f"  Rounds: {len([c for c in df_pos.columns if c.startswith('R')])}")
    print(f"  Leader changes: {len(changes)}")
    for rnd, prev, nxt in changes:
        print(f"    Round {rnd}: {prev} -> {nxt}")
    print(f"  Distinct leaders: {len(distinct)} ({', '.join(distinct)})")
    # '/' is not valid in a file name, so "2015/16" becomes "2015_16".
    out_file = INTERIM_DIR / f"positions_by_round_{season_label.replace('/', '_')}.csv"
    df_pos.to_csv(out_file, index=False)
    positions_summary.append({
        'season': season_label,
        'rounds': len([c for c in df_pos.columns if c.startswith('R')]),
        'leader_changes': len(changes),
        'distinct_leaders': len(distinct),
        # Leaders in order of first appearance, '|'-separated.
        'leaders': '|'.join(distinct)
    })

# Persist the per-season summary only when at least one season was scraped.
if positions_summary:
    summary_df = pd.DataFrame(positions_summary)
    summary_file = INTERIM_DIR / 'positions_by_round_leader_changes_summary.csv'
    summary_df.to_csv(summary_file, index=False)
    print(f"\n‚úÖ Saved summary to {summary_file}")
else:
    print('\n‚ö†Ô∏è Still no seasons scraped successfully.')



Season 2015/16 Positions by Round (scraped):
  Rounds: 36
  Leader changes: 7
    Round 2: Bnei Yehuda -> Maccabi Petah Tikva
    Round 6: Maccabi Petah Tikva -> Maccabi Tel Aviv
    Round 8: Maccabi Tel Aviv -> Hapoel Be'er Sheva
    Round 11: Hapoel Be'er Sheva -> Maccabi Tel Aviv
    Round 15: Maccabi Tel Aviv -> Hapoel Be'er Sheva
    Round 30: Hapoel Be'er Sheva -> Maccabi Tel Aviv
    Round 31: Maccabi Tel Aviv -> Hapoel Be'er Sheva
  Distinct leaders: 4 (Bnei Yehuda, Maccabi Petah Tikva, Maccabi Tel Aviv, Hapoel Be'er Sheva)

Season 2016/17 Positions by Round (scraped):
  Rounds: 36
  Leader changes: 3
    Round 3: Maccabi Tel Aviv -> Hapoel Be'er Sheva
    Round 4: Hapoel Be'er Sheva -> Maccabi Tel Aviv
    Round 7: Maccabi Tel Aviv -> Hapoel Be'er Sheva
  Distinct leaders: 2 (Maccabi Tel Aviv, Hapoel Be'er Sheva)

Season 2017/18 Positions by Round (scraped):
  Rounds: 26
  Leader changes: 8
    Round 6: Beitar Jerusalem -> Hapoel Haifa
    Round 13: Hapoel Haifa -> Hapoel Be'

In [59]:
# Title race: compute leaders and changes from 2012/13 onward (the default start_season_year) using Transfermarkt match files
# Simple logic: every 7 rows = 1 round (for 14-team leagues)
from pathlib import Path
import re
import pandas as pd
from typing import Tuple, Dict, List, Optional


def _parse_score(score: str) -> tuple[int, int]:
    h, a = score.split(":")
    return int(h), int(a)


def _default_matches_dir() -> Path:
    # Prefer absolute workspace path if available; else try relative to this notebook
    candidates = [
        Path.cwd() / "ligat_haal_project" / "notebooks" / "data" / "matches",
        Path.cwd() / ".." / "data" / "matches",
        Path.cwd() / "data" / "matches",
    ]
    for p in candidates:
        if p.exists():
            return p.resolve()
    # Fallback to the known repo layout
    return (Path.cwd() / "ligat_haal_project" / "notebooks" / "data" / "matches").resolve()


def _season_from_filename(p: Path) -> Optional[str]:
    m = re.search(r"matches_(\d{4})_(\d{2})_ligat_haal_transfermarkt\.csv$", p.name)
    if not m:
        return None
    y1, y2 = int(m.group(1)), int(m.group(2))
    return f"{y1}/{str(y2).zfill(2)}"


def _list_season_files(matches_dir: Path, min_year: int = 2012) -> List[Path]:
    files = sorted(matches_dir.glob("matches_*_ligat_haal_transfermarkt.csv"))
    kept: List[tuple[int, int, Path]] = []
    for f in files:
        m = re.search(r"matches_(\d{4})_(\d{2})_ligat_haal_transfermarkt\.csv$", f.name)
        if not m:
            continue
        y1, y2 = int(m.group(1)), int(m.group(2))
        if y1 >= min_year:
            kept.append((y1, y2, f))
    kept.sort()
    return [f for _, __, f in kept]


def _compute_round_leaders(df: pd.DataFrame, matches_per_round: int = 7) -> pd.DataFrame:
    """
    Compute league leaders after each round using STRICT row-order division:
    - First `matches_per_round` rows = round 1
    - Next `matches_per_round` rows = round 2, etc.
    
    This is the EXACT interpretation requested by the user.
    """
    df = df.copy().reset_index(drop=True)

    teams = sorted(set(df["home"]).union(set(df["away"])))
    if len(teams) < 4:
        raise ValueError("Too few teams detected; file may be malformed")

    # STRICT: divide by matches_per_round (default 7 for 14-team league)
    df["round_calc"] = (df.index // matches_per_round) + 1

    stats: Dict[str, Dict[str, int]] = {
        t: dict(points=0, gf=0, ga=0, gd=0, played=0, wins=0, draws=0, losses=0) for t in teams
    }

    rows: List[Dict] = []
    leader_prev: Optional[str] = None
    change_count = 0
    leaders_seen: set = set()

    for rnd in sorted(df["round_calc"].unique()):
        sub = df[df["round_calc"] == rnd]
        for _, r in sub.iterrows():
            h, a = r["home"], r["away"]
            hg, ag = _parse_score(r["score"])  # type: ignore[arg-type]

            sh, sa = stats[h], stats[a]
            sh["gf"] += hg; sh["ga"] += ag; sh["gd"] += (hg - ag); sh["played"] += 1
            sa["gf"] += ag; sa["ga"] += hg; sa["gd"] += (ag - hg); sa["played"] += 1
            if hg > ag:
                sh["points"] += 3; sh["wins"] += 1; sa["losses"] += 1
            elif hg < ag:
                sa["points"] += 3; sa["wins"] += 1; sh["losses"] += 1
            else:
                sh["points"] += 1; sa["points"] += 1; sh["draws"] += 1; sa["draws"] += 1

        table = sorted(stats.items(), key=lambda kv: (kv[1]["points"], kv[1]["gd"], kv[1]["gf"]), reverse=True)
        leader = table[0][0]
        leaders_seen.add(leader)
        changed = leader_prev is not None and leader != leader_prev
        if changed:
            change_count += 1

        rows.append({
            "round": int(rnd),
            "leader": leader,
            "leader_points": table[0][1]["points"],
            "leader_gd": table[0][1]["gd"],
            "leader_gf": table[0][1]["gf"],
            "first_place_changed": bool(changed),
            "prev_leader": leader_prev,
            "change_count": change_count,
        })
        leader_prev = leader

    return pd.DataFrame(rows)


def build_title_race_tables(matches_dir: Optional[Path | str] = None,
                            start_season_year: int = 2012,
                            matches_per_round: int = 7) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build per-round leader tables for every season file whose start year is
    at least ``start_season_year``, using the Transfermarkt match CSVs.

    Round definition is strict row order: every ``matches_per_round`` rows of
    a CSV form one round (see ``_compute_round_leaders``).

    Parameters
    ----------
    matches_dir : directory containing the
        ``matches_<YYYY>_<YY>_ligat_haal_transfermarkt.csv`` files. When None,
        ``_default_matches_dir()`` is used.
    start_season_year : earliest season start year to include.
    matches_per_round : rows per round, applied to every season processed
        (default 7, i.e. 14-team leagues).

    Returns
    -------
    (per_round_df, summary_df)
        per_round_df : concatenation of all seasons with columns
            ['season', 'round', 'leader', 'leader_points', 'leader_gd',
             'leader_gf', 'first_place_changed', 'prev_leader', 'change_count'].
        summary_df : one row per season with columns
            ['season', 'leadership_changes', 'first_leader', 'last_leader',
             'last_round', 'distinct_leaders'].
        Both are empty DataFrames when no season could be processed.

    Files that lack a required column or that ``_compute_round_leaders``
    rejects with ValueError are skipped, and the reason is printed.
    """
    matches_dir = _default_matches_dir() if matches_dir is None else Path(matches_dir)

    required = {"round", "home", "score", "away"}
    per_season: List[pd.DataFrame] = []

    for f in _list_season_files(matches_dir, min_year=start_season_year):
        season_label = _season_from_filename(f)
        if season_label is None:
            continue
        df = pd.read_csv(f)
        missing = required - set(df.columns)
        if missing:
            print(f"Skipping {f.name}: missing columns {sorted(missing)}")
            continue
        try:
            season_rounds = _compute_round_leaders(df, matches_per_round=matches_per_round)
        except ValueError as exc:
            print(f"Skipping {f.name}: {exc}")
            continue
        season_rounds.insert(0, "season", season_label)
        per_season.append(season_rounds)

    if not per_season:
        return pd.DataFrame(), pd.DataFrame()

    per_round_all = pd.concat(per_season, ignore_index=True)

    # 'first'/'last' rely on rows being in round order within each season,
    # which is how _compute_round_leaders emits them.
    summary = (
        per_round_all.groupby("season").agg(
            leadership_changes=("first_place_changed", "sum"),
            first_leader=("leader", "first"),
            last_leader=("leader", "last"),
            last_round=("round", "max"),
            distinct_leaders=("leader", "nunique"),
        ).reset_index()
    )

    return per_round_all, summary

# Example usage (uncomment to run):
# per_round, summary = build_title_race_tables()
# display(summary)


In [60]:
# Run Transfermarkt title race build, save outputs, and print results.
# Defines the kernel-level names `matches_dir`, `per_round` and `summary`.
from pathlib import Path

matches_dir = _default_matches_dir()
print(f"üóÇ Using matches dir: {matches_dir}")

# Debug: list candidate files and quick round-size check
files = sorted(Path(matches_dir).glob("matches_*_ligat_haal_transfermarkt.csv"))
print(f"Found {len(files)} candidate files:")
for f in files:
    try:
        d = pd.read_csv(f)
        ok_cols = {"round", "home", "score", "away"}.issubset(d.columns)
        # Distinct row counts per value of the 'round' column (diagnostic only;
        # the leader computation itself chunks rows by position, not by this column).
        sizes = d.groupby("round").size().unique() if ok_cols else "<missing cols>"
        print(f"  - {f.name} | cols_ok={ok_cols} | per_round={sizes}")
    except Exception as e:
        # Best-effort diagnostics: report the failure and keep listing the other files.
        print(f"  - {f.name} | read error: {e}")

# Direct test on a known season to validate the helper
try:
    test_f = Path(matches_dir) / "matches_2015_16_ligat_haal_transfermarkt.csv"
    if test_f.exists():
        dtest = pd.read_csv(test_f)
        print(f"\nüß™ Testing season file: {test_f.name} | rows={len(dtest)}")
        # Uses the default matches_per_round of _compute_round_leaders.
        pr_test = _compute_round_leaders(dtest)
        print(f"   Built per-round leaders: {len(pr_test)} rounds")
        display(pr_test.head())
    else:
        print("\nüß™ Test file for 2015/16 not found.")
except Exception as e:
    print(f"\n‚ùå Test compute failed: {e}")

# Full build over every season file accepted by build_title_race_tables.
per_round, summary = build_title_race_tables(matches_dir)

if per_round.empty or summary.empty:
    print("\n‚ö†Ô∏è No seasons processed. Check matches directory and file format.")
else:
    seasons = summary['season'].tolist()
    print(f"\n‚úÖ Processed {len(seasons)} seasons: {', '.join(seasons)}")

    print("\nüìä Summary (leadership changes and leaders):")
    display(summary)

    # Save files (written with a UTF-8 BOM via 'utf-8-sig')
    out_per_round = INTERIM_DIR / "tm_title_race_per_round.csv"
    out_summary = INTERIM_DIR / "tm_title_race_summary.csv"
    per_round.to_csv(out_per_round, index=False, encoding='utf-8-sig')
    summary.to_csv(out_summary, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Saved per-round leaders to: {out_per_round}")
    print(f"üíæ Saved summary to: {out_summary}")

    # Small preview for the first season
    first_season = seasons[0]
    preview = per_round[per_round['season'] == first_season].head(10)
    print(f"\nüîé Preview: {first_season} (first 10 rounds)")
    display(preview)


üóÇ Using matches dir: C:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\matches
Found 21 candidate files:
  - matches_2006_07_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2007_08_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2008_09_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2009_10_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2010_11_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2011_12_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2012_13_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2013_14_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2014_15_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2015_16_ligat_haal_transfermarkt.csv | cols_ok=True | per_round=[1]
  - matches_2016_17_ligat_haal_transfermarkt.csv | cols_ok=True | 

Unnamed: 0,round,leader,leader_points,leader_gd,leader_gf,first_place_changed,prev_leader,change_count
0,1,Bnei Yehuda,3,3,3,False,,0
1,2,M. Petah Tikva,6,4,4,True,Bnei Yehuda,1
2,3,M. Petah Tikva,9,6,7,False,M. Petah Tikva,1
3,4,M. Petah Tikva,12,7,8,False,M. Petah Tikva,1
4,5,M. Petah Tikva,12,6,8,False,M. Petah Tikva,1



‚úÖ Processed 14 seasons: 2012/13, 2013/14, 2014/15, 2015/16, 2016/17, 2017/18, 2018/19, 2019/20, 2020/21, 2021/22, 2022/23, 2023/24, 2024/25, 2025/26

üìä Summary (leadership changes and leaders):


Unnamed: 0,season,leadership_changes,first_leader,last_leader,last_round,distinct_leaders
0,2012/13,7,FC Ashdod,M. Tel Aviv,26,3
1,2013/14,2,Maccabi Haifa,M. Tel Aviv,26,3
2,2014/15,7,Hapoel Raanana,M. Tel Aviv,26,3
3,2015/16,5,Bnei Yehuda,H. Beer Sheva,26,4
4,2016/17,3,M. Tel Aviv,H. Beer Sheva,26,2
5,2017/18,8,B. Jerusalem,H. Beer Sheva,26,4
6,2018/19,4,Hapoel Haifa,M. Tel Aviv,26,3
7,2019/20,5,M. Tel Aviv,M. Tel Aviv,26,3
8,2020/21,8,Bnei Yehuda,Maccabi Haifa,26,6
9,2021/22,5,Bnei Sakhnin,Maccabi Haifa,26,6



üíæ Saved per-round leaders to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\tm_title_race_per_round.csv
üíæ Saved summary to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\tm_title_race_summary.csv

üîé Preview: 2012/13 (first 10 rounds)


Unnamed: 0,season,round,leader,leader_points,leader_gd,leader_gf,first_place_changed,prev_leader,change_count
0,2012/13,1,FC Ashdod,3,2,2,False,,0
1,2012/13,2,FC Ashdod,6,4,5,False,FC Ashdod,0
2,2012/13,3,FC Ashdod,9,5,7,False,FC Ashdod,0
3,2012/13,4,M. Tel Aviv,12,8,10,True,FC Ashdod,1
4,2012/13,5,FC Ashdod,15,9,11,True,M. Tel Aviv,2
5,2012/13,6,FC Ashdod,15,8,12,False,FC Ashdod,2
6,2012/13,7,FC Ashdod,18,10,14,False,FC Ashdod,2
7,2012/13,8,M. Tel Aviv,21,11,18,True,FC Ashdod,3
8,2012/13,9,M. Tel Aviv,21,10,18,False,M. Tel Aviv,3
9,2012/13,10,FC Ashdod,22,11,17,True,M. Tel Aviv,4


In [61]:
# Build a TM-based table with the same columns as
# positions_by_round_leader_changes_summary.csv
# (season, rounds, leader_changes, distinct_leaders, leaders).
import pandas as pd
from pathlib import Path

# Ensure we have per_round and summary; compute if missing
try:
    _ = per_round  # type: ignore[name-defined]
    _ = summary    # type: ignore[name-defined]
except NameError:
    # Neither name exists in the kernel yet, so build both from the default matches directory.
    per_round, summary = build_title_race_tables(_default_matches_dir())

if per_round is None or per_round.empty:
    print("‚ùå per_round is empty; run the previous cell to build TM tables.")
else:
    rows = []
    for season, df_season in per_round.groupby('season'):
        df_season = df_season.sort_values('round')
        leaders_seq = df_season['leader'].tolist()
        # Ordered distinct leaders preserving first appearance order
        ordered_distinct = []
        seen = set()
        for t in leaders_seq:
            if t not in seen:
                ordered_distinct.append(t)
                seen.add(t)
        leaders_str = '|'.join(ordered_distinct)
        # first_place_changed is a per-round flag, so its sum is the number of changes.
        leader_changes = int(df_season['first_place_changed'].sum())
        rows.append({
            'season': season,
            'rounds': int(df_season['round'].max()),
            'leader_changes': leader_changes,
            'distinct_leaders': len(ordered_distinct),
            'leaders': leaders_str,
        })

    tm_summary_like = pd.DataFrame(rows)
    # Sort seasons by start year
    def _start_year(s: str) -> int:
        # "2015/16" -> 2015; anything unparsable maps to 0 and therefore sorts first.
        try:
            return int(s.split('/')[0])
        except Exception:
            return 0
    tm_summary_like = tm_summary_like.sort_values(by='season', key=lambda s: s.map(_start_year)).reset_index(drop=True)

    # Save next to the original-style file
    out_path = INTERIM_DIR / 'positions_by_round_leader_changes_summary_tm.csv'
    tm_summary_like.to_csv(out_path, index=False, encoding='utf-8-sig')

    print('‚úÖ Created TM-based summary matching the original schema:')
    display(tm_summary_like)
    print(f"\nüíæ Saved to: {out_path}")


‚úÖ Created TM-based summary matching the original schema:


Unnamed: 0,season,rounds,leader_changes,distinct_leaders,leaders
0,2012/13,26,7,3,FC Ashdod|M. Tel Aviv|Hapoel Tel Aviv
1,2013/14,26,2,3,Maccabi Haifa|H. Beer Sheva|M. Tel Aviv
2,2014/15,26,7,3,Hapoel Raanana|M. Tel Aviv|Kiryat Shmona
3,2015/16,26,5,4,Bnei Yehuda|M. Petah Tikva|M. Tel Aviv|H. Beer...
4,2016/17,26,3,2,M. Tel Aviv|H. Beer Sheva
5,2017/18,26,8,4,B. Jerusalem|Hapoel Haifa|H. Beer Sheva|M. Tel...
6,2018/19,26,4,3,Hapoel Haifa|Hapoel Hadera|M. Tel Aviv
7,2019/20,26,5,3,M. Tel Aviv|H. Beer Sheva|Maccabi Haifa
8,2020/21,26,8,6,Bnei Yehuda|B. Jerusalem|Maccabi Haifa|M. Peta...
9,2021/22,26,5,6,Bnei Sakhnin|Hapoel Tel Aviv|H. Nof HaGalil|Ha...



üíæ Saved to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\positions_by_round_leader_changes_summary_tm.csv


In [55]:
# Align TM leadership_changes with Wikipedia Positions-by-Round when available.
# Reads the two summary CSVs from INTERIM_DIR and writes
# tm_title_race_summary_aligned.csv with the same columns as the TM summary.
import pandas as pd
from pathlib import Path

wiki_summary_path = INTERIM_DIR / 'positions_by_round_leader_changes_summary.csv'
if not wiki_summary_path.exists():
    print(f"‚ö†Ô∏è Wikipedia summary not found at {wiki_summary_path}. Skipping alignment.")
else:
    wiki = pd.read_csv(wiki_summary_path)
    # A scraped season in which no leader was detected at all carries no
    # information (its change count is trivially 0) and must not override
    # the TM-derived count.
    if 'distinct_leaders' in wiki.columns:
        wiki = wiki[wiki['distinct_leaders'] > 0]
    wiki = wiki[['season','leader_changes']].rename(columns={'leader_changes':'leader_changes_wiki'})

    tm = pd.read_csv(INTERIM_DIR / 'tm_title_race_summary.csv')
    merged = tm.merge(wiki, on='season', how='left')

    # Replace TM count with Wikipedia count when available. The left merge
    # introduces NaN (hence float) for seasons without Wikipedia data, so cast
    # back to int once the gaps are filled from the TM column.
    merged['leadership_changes_aligned'] = (
        merged['leader_changes_wiki'].fillna(merged['leadership_changes']).astype(int)
    )

    # Report diffs
    diffs = merged[merged['leadership_changes'] != merged['leadership_changes_aligned']]
    if not diffs.empty:
        print('üîß Adjusted leadership_changes to match Wikipedia for seasons:')
        for _, row in diffs.iterrows():
            print(f"  {row['season']}: {row['leadership_changes']} -> {int(row['leadership_changes_aligned'])}")
    else:
        print('‚úÖ No differences found; TM counts already match Wikipedia.')

    out_aligned = INTERIM_DIR / 'tm_title_race_summary_aligned.csv'
    # Drop the original TM count before renaming; otherwise the CSV would
    # contain two columns both called 'leadership_changes'.
    aligned = (
        merged.drop(columns=['leadership_changes', 'leader_changes_wiki'])
        .rename(columns={'leadership_changes_aligned': 'leadership_changes'})
    )
    # Restore the column order of the TM summary.
    aligned = aligned[list(tm.columns)]
    aligned.to_csv(out_aligned, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Saved aligned summary to: {out_aligned}")

    # Show the 2015/16 row for verification
    row_1516 = merged[merged['season'] == '2015/16']
    if not row_1516.empty:
        print('\nüîé 2015/16 verification:')
        display(row_1516)


üîß Adjusted leadership_changes to match Wikipedia for seasons:
  2015/16: 5 -> 7
  2019/20: 5 -> 0
  2020/21: 8 -> 0
  2021/22: 5 -> 0
  2022/23: 5 -> 0

üíæ Saved aligned summary to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\tm_title_race_summary_aligned.csv

üîé 2015/16 verification:


Unnamed: 0,season,leadership_changes,first_leader,last_leader,last_round,leader_changes_wiki,leadership_changes_aligned
2,2015/16,5,Bnei Yehuda,H. Beer Sheva,26,7.0,7.0


In [62]:
# Example: Detailed leadership changes for season 2015/16.
# Display-only cell: reads `per_round` from the kernel and prints/displays
# slices of it; nothing is written to disk.
import pandas as pd

example_season = "2015/16"

# Filter per_round for the example season
season_data = per_round[per_round['season'] == example_season].copy()

if season_data.empty:
    print(f"‚ùå No data found for season {example_season}")
else:
    print(f"üîç Detailed Leadership Analysis: {example_season}")
    print("="*80)
    
    # Show all rounds with leader info
    display_cols = ['round', 'leader', 'leader_points', 'leader_gd', 'first_place_changed', 'prev_leader']
    print(f"\nüìã All {len(season_data)} rounds:")
    display(season_data[display_cols])
    
    # Extract only the leadership changes (rounds flagged first_place_changed)
    changes = season_data[season_data['first_place_changed'] == True]
    
    print(f"\nüîÑ Leadership Changes Summary:")
    print(f"   Total changes: {len(changes)}")
    print(f"   Distinct leaders: {season_data['leader'].nunique()}")
    
    if not changes.empty:
        print(f"\n   Details of each change:")
        for idx, row in changes.iterrows():
            # leader_points / leader_gd are the new leader's totals after that round.
            print(f"   ‚Ä¢ Round {int(row['round'])}: {row['prev_leader']} ‚Üí {row['leader']} "
                  f"(pts: {int(row['leader_points'])}, gd: {int(row['leader_gd'])})")
    
    # Show first and last 3 rounds for context
    print(f"\nüìä Context - First 3 rounds:")
    display(season_data.head(3)[['round', 'leader', 'leader_points', 'leader_gd']])
    
    print(f"\nüìä Context - Last 3 rounds:")
    display(season_data.tail(3)[['round', 'leader', 'leader_points', 'leader_gd']])
    
    # List all unique leaders (in order of first appearance) with the rounds each one led
    unique_leaders = season_data['leader'].unique()
    print(f"\nüèÜ All teams that led the table during {example_season}:")
    for i, team in enumerate(unique_leaders, 1):
        rounds_led = season_data[season_data['leader'] == team]['round'].tolist()
        print(f"   {i}. {team}: Rounds {rounds_led}")


üîç Detailed Leadership Analysis: 2015/16

üìã All 26 rounds:


Unnamed: 0,round,leader,leader_points,leader_gd,first_place_changed,prev_leader
78,1,Bnei Yehuda,3,3,False,
79,2,M. Petah Tikva,6,4,True,Bnei Yehuda
80,3,M. Petah Tikva,9,6,False,M. Petah Tikva
81,4,M. Petah Tikva,12,7,False,M. Petah Tikva
82,5,M. Petah Tikva,12,6,False,M. Petah Tikva
83,6,M. Tel Aviv,13,8,True,M. Petah Tikva
84,7,M. Tel Aviv,16,9,False,M. Tel Aviv
85,8,H. Beer Sheva,16,8,True,M. Tel Aviv
86,9,H. Beer Sheva,19,10,False,H. Beer Sheva
87,10,H. Beer Sheva,22,11,False,H. Beer Sheva



üîÑ Leadership Changes Summary:
   Total changes: 5
   Distinct leaders: 4

   Details of each change:
   ‚Ä¢ Round 2: Bnei Yehuda ‚Üí M. Petah Tikva (pts: 6, gd: 4)
   ‚Ä¢ Round 6: M. Petah Tikva ‚Üí M. Tel Aviv (pts: 13, gd: 8)
   ‚Ä¢ Round 8: M. Tel Aviv ‚Üí H. Beer Sheva (pts: 16, gd: 8)
   ‚Ä¢ Round 11: H. Beer Sheva ‚Üí M. Tel Aviv (pts: 25, gd: 12)
   ‚Ä¢ Round 15: M. Tel Aviv ‚Üí H. Beer Sheva (pts: 35, gd: 17)

üìä Context - First 3 rounds:


Unnamed: 0,round,leader,leader_points,leader_gd
78,1,Bnei Yehuda,3,3
79,2,M. Petah Tikva,6,4
80,3,M. Petah Tikva,9,6



üìä Context - Last 3 rounds:


Unnamed: 0,round,leader,leader_points,leader_gd
101,24,H. Beer Sheva,60,30
102,25,H. Beer Sheva,63,31
103,26,H. Beer Sheva,64,31



üèÜ All teams that led the table during 2015/16:
   1. Bnei Yehuda: Rounds [1]
   2. M. Petah Tikva: Rounds [2, 3, 4, 5]
   3. M. Tel Aviv: Rounds [6, 7, 11, 12, 13, 14]
   4. H. Beer Sheva: Rounds [8, 9, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


In [76]:
# Calculate leadership changes for 2006-2009 seasons (12 teams, 6 matches per round).
# Side effects: writes tm_title_race_summary_all_seasons.csv and overwrites
# tm_title_race_per_round.csv and positions_by_round_leader_changes_summary_tm.csv
# in INTERIM_DIR. Relies on `summary` and `_compute_round_leaders` already
# existing in the kernel.
import pandas as pd
from pathlib import Path

# Read existing 2006-2009 match files from interim folder
seasons_12_teams = [
    ('2006/07', INTERIM_DIR / 'matches_2006_07_ligat_haal_regular_corrected.csv'),
    ('2007/08', INTERIM_DIR / 'matches_2007_08_ligat_haal_regular_corrected.csv'),
    ('2008/09', INTERIM_DIR / 'matches_2008_09_ligat_haal_regular_corrected.csv'),
]

early_seasons_data = []

for season_label, file_path in seasons_12_teams:
    if not file_path.exists():
        print(f"‚ö†Ô∏è File not found: {file_path}")
        continue
    
    df = pd.read_csv(file_path)
    
    # Check columns
    if 'home_team' in df.columns and 'away_team' in df.columns:
        # Rename to match our function's expectations
        df = df.rename(columns={'home_team': 'home', 'away_team': 'away', 
                                 'home_goals': 'home_score', 'away_goals': 'away_score'})
        # Create score column in the "home:away" string form parsed by _compute_round_leaders
        # NOTE(review): assumes the goal columns are read as integers; a float column
        # would render as e.g. "2.0:1.0" - confirm against the CSVs.
        df['score'] = df['home_score'].astype(str) + ':' + df['away_score'].astype(str)
    
    # Ensure we have required columns
    if not {'home', 'away', 'score'}.issubset(df.columns):
        print(f"‚ö†Ô∏è Missing required columns in {file_path.name}")
        continue
    
    try:
        # Use 6 matches per round for 12-team leagues
        per_round_df = _compute_round_leaders(df, matches_per_round=6)
        per_round_df.insert(0, "season", season_label)
        early_seasons_data.append(per_round_df)
        print(f"‚úÖ Processed {season_label}: {len(per_round_df)} rounds")
    except Exception as e:
        print(f"‚ùå Error processing {season_label}: {e}")

if early_seasons_data:
    # Stack the per-round tables of the early seasons into one frame
    early_per_round = pd.concat(early_seasons_data, ignore_index=True)
    
    # Calculate summary for 2006-2009
    distinct_leaders_early = early_per_round.groupby("season")["leader"].nunique().rename("distinct_leaders")
    
    summary_early = (
        early_per_round.groupby("season").agg(
            leadership_changes=("first_place_changed", "sum"),
            first_leader=("leader", "first"),
            last_leader=("leader", "last"),
            last_round=("round", "max"),
        ).reset_index()
    )
    summary_early = summary_early.merge(distinct_leaders_early, on="season")
    
    print(f"\nüìä Summary for 2006-2009 seasons:")
    display(summary_early)
    
    # Combine with the `summary` frame currently in the kernel
    combined_summary = pd.concat([summary_early, summary], ignore_index=True)
    
    # Sort by season
    def _start_year(s: str) -> int:
        # "2006/07" -> 2006; anything unparsable maps to 9999 and therefore sorts last.
        try:
            return int(s.split('/')[0])
        except Exception:
            return 9999
    combined_summary = combined_summary.sort_values(by='season', key=lambda s: s.map(_start_year)).reset_index(drop=True)
    
    # Save combined summary
    out_combined = INTERIM_DIR / 'tm_title_race_summary_all_seasons.csv'
    combined_summary.to_csv(out_combined, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Saved combined summary (2006-2026) to: {out_combined}")
    
    # Update tm_title_race_per_round.csv with 2006-2009 seasons (avoid duplicates)
    existing_per_round = pd.read_csv(INTERIM_DIR / 'tm_title_race_per_round.csv')
    # Remove existing 2006-2009 data if present to avoid duplicates
    existing_per_round = existing_per_round[~existing_per_round['season'].isin(['2006/07', '2007/08', '2008/09'])]
    combined_per_round = pd.concat([existing_per_round, early_per_round], ignore_index=True)
    # The sort key is applied to each sort column in turn: season strings are mapped
    # to their start year, non-string values (the round numbers) are left as they are.
    combined_per_round = combined_per_round.sort_values(by=['season', 'round'], key=lambda x: x.map(lambda v: _start_year(v) if isinstance(v, str) else v)).reset_index(drop=True)
    out_per_round = INTERIM_DIR / 'tm_title_race_per_round.csv'
    combined_per_round.to_csv(out_per_round, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Updated tm_title_race_per_round.csv with 2006-2009 data (duplicates removed)")
    
    print(f"\nüìã Combined summary (all seasons):")
    display(combined_summary)
    
    # Update positions_by_round_leader_changes_summary_tm with early seasons
    early_summary_like = []
    for season, df_season in early_per_round.groupby('season'):
        df_season = df_season.sort_values('round')
        leaders_seq = df_season['leader'].tolist()
        # Distinct leaders in order of first appearance
        ordered_distinct = []
        seen = set()
        for t in leaders_seq:
            if t not in seen:
                ordered_distinct.append(t)
                seen.add(t)
        leaders_str = '|'.join(ordered_distinct)
        leader_changes = int(df_season['first_place_changed'].sum())
        early_summary_like.append({
            'season': season,
            'rounds': int(df_season['round'].max()),
            'leader_changes': leader_changes,
            'distinct_leaders': len(ordered_distinct),
            'leaders': leaders_str,
        })
    
    early_tm_like = pd.DataFrame(early_summary_like)
    
    # Load existing tm summary and remove 2006-2009 to avoid duplicates
    existing_tm = pd.read_csv(INTERIM_DIR / 'positions_by_round_leader_changes_summary_tm.csv')
    existing_tm = existing_tm[~existing_tm['season'].isin(['2006/07', '2007/08', '2008/09'])]
    
    # Combine
    combined_tm = pd.concat([early_tm_like, existing_tm], ignore_index=True)
    combined_tm = combined_tm.sort_values(by='season', key=lambda s: s.map(_start_year)).reset_index(drop=True)
    
    # Save (overwrites the file that was just read)
    out_tm_combined = INTERIM_DIR / 'positions_by_round_leader_changes_summary_tm.csv'
    combined_tm.to_csv(out_tm_combined, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Updated positions_by_round_leader_changes_summary_tm.csv with 2006-2009 data")
    
    print(f"\nüìã Updated TM-style summary (2006-2026):")
    display(combined_tm)
else:
    print("\n‚ùå No early seasons data processed")


‚úÖ Processed 2006/07: 33 rounds
‚úÖ Processed 2007/08: 33 rounds
‚úÖ Processed 2008/09: 33 rounds

üìä Summary for 2006-2009 seasons:


Unnamed: 0,season,leadership_changes,first_leader,last_leader,last_round,distinct_leaders
0,2006/07,5,H. Kfar Saba,B. Jerusalem,33,4
1,2007/08,0,B. Jerusalem,B. Jerusalem,33,1
2,2008/09,8,Maccabi Haifa,Maccabi Haifa,33,3



üíæ Saved combined summary (2006-2026) to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\tm_title_race_summary_all_seasons.csv

üíæ Updated tm_title_race_per_round.csv with 2006-2009 data (duplicates removed)

üìã Combined summary (all seasons):


Unnamed: 0,season,leadership_changes,first_leader,last_leader,last_round,distinct_leaders
0,2006/07,5,H. Kfar Saba,B. Jerusalem,33,4
1,2007/08,0,B. Jerusalem,B. Jerusalem,33,1
2,2008/09,8,Maccabi Haifa,Maccabi Haifa,33,3
3,2012/13,7,FC Ashdod,M. Tel Aviv,26,3
4,2013/14,2,Maccabi Haifa,M. Tel Aviv,26,3
5,2014/15,7,Hapoel Raanana,M. Tel Aviv,26,3
6,2015/16,5,Bnei Yehuda,H. Beer Sheva,26,4
7,2016/17,3,M. Tel Aviv,H. Beer Sheva,26,2
8,2017/18,8,B. Jerusalem,H. Beer Sheva,26,4
9,2018/19,4,Hapoel Haifa,M. Tel Aviv,26,3



üíæ Updated positions_by_round_leader_changes_summary_tm.csv with 2006-2009 data

üìã Updated TM-style summary (2006-2026):


Unnamed: 0,season,rounds,leader_changes,distinct_leaders,leaders
0,2006/07,33,5,4,H. Kfar Saba|B. Jerusalem|M. Petah Tikva|Hapoe...
1,2007/08,33,0,1,B. Jerusalem
2,2008/09,33,8,3,Maccabi Haifa|Maccabi Netanya|Hapoel Tel Aviv
3,2009/10,30,1,2,Bnei Yehuda|Maccabi Haifa
4,2010/11,30,12,6,Maccabi Netanya|Hapoel Haifa|Kiryat Shmona|Mac...
5,2011/12,30,7,3,Hapoel Tel Aviv|M. Tel Aviv|Kiryat Shmona
6,2012/13,26,7,3,FC Ashdod|M. Tel Aviv|Hapoel Tel Aviv
7,2013/14,26,2,3,Maccabi Haifa|H. Beer Sheva|M. Tel Aviv
8,2014/15,26,7,3,Hapoel Raanana|M. Tel Aviv|Kiryat Shmona
9,2015/16,26,5,4,Bnei Yehuda|M. Petah Tikva|M. Tel Aviv|H. Beer...


In [77]:
# Calculate leadership changes for 2009-2012 seasons (16 teams, 8 matches per round)
import pandas as pd
from pathlib import Path

# Use Transfermarkt files for 2009-2012 (16-team seasons)
seasons_16_teams = [
    ('2009/10', Path(matches_dir) / 'matches_2009_10_ligat_haal_transfermarkt.csv'),
    ('2010/11', Path(matches_dir) / 'matches_2010_11_ligat_haal_transfermarkt.csv'),
    ('2011/12', Path(matches_dir) / 'matches_2011_12_ligat_haal_transfermarkt.csv'),
]


def _start_year(s: str) -> int:
    """Return the first year of a 'YYYY/YY' season label.

    Anything that cannot be parsed maps to 9999 so it sorts after all real seasons.
    """
    try:
        return int(s.split('/')[0])
    except Exception:
        return 9999


def _swap_seasons_into_csv(csv_path: Path, fresh_rows: pd.DataFrame, sort_by) -> pd.DataFrame:
    """Replace the seasons contained in ``fresh_rows`` inside the CSV at ``csv_path``.

    Only seasons that actually appear in ``fresh_rows`` are removed from the
    existing file before the new rows are appended, so a season whose source
    file was missing or failed to process keeps its previously stored rows.

    Args:
        csv_path: Existing CSV with a ``season`` column; it is overwritten in place.
        fresh_rows: Newly computed rows (must have a ``season`` column).
        sort_by: Column name or list of column names to order the result by;
            string values (season labels) are ordered by their start year.

    Returns:
        The merged, sorted frame that was written to ``csv_path``.
    """
    existing = pd.read_csv(csv_path)
    refreshed_seasons = set(fresh_rows['season'])
    kept = existing[~existing['season'].isin(refreshed_seasons)]
    merged = pd.concat([kept, fresh_rows], ignore_index=True)
    merged = merged.sort_values(
        by=sort_by,
        key=lambda col: col.map(lambda v: _start_year(v) if isinstance(v, str) else v),
    ).reset_index(drop=True)
    merged.to_csv(csv_path, index=False, encoding='utf-8-sig')
    return merged


seasons_16_data = []

for season_label, file_path in seasons_16_teams:
    if not file_path.exists():
        print(f"‚ö†Ô∏è File not found: {file_path}")
        continue

    df = pd.read_csv(file_path)

    # Ensure we have required columns
    if not {'home', 'away', 'score'}.issubset(df.columns):
        print(f"‚ö†Ô∏è Missing required columns in {file_path.name}")
        continue

    # Best effort: one broken season must not prevent the others from being processed.
    try:
        # Use 8 matches per round for 16-team leagues
        per_round_df = _compute_round_leaders(df, matches_per_round=8)
        per_round_df.insert(0, "season", season_label)
        seasons_16_data.append(per_round_df)
        print(f"‚úÖ Processed {season_label}: {len(per_round_df)} rounds")
    except Exception as e:
        print(f"‚ùå Error processing {season_label}: {e}")

if seasons_16_data:
    # Combine 16-team seasons data
    seasons_16_per_round = pd.concat(seasons_16_data, ignore_index=True)

    # One summary row per season; "first"/"last" follow the row order of the per-round frame.
    summary_16 = (
        seasons_16_per_round.groupby("season").agg(
            leadership_changes=("first_place_changed", "sum"),
            first_leader=("leader", "first"),
            last_leader=("leader", "last"),
            last_round=("round", "max"),
            distinct_leaders=("leader", "nunique"),
        ).reset_index()
    )

    print(f"\nüìä Summary for 2009-2012 seasons (16 teams):")
    display(summary_16)

    # Update the combined per-season summary
    out_all = INTERIM_DIR / 'tm_title_race_summary_all_seasons.csv'
    all_seasons_summary = _swap_seasons_into_csv(out_all, summary_16, sort_by='season')
    print(f"\nüíæ Saved updated combined summary (2006-2026) to: {out_all}")

    # Update tm_title_race_per_round.csv with 16-team seasons (avoid duplicates)
    out_per_round = INTERIM_DIR / 'tm_title_race_per_round.csv'
    combined_per_round = _swap_seasons_into_csv(
        out_per_round, seasons_16_per_round, sort_by=['season', 'round']
    )
    print(f"\nüíæ Updated tm_title_race_per_round.csv with 2009-2012 data (duplicates removed)")

    # Update positions_by_round_leader_changes_summary_tm with 16-team seasons
    seasons_16_summary_like = []
    for season, df_season in seasons_16_per_round.groupby('season'):
        df_season = df_season.sort_values('round')
        # dict.fromkeys keeps the first occurrence of each leader, in round order
        ordered_distinct = list(dict.fromkeys(df_season['leader'].tolist()))
        seasons_16_summary_like.append({
            'season': season,
            'rounds': int(df_season['round'].max()),
            'leader_changes': int(df_season['first_place_changed'].sum()),
            'distinct_leaders': len(ordered_distinct),
            'leaders': '|'.join(ordered_distinct),
        })

    seasons_16_tm_like = pd.DataFrame(seasons_16_summary_like)

    out_tm_all = INTERIM_DIR / 'positions_by_round_leader_changes_summary_tm.csv'
    combined_tm_all = _swap_seasons_into_csv(out_tm_all, seasons_16_tm_like, sort_by='season')
    print(f"\nüíæ Updated positions_by_round_leader_changes_summary_tm.csv with 2009-2012 data")

    print(f"\nüìã Complete TM-style summary (2006-2026, all {len(combined_tm_all)} seasons):")
    display(combined_tm_all)
else:
    print("\n‚ùå No 16-team seasons data processed")


‚úÖ Processed 2009/10: 30 rounds
‚úÖ Processed 2010/11: 30 rounds
‚úÖ Processed 2011/12: 30 rounds

üìä Summary for 2009-2012 seasons (16 teams):


Unnamed: 0,season,leadership_changes,first_leader,last_leader,last_round,distinct_leaders
0,2009/10,1,Bnei Yehuda,Maccabi Haifa,30,2
1,2010/11,12,Maccabi Netanya,Maccabi Haifa,30,6
2,2011/12,7,Hapoel Tel Aviv,Kiryat Shmona,30,3



üíæ Saved updated combined summary (2006-2026) to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\tm_title_race_summary_all_seasons.csv

üíæ Updated tm_title_race_per_round.csv with 2009-2012 data (duplicates removed)

üíæ Updated positions_by_round_leader_changes_summary_tm.csv with 2009-2012 data

üìã Complete TM-style summary (2006-2026, all 20 seasons):


Unnamed: 0,season,rounds,leader_changes,distinct_leaders,leaders
0,2006/07,33,5,4,H. Kfar Saba|B. Jerusalem|M. Petah Tikva|Hapoe...
1,2007/08,33,0,1,B. Jerusalem
2,2008/09,33,8,3,Maccabi Haifa|Maccabi Netanya|Hapoel Tel Aviv
3,2009/10,30,1,2,Bnei Yehuda|Maccabi Haifa
4,2010/11,30,12,6,Maccabi Netanya|Hapoel Haifa|Kiryat Shmona|Mac...
5,2011/12,30,7,3,Hapoel Tel Aviv|M. Tel Aviv|Kiryat Shmona
6,2012/13,26,7,3,FC Ashdod|M. Tel Aviv|Hapoel Tel Aviv
7,2013/14,26,2,3,Maccabi Haifa|H. Beer Sheva|M. Tel Aviv
8,2014/15,26,7,3,Hapoel Raanana|M. Tel Aviv|Kiryat Shmona
9,2015/16,26,5,4,Bnei Yehuda|M. Petah Tikva|M. Tel Aviv|H. Beer...


In [73]:
# Calculate championship-playoff leadership changes for every season listed in
# `playoff_seasons` below (16-team format 2009/10-2011/12, 14-team format from 2012/13).
# Playoff teams start from their regular-season record (points/GD/GF); for 2009/10 and
# 2010/11 the points are halved first. The number of playoff teams is taken from the
# playoff file itself, so the number of playoff rounds differs between seasons.
import pandas as pd
from pathlib import Path

# Directory holding the per-season championship playoff CSVs
PLAYOFFS_DIR = ROOT / 'data' / 'playoffs'

# Club-name translation for the playoff files: Wikipedia full name -> TM abbreviated name.
# Clubs that are spelled the same way in both sources map to themselves.
_SPELLED_ALIKE = (
    'Bnei Sakhnin',
    'Bnei Yehuda',
    'Hapoel Haifa',
    'Hapoel Tel Aviv',
    'Maccabi Haifa',
    'Maccabi Netanya',
    'Hapoel Hadera',
)
WIKI_TO_TM_MAP = {club: club for club in _SPELLED_ALIKE}
WIKI_TO_TM_MAP.update({
    'F.C. Ashdod': 'FC Ashdod',
    'Beitar Jerusalem': 'B. Jerusalem',
    "Hapoel Be'er Sheva": 'H. Beer Sheva',
    "Hapoel Ra'anana": 'Hapoel Raanana',
    'Ironi Kiryat Shmona': 'Kiryat Shmona',
    'Maccabi Petah Tikva': 'M. Petah Tikva',
    'Maccabi Tel Aviv': 'M. Tel Aviv',
    'Hapoel Jerusalem': 'H. Jerusalem',
    'Maccabi Bnei Reineh': 'M. Bnei Reineh',
    'Ironi Ramat HaSharon': 'Ramat haSharon',
})


def _season_label(start_year: int) -> str:
    """Format a season as 'YYYY/YY', e.g. 2012 -> '2012/13'."""
    return f'{start_year}/{(start_year + 1) % 100:02d}'


# Seasons with a championship playoff, grouped by league size:
# 16 teams for 2009/10-2011/12, 14 teams for 2012/13-2024/25.
playoff_seasons_16_teams = [_season_label(year) for year in range(2009, 2012)]
playoff_seasons_14_teams = [_season_label(year) for year in range(2012, 2025)]
playoff_seasons = [*playoff_seasons_16_teams, *playoff_seasons_14_teams]

def calculate_playoff_leaders(season_str: str, 
                              regular_final_standings: pd.DataFrame,
                              playoff_matches: pd.DataFrame,
                              num_playoff_teams: int = 6,
                              apply_halving: bool = False) -> pd.DataFrame:
    """
    Calculate round-by-round leaders in the championship playoff.

    The first ``num_playoff_teams`` rows of ``regular_final_standings`` seed the
    playoff table with their regular-season record. Playoff matches are then
    applied in row order, ``num_playoff_teams // 2`` matches per round, and the
    table is re-ranked after each round by points, then goal difference, then
    goals scored.

    Args:
        season_str: Season label (e.g., '2015/16'); used in error messages only.
        regular_final_standings: Final regular-season standings, already sorted
            best-first, with columns team, points, gf, ga, gd, played and either
            won/drawn/lost or wins/draws/losses.
        playoff_matches: Playoff results with columns home_team, away_team,
            home_goals, away_goals. Rows are assumed to be ordered by round and
            complete for every round. The frame is not modified.
        num_playoff_teams: Number of teams in playoff (4, 6, or 8)
        apply_halving: If True, apply the points-halving rule used in 2009/10 and
            2010/11: regular-season points are halved before the playoff, with
            odd totals rounded up.

    Returns:
        DataFrame with one row per round (round 0 = table before the first
        playoff match) and columns round, leader, leader_points, leader_gd,
        leader_gf, first_place_changed, prev_leader, change_count.

    Raises:
        ValueError: If ``num_playoff_teams`` is below 2 or the standings are empty.
        KeyError: If a playoff match names a team that is not among the seeded teams.
    """
    if num_playoff_teams < 2:
        raise ValueError(f"{season_str}: a playoff needs at least 2 teams, got {num_playoff_teams}")

    seeded = regular_final_standings.head(num_playoff_teams)
    if seeded.empty:
        raise ValueError(f"{season_str}: regular season standings are empty")

    # Initialize stats from regular season for top N teams
    stats = {}
    for _, row in seeded.iterrows():
        points = int(row['points'])
        if apply_halving:
            # Halving rounds odd totals UP (e.g. 73 -> 37); floor division would
            # shave a point off and could create a tie that does not exist.
            points = (points + 1) // 2

        stats[row['team']] = {
            'points': points,
            'gf': int(row['gf']),
            'ga': int(row['ga']),
            'gd': int(row['gd']),
            'played': int(row['played']),
            'wins': int(row.get('won', row.get('wins', 0))),
            'draws': int(row.get('drawn', row.get('draws', 0))),
            'losses': int(row.get('lost', row.get('losses', 0))),
        }

    def _ranked():
        """Current table, best team first (points, then GD, then goals scored)."""
        return sorted(
            stats.items(),
            key=lambda kv: (kv[1]["points"], kv[1]["gd"], kv[1]["gf"]),
            reverse=True,
        )

    # Work on a fresh frame so the caller's DataFrame never gains a helper column.
    fixtures = playoff_matches.sort_index().reset_index(drop=True)
    matches_per_round = num_playoff_teams // 2
    fixtures['playoff_round'] = (fixtures.index // matches_per_round) + 1

    # Round 0: starting position carried over from the regular season
    leader_prev, leader_stats = _ranked()[0]
    rows = [{
        "round": 0,
        "leader": leader_prev,
        "leader_points": leader_stats["points"],
        "leader_gd": leader_stats["gd"],
        "leader_gf": leader_stats["gf"],
        "first_place_changed": False,
        "prev_leader": None,
        "change_count": 0,
    }]
    change_count = 0

    # Process each playoff round in ascending round order
    for rnd, round_matches in fixtures.groupby('playoff_round', sort=True):
        for h, a, hg, ag in zip(round_matches['home_team'], round_matches['away_team'],
                                round_matches['home_goals'], round_matches['away_goals']):
            for club in (h, a):
                if club not in stats:
                    raise KeyError(
                        f"{season_str}: playoff team {club!r} is not among the top "
                        f"{num_playoff_teams} of the regular season standings"
                    )
            hg, ag = int(hg), int(ag)

            sh, sa = stats[h], stats[a]
            sh['gf'] += hg; sh['ga'] += ag; sh['gd'] += (hg - ag); sh['played'] += 1
            sa['gf'] += ag; sa['ga'] += hg; sa['gd'] += (ag - hg); sa['played'] += 1

            if hg > ag:
                sh['points'] += 3; sh['wins'] += 1; sa['losses'] += 1
            elif hg < ag:
                sa['points'] += 3; sa['wins'] += 1; sh['losses'] += 1
            else:
                sh['points'] += 1; sa['points'] += 1
                sh['draws'] += 1; sa['draws'] += 1

        leader, leader_stats = _ranked()[0]
        changed = leader != leader_prev
        if changed:
            change_count += 1

        rows.append({
            "round": int(rnd),
            "leader": leader,
            "leader_points": leader_stats["points"],
            "leader_gd": leader_stats["gd"],
            "leader_gf": leader_stats["gf"],
            "first_place_changed": bool(changed),
            "prev_leader": leader_prev,
            "change_count": change_count,
        })
        leader_prev = leader

    return pd.DataFrame(rows)


def _regular_season_table(matches_df: pd.DataFrame) -> pd.DataFrame:
    """Replay one season's match list into a final standings table.

    Every row of ``matches_df`` (columns home, away, score) is counted with
    3 points for a win and 1 for a draw. Scores are decoded with the notebook's
    `_parse_score` helper, defined in an earlier cell (presumably returning
    (home_goals, away_goals) — verify against that cell).

    Returns:
        DataFrame with columns team, points, gf, ga, gd, played, wins, draws,
        losses, sorted best-first by points, goal difference, goals scored.
    """
    teams = sorted(set(matches_df["home"]).union(set(matches_df["away"])))
    stats_final = {t: dict(points=0, gf=0, ga=0, gd=0, played=0, wins=0, draws=0, losses=0) for t in teams}

    for h, a, score in zip(matches_df["home"], matches_df["away"], matches_df["score"]):
        hg, ag = _parse_score(score)

        sh, sa = stats_final[h], stats_final[a]
        sh["gf"] += hg; sh["ga"] += ag; sh["gd"] += (hg - ag); sh["played"] += 1
        sa["gf"] += ag; sa["ga"] += hg; sa["gd"] += (ag - hg); sa["played"] += 1

        if hg > ag:
            sh["points"] += 3; sh["wins"] += 1; sa["losses"] += 1
        elif hg < ag:
            sa["points"] += 3; sa["wins"] += 1; sh["losses"] += 1
        else:
            sh["points"] += 1; sa["points"] += 1; sh["draws"] += 1; sa["draws"] += 1

    standings = pd.DataFrame([{'team': team, **s} for team, s in stats_final.items()])
    return standings.sort_values(['points', 'gd', 'gf'], ascending=[False, False, False]).reset_index(drop=True)


playoff_results = []

# Regular-season match files live in the same directory for both league formats.
matches_dir = ROOT / 'data' / 'matches'

# The per-round title-race file is only used to confirm that a season's regular
# season has been processed; it does not change inside the loop, so read it once.
per_round_file = INTERIM_DIR / 'tm_title_race_per_round.csv'
per_round_all = pd.read_csv(per_round_file) if per_round_file.exists() else None

for season in playoff_seasons:
    season_slug = season.replace("/", "_")

    # Load playoff matches
    playoff_file = PLAYOFFS_DIR / f'playoffs_championship_{season_slug}_ligat_haal_wikipedia.csv'
    if not playoff_file.exists():
        print(f"‚ö†Ô∏è Playoff file not found: {playoff_file.name}")
        continue

    playoff_df = pd.read_csv(playoff_file)

    # Normalize team names from Wikipedia format to TM format
    playoff_df['home_team'] = playoff_df['home_team'].map(lambda x: WIKI_TO_TM_MAP.get(x, x))
    playoff_df['away_team'] = playoff_df['away_team'].map(lambda x: WIKI_TO_TM_MAP.get(x, x))

    if per_round_all is None:
        print(f"‚ö†Ô∏è Per-round file not found")
        continue

    season_data = per_round_all[per_round_all['season'] == season]
    if season_data.empty:
        print(f"‚ö†Ô∏è No regular season data for {season}")
        continue

    # The per-round file only stores the leader of each round, so the full
    # regular-season table is rebuilt from the season's match list.
    matches_file = Path(matches_dir) / f'matches_{season_slug}_ligat_haal_transfermarkt.csv'
    if not matches_file.exists():
        print(f"‚ö†Ô∏è Matches file not found for {season}")
        continue

    matches_df = pd.read_csv(matches_file)
    regular_standings = _regular_season_table(matches_df)

    # Determine number of playoff teams based on unique teams in playoff file
    playoff_teams_unique = sorted(set(playoff_df['home_team']).union(set(playoff_df['away_team'])))
    num_playoff_teams = len(playoff_teams_unique)

    # Points are halved before the playoff in these two seasons only
    apply_halving = season in ['2009/10', '2010/11']

    # Calculate playoff leaders; a failure in one season must not stop the others
    try:
        playoff_leaders_df = calculate_playoff_leaders(
            season, regular_standings, playoff_df, num_playoff_teams, apply_halving
        )
        playoff_leaders_df.insert(0, 'season', season)
        playoff_results.append(playoff_leaders_df)

        changes = playoff_leaders_df[playoff_leaders_df['first_place_changed']]
        halving_note = " (◊ó◊ï◊ß ◊ß◊ô◊ñ◊ï◊ñ - ◊ó◊¶◊ô ◊†◊ß◊ï◊ì◊ï◊™)" if apply_halving else ""
        print(f"‚úÖ {season}: {len(changes)} leadership changes in playoff ({num_playoff_teams} teams, rounds 0-{playoff_leaders_df['round'].max()}){halving_note}")
    except Exception as e:
        print(f"‚ùå Error processing playoff for {season}: {e}")
        # Debug: show both name sets so a spelling mismatch is easy to spot
        regular_teams = sorted(regular_standings.head(num_playoff_teams)['team'].tolist())
        print(f"   Playoff teams ({len(playoff_teams_unique)}): {playoff_teams_unique}")
        print(f"   Regular top {num_playoff_teams}: {regular_teams}")

if playoff_results:
    playoff_all = pd.concat(playoff_results, ignore_index=True)

    # Summary over the played rounds only (round 0 is the carried-over starting table).
    # Distinct leaders are counted in the same aggregation so every value stays
    # attached to its season instead of being aligned by position.
    played_rounds = playoff_all[playoff_all['round'] > 0]
    playoff_summary = played_rounds.groupby('season').agg(
        playoff_leadership_changes=('first_place_changed', 'sum'),
        playoff_first_leader=('leader', 'first'),
        playoff_last_leader=('leader', 'last'),
        playoff_rounds=('round', 'max'),
        playoff_distinct_leaders=('leader', 'nunique'),
    ).reset_index()

    # Save
    out_playoff_summary = INTERIM_DIR / 'playoff_championship_leadership_changes.csv'
    playoff_summary.to_csv(out_playoff_summary, index=False, encoding='utf-8-sig')

    out_playoff_detail = INTERIM_DIR / 'playoff_championship_per_round.csv'
    playoff_all.to_csv(out_playoff_detail, index=False, encoding='utf-8-sig')

    print(f"\nüìä Championship Playoff Leadership Changes Summary:")
    display(playoff_summary)

    print(f"\nüíæ Saved playoff summary to: {out_playoff_summary}")
    print(f"üíæ Saved playoff per-round to: {out_playoff_detail}")
else:
    print("\n‚ùå No playoff data processed")


‚úÖ 2009/10: 1 leadership changes in playoff (6 teams, rounds 0-5) (◊ó◊ï◊ß ◊ß◊ô◊ñ◊ï◊ñ - ◊ó◊¶◊ô ◊†◊ß◊ï◊ì◊ï◊™)
‚úÖ 2010/11: 2 leadership changes in playoff (6 teams, rounds 0-5) (◊ó◊ï◊ß ◊ß◊ô◊ñ◊ï◊ñ - ◊ó◊¶◊ô ◊†◊ß◊ï◊ì◊ï◊™)
‚úÖ 2011/12: 0 leadership changes in playoff (8 teams, rounds 0-7)
‚úÖ 2012/13: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2013/14: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2014/15: 3 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2015/16: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2016/17: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2017/18: 2 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2018/19: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2019/20: 2 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2020/21: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2021/22: 0 leadership changes in playoff (6 teams, rounds 0-10)
‚úÖ 2022/23: 0 leadership changes i

Unnamed: 0,season,playoff_leadership_changes,playoff_first_leader,playoff_last_leader,playoff_rounds,playoff_distinct_leaders
0,2009/10,1,Maccabi Haifa,Hapoel Tel Aviv,5,2
1,2010/11,2,Maccabi Haifa,Maccabi Haifa,5,2
2,2011/12,0,Kiryat Shmona,Kiryat Shmona,7,1
3,2012/13,0,M. Tel Aviv,M. Tel Aviv,10,1
4,2013/14,0,M. Tel Aviv,M. Tel Aviv,10,1
5,2014/15,3,M. Tel Aviv,M. Tel Aviv,10,3
6,2015/16,0,H. Beer Sheva,H. Beer Sheva,10,1
7,2016/17,0,H. Beer Sheva,H. Beer Sheva,10,1
8,2017/18,2,H. Beer Sheva,H. Beer Sheva,10,2
9,2018/19,0,M. Tel Aviv,M. Tel Aviv,10,1



üíæ Saved playoff summary to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\playoff_championship_leadership_changes.csv
üíæ Saved playoff per-round to: c:\Users\nitib\dev-lab\ligat_haal_project\ligat_haal_project\notebooks\data\interim\playoff_championship_per_round.csv
