
התקנת ספריות
> Add blockquote



In [9]:
from pathlib import Path

# אם ה-notebook נמצא בתוך ligat_haal_project/notebooks
ROOT = Path.cwd().parent
DATA_DIR = ROOT / "data" / "raw"   # או "data" אם זה המבנה שלך

DATA_DIR.mkdir(parents=True, exist_ok=True)  # יוודא שהתיקייה קיימת


In [10]:
!pip -q install pandas numpy matplotlib scipy pyjanitor



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defining the project baseline

In [11]:
import os

# זה הנתיב לתיקיה שפתחת ב-My Drive
BASE = "/content/drive/MyDrive/ligat_haal_project"

# יצירת תיקיות משנה (אם עוד לא קיימות)
for p in ["data/raw", "data/interim", "data/processed", "notebooks", "reports/figures"]:
    os.makedirs(f"{BASE}/{p}", exist_ok=True)

print("Project base path:", BASE)


Project base path: /content/drive/MyDrive/ligat_haal_project


In [12]:
with open(f"{BASE}/data/raw/test.txt", "w", encoding="utf-8") as f:
    f.write("שלום מהשותף הראשון :)")

print("נוצר קובץ בדיקה!")


נוצר קובץ בדיקה!


Milking sample data for the 2022/2023 season from the football API

In [14]:
# ====== ליגת העל בלבד (APISports) – הצגה + הורדה ======
!pip -q install pandas requests python-dateutil

import os, requests, time
import pandas as pd
from dateutil import parser as dateparser


# חיבור דרייב (רק אם מריצים ב-Colab)
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except ImportError:
    print("Not running in Google Colab, skipping drive.mount.")
BASE = "/content/drive/MyDrive/ligat_haal_project"
os.makedirs(f"{BASE}/data/raw", exist_ok=True)

# הכנס כאן את המפתח שלך
APISPORTS_KEY = "26eba84b29785aa330e897405019faf2"  # <<< להחליף

BASE_URL = "https://v3.football.api-sports.io"
HEADERS = {"x-apisports-key": APISPORTS_KEY}

def api_get(path, params=None, pause=0.6):
    r = requests.get(f"{BASE_URL}{path}", headers=HEADERS, params=params or {}, timeout=30)
    if r.status_code != 200:
        raise RuntimeError(f"API error {r.status_code}: {r.text[:300]}")
    time.sleep(pause)
    return r.json()

# --- שלב 1: הצגת כל הליגות בישראל כדי שתראה מה קיים ---
leagues = api_get("/leagues", {"country": "Israel"})
rows = []
for item in leagues.get("response", []):
    lg = item.get("league", {})
    cn = item.get("country", {})
    if cn.get("name") == "Israel":
        rows.append({
            "id": lg.get("id"),
            "name": lg.get("name"),
            "type": lg.get("type"),
        })
israel_leagues_df = pd.DataFrame(rows).sort_values(by=["type","name"]).reset_index(drop=True)
print("ליגות שנמצאו בישראל:")
display(israel_leagues_df)

# --- שלב 2: בחירה חכמה של ליגת העל ---
PREFERRED_NAMES = {
    "ligat ha'al",
    "ligat ha’al",     # גרסאות אפוסטרוף
    "ligat ha`al",
    "israeli premier league",
    "premier league",  # לפעמים מסומן כך
}

def choose_israeli_premier(df: pd.DataFrame):
    # סינון רק מסוג 'League' (לא גביעים)
    df = df[df["type"].str.lower() == "league"].copy()
    # ניסיון התאמה לפי שם
    for _, row in df.iterrows():
        name_norm = (row["name"] or "").lower().strip()
        if any(p in name_norm for p in PREFERRED_NAMES):
            return int(row["id"]), row["name"]
    # fallback: אם לא מצאנו בשם, נעדיף זו עם 'Ligat' בשם
    for _, row in df.iterrows():
        if "ligat" in (row["name"] or "").lower():
            return int(row["id"]), row["name"]
    # אחרון חביב: נחזיר את הראשונה מסוג League
    if not df.empty:
        r0 = df.iloc[0]
        return int(r0["id"]), r0["name"]
    return None, None

LEAGUE_ID, LEAGUE_NAME = choose_israeli_premier(israel_leagues_df)
assert LEAGUE_ID is not None, "לא נמצא מזהה ליגת העל."
print(f"נבחרה ליגה: {LEAGUE_NAME} (ID={LEAGUE_ID})")

# --- שלב 3: הורדת עונה ספציפית של ליגת העל ---
SEASON_YEAR = 2022   # שנה של תחילת העונה: 2022 -> 2022/23 (שנה לפתיחה אפשר להחליף)

fx = api_get("/fixtures", {"league": LEAGUE_ID, "season": SEASON_YEAR, "timezone": "UTC"})
rows = []
for item in fx.get("response", []):
    fixture = item.get("fixture", {})
    league  = item.get("league", {})
    teams   = item.get("teams", {})
    goals   = item.get("goals", {})

    dt = fixture.get("date")
    try:
        dt = dateparser.parse(dt).strftime("%Y-%m-%d") if dt else None
    except:
        dt = None

    rows.append({
        "season": f"{SEASON_YEAR}/{str(SEASON_YEAR+1)[-2:]}",   # "2022/23"
        "date": dt,
        "round": league.get("round"),
        "stage": league.get("name"),                            # Regular Season / Championship Round...
        "home_team": teams.get("home", {}).get("name"),
        "away_team": teams.get("away", {}).get("name"),
        "home_goals": goals.get("home"),
        "away_goals": goals.get("away"),
        "venue": fixture.get("venue", {}).get("name"),
        "referee": fixture.get("referee"),
        "fixture_id": fixture.get("id"),
        "league_id": league.get("id"),
        "league_name": league.get("name"),
    })

df = pd.DataFrame(rows)

# בדיקה מהירה שלא לקחנו בטעות ליגה א'/לאומית:
print("שמות ליגה ייחודיים בטבלה:", df["league_name"].dropna().unique().tolist())

csv_path = f"{BASE}/data/raw/matches_{SEASON_YEAR}_{str(SEASON_YEAR+1)[-2:]}_ligat_haal.csv"
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"נשמר קובץ: {csv_path} | שורות: {len(df)}")
display(df.head(10))



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Not running in Google Colab, skipping drive.mount.
ליגות שנמצאו בישראל:


Unnamed: 0,id,name,type
0,384,State Cup,Cup
1,659,Super Cup,Cup
2,385,Toto Cup Ligat Al,Cup
3,496,Liga Alef,League
4,382,Liga Leumit,League
5,383,Ligat Ha'al,League


נבחרה ליגה: Ligat Ha'al (ID=383)
שמות ליגה ייחודיים בטבלה: ["Ligat Ha'al"]
נשמר קובץ: /content/drive/MyDrive/ligat_haal_project/data/raw/matches_2022_23_ligat_haal.csv | שורות: 240


Unnamed: 0,season,date,round,stage,home_team,away_team,home_goals,away_goals,venue,referee,fixture_id,league_id,league_name
0,2022/23,2022-08-20,Regular Season - 1,Ligat Ha'al,Hapoel Haifa,Hapoel Tel Aviv,2,0,Sammy Ofer Stadium,O. Grinfeeld,865835,383,Ligat Ha'al
1,2022/23,2022-08-20,Regular Season - 1,Ligat Ha'al,Hapoel Katamon,Hapoel Hadera,1,1,HaMoshava Stadium,A. Shiloach,865840,383,Ligat Ha'al
2,2022/23,2022-08-20,Regular Season - 1,Ligat Ha'al,Maccabi Netanya,Beitar Jerusalem,4,1,Netanya Stadium,R. Reinshreiber,865837,383,Ligat Ha'al
3,2022/23,2022-08-21,Regular Season - 1,Ligat Ha'al,Maccabi Tel Aviv,Maccabi Bnei Raina,5,0,Bloomfield Stadium,I. Frid,865841,383,Ligat Ha'al
4,2022/23,2022-08-22,Regular Season - 1,Ligat Ha'al,Sektzia Nes Tziona,Ironi Kiryat Shmona,0,2,HaMoshava Stadium,Y. Mizrahi,865839,383,Ligat Ha'al
5,2022/23,2022-08-27,Regular Season - 2,Ligat Ha'al,Ironi Kiryat Shmona,Hapoel Katamon,1,1,Kiryat-Shmona Municipal Stadium,O. Na'al,865844,383,Ligat Ha'al
6,2022/23,2022-08-27,Regular Season - 2,Ligat Ha'al,Hapoel Tel Aviv,Bnei Sakhnin,0,2,Bloomfield Stadium,R. Reinshreiber,865848,383,Ligat Ha'al
7,2022/23,2022-08-27,Regular Season - 2,Ligat Ha'al,Maccabi Haifa,Maccabi Netanya,4,1,Sammy Ofer Stadium,S. Levi,865847,383,Ligat Ha'al
8,2022/23,2022-08-27,Regular Season - 2,Ligat Ha'al,Ashdod,Sektzia Nes Tziona,1,0,Yud-Alef Stadium,O. Asulin,865845,383,Ligat Ha'al
9,2022/23,2022-08-28,Regular Season - 2,Ligat Ha'al,Maccabi Bnei Raina,Hapoel Haifa,1,1,Green Stadium,S. Ben Avraham,865842,383,Ligat Ha'al


Filter out irrelevant columns

In [None]:
# === העשרת הטבלה + ניקוי עמודות מיותרות ===
import re
import pandas as pd

BASE = "/content/drive/MyDrive/ligat_haal_project"
in_path  = f"{BASE}/data/raw/matches_2022_23_ligat_haal.csv"   # שנה לקובץ שלך
out_path = f"{BASE}/data/interim/matches_2022_23_enriched.csv"

df = pd.read_csv(in_path)

# --- עמודות עזר ---
# 1) שנה מספרית לפתיחת העונה
#df["season_year"] = df["season"].str.slice(0,4).astype(int)

# 2) מספר מחזור ו-phase
def parse_round(r):
    # דוגמאות: "Regular Season - 1", "Championship Round - 5"
    if pd.isna(r):
        return (None, None)
    r = str(r)
    m = re.search(r"(Regular|Championship|Relegation).*?(\d+)", r, flags=re.I)
    phase = None
    if "regular" in r.lower():      phase = "regular"
    elif "championship" in r.lower(): phase = "championship"
    elif "relegation" in r.lower():   phase = "relegation"
    round_num = int(m.group(2)) if m else None
    return (phase, round_num)

tmp = df["round"].apply(parse_round).tolist()
df["phase"] = [t[0] for t in tmp]
df["round_num"] = [t[1] for t in tmp]

# 3) הפרש שערים, תוצאה, נקודות
df["goal_diff"] = df["home_goals"] - df["away_goals"]
df["result"] = df["goal_diff"].apply(lambda x: "H" if x>0 else ("A" if x<0 else "D"))
df["home_points"] = df["result"].map({"H":3, "D":1, "A":0})
df["away_points"] = df["result"].map({"H":0, "D":1, "A":3})

# 4) דגל משחק חד-צדדי (למשל |GD|>=3)
df["one_sided"] = (df["goal_diff"].abs() >= 3).astype(int)

# 5) עמודות לא רלוונטיות להסרה (כפי שביקשת)
drop_cols = ["league_id","league_name","fixture_id"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# 6) סדר עמודות נוח
cols = [
    "season","season_year","date","phase","round_num","stage",
    "home_team","away_team","home_goals","away_goals","goal_diff","result",
    "home_points","away_points","one_sided","venue","referee"
]
df = df[[c for c in cols if c in df.columns]]

df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("נשמר:", out_path, "| שורות:", len(df))
df.head(10)


נשמר: /content/drive/MyDrive/ligat_haal_project/data/interim/matches_2022_23_enriched.csv | שורות: 240


Unnamed: 0,season,date,phase,round_num,stage,home_team,away_team,home_goals,away_goals,goal_diff,result,home_points,away_points,one_sided,venue,referee
0,2022/23,2022-08-20,regular,1,Ligat Ha'al,Hapoel Haifa,Hapoel Tel Aviv,2,0,2,H,3,0,0,Sammy Ofer Stadium,O. Grinfeeld
1,2022/23,2022-08-20,regular,1,Ligat Ha'al,Hapoel Katamon,Hapoel Hadera,1,1,0,D,1,1,0,HaMoshava Stadium,A. Shiloach
2,2022/23,2022-08-20,regular,1,Ligat Ha'al,Maccabi Netanya,Beitar Jerusalem,4,1,3,H,3,0,1,Netanya Stadium,R. Reinshreiber
3,2022/23,2022-08-21,regular,1,Ligat Ha'al,Maccabi Tel Aviv,Maccabi Bnei Raina,5,0,5,H,3,0,1,Bloomfield Stadium,I. Frid
4,2022/23,2022-08-22,regular,1,Ligat Ha'al,Sektzia Nes Tziona,Ironi Kiryat Shmona,0,2,-2,A,0,3,0,HaMoshava Stadium,Y. Mizrahi
5,2022/23,2022-08-27,regular,2,Ligat Ha'al,Ironi Kiryat Shmona,Hapoel Katamon,1,1,0,D,1,1,0,Kiryat-Shmona Municipal Stadium,O. Na'al
6,2022/23,2022-08-27,regular,2,Ligat Ha'al,Hapoel Tel Aviv,Bnei Sakhnin,0,2,-2,A,0,3,0,Bloomfield Stadium,R. Reinshreiber
7,2022/23,2022-08-27,regular,2,Ligat Ha'al,Maccabi Haifa,Maccabi Netanya,4,1,3,H,3,0,1,Sammy Ofer Stadium,S. Levi
8,2022/23,2022-08-27,regular,2,Ligat Ha'al,Ashdod,Sektzia Nes Tziona,1,0,1,H,3,0,0,Yud-Alef Stadium,O. Asulin
9,2022/23,2022-08-28,regular,2,Ligat Ha'al,Maccabi Bnei Raina,Hapoel Haifa,1,1,0,D,1,1,0,Green Stadium,S. Ben Avraham


Average audience of the team during the season

In [None]:
# === EFS attendance for Israel: fetch, clean, merge with matches ===
import os, re
import pandas as pd

BASE = "/content/drive/MyDrive/ligat_haal_project"
raw_out = f"{BASE}/data/raw/efs_israel_attendance.csv"
merged_out = f"{BASE}/data/interim/matches_2022_23_with_home_avg_attendance.csv"

URL = "https://www.european-football-statistics.co.uk/attn/aveisr.htm"

def load_efs_israel(url=URL):
    # נסה לקרוא את כל הטבלאות בעמוד
    tables = pd.read_html(url, flavor="lxml")  # לפעמים "bs4" גם עובד; נתחיל מ-lxml
    # נחפש טבלה עם עמודות שמזוהות כלוגיות: Season/Club/Avg (או שמות קרובים)
    candidate = None
    for t in tables:
        cols = [str(c).strip().lower() for c in t.columns]
        # התאמות אפשריות
        has_season = any(c in cols for c in ["season","seasons","year","years"])
        has_club   = any(c in cols for c in ["club","team","clubs","teams"])
        has_avg    = any(c in cols for c in ["avg","average","average attendance","attendance avg","avg."])
        if has_season and has_club and has_avg:
            candidate = t.copy()
            break
    if candidate is None:
        raise RuntimeError("לא נמצאה טבלה עם Season/Club/Avg בעמוד EFS.")
    # מיפוי שמות עמודות נפוצים לבסיס קבוע
    rename_map = {}
    for c in candidate.columns:
        lc = str(c).strip().lower()
        if lc in ["season","seasons","year","years"]:
            rename_map[c] = "season"
        elif lc in ["club","team","clubs","teams"]:
            rename_map[c] = "team"
        elif lc in ["avg","average","average attendance","attendance avg","avg."]:
            rename_map[c] = "avg_attendance"
    df = candidate.rename(columns=rename_map)
    # שמור רק עמודות שאנחנו צריכים
    keep = [c for c in ["season","team","avg_attendance"] if c in df.columns]
    df = df[keep].copy()

    # ניקוי season לצורה "YYYY/YY"
    def norm_season(s):
        s = str(s).strip()
        # דוגמאות: "2022-23", "2022/23", "2022 – 2023", "2022"
        s = re.sub(r"[–—\-]+", "/", s)   # החלפת מקפים ל-"/"
        s = re.sub(r"\s+", "", s)
        if re.match(r"^\d{4}/\d{2}$", s):
            return s
        m = re.match(r"^(\d{4})/(\d{4})$", s)
        if m:
            return f"{m.group(1)}/{m.group(2)[-2:]}"
        m2 = re.match(r"^(\d{4})$", s)
        if m2:  # עונה יחידה – נשאיר 4 ספרות
            return s
        return s

    df["season"] = df["season"].apply(norm_season)

    # ניקוי ממוצע קהל -> מספר שלם
    def to_int(x):
        s = str(x)
        s = s.replace(",", "").replace(".", "")
        s = re.sub(r"[^\d]", "", s)
        return int(s) if s.isdigit() else pd.NA
    df["avg_attendance"] = df["avg_attendance"].apply(to_int)

    # סטנדרטיזציה קלה לשמות קבוצות (תוכל להרחיב לפי הנתונים שלך)
    def norm_team(s):
        s = str(s).strip()
        s = s.replace("Tel-Aviv","Tel Aviv")
        s = s.replace("Kiryat-Shmona","Kiryat Shmona")
        s = s.replace("Be'er-Sheva","Be'er Sheva").replace("Beer-Sheva","Be'er Sheva")
        s = s.replace("Hapoel Kfar-Saba","Hapoel Kfar Saba")
        return s
    df["team"] = df["team"].apply(norm_team)

    # הסרת שורות ריקות/טורים מסכמים אם קיימים
    df = df.dropna(subset=["team","avg_attendance"])
    df = df[df["team"].str.len() > 0]
    return df.reset_index(drop=True)

try:
    efs = load_efs_israel(URL)
    os.makedirs(f"{BASE}/data/raw", exist_ok=True)
    efs.to_csv(raw_out, index=False, encoding="utf-8-sig")
    print("נשמר קובץ EFS נקי ל:", raw_out, "| שורות:", len(efs))
    display(efs.head(10))
except Exception as e:
    print("קריאת EFS נכשלה:", e)
    print("פתרון מהיר: פתח ידנית את ה-URL בדפדפן, עשה 'שמור בשם' לקובץ HTML, העלה אותו ל-Drive למשל ל:", f"{BASE}/data/raw/aveisr_offline.html")
    print("ואז הרץ:")
    print("""
import pandas as pd
tables = pd.read_html(f"{BASE}/data/raw/aveisr_offline.html", flavor="lxml")
# ואז המשך מהחלק של בחירת candidate והניקוי כפי שמופיע למעלה
""")

# אם קובץ המשחקים המעושר קיים – נחבר ממוצע קהל לביתית לפי season+team
matches_path = f"{BASE}/data/interim/matches_2022_23_enriched.csv"
if os.path.exists(matches_path) and os.path.exists(raw_out):
    matches = pd.read_csv(matches_path)
    efs = pd.read_csv(raw_out)
    merged = matches.merge(
        efs[["season","team","avg_attendance"]],
        left_on=["season","home_team"],
        right_on=["season","team"],
        how="left"
    ).drop(columns=["team"])
    merged = merged.rename(columns={"avg_attendance":"home_avg_attendance"})
    merged.to_csv(merged_out, index=False, encoding="utf-8-sig")
    print("נשמר קובץ ממוזג (ממוצע קהל לביתית):", merged_out, "| שורות:", len(merged))
    display(merged.head(10))
else:
    print("דלגתי על המיזוג כי חסר קובץ:", matches_path, "או", raw_out)


קריאת EFS נכשלה: לא נמצאה טבלה עם Season/Club/Avg בעמוד EFS.
פתרון מהיר: פתח ידנית את ה-URL בדפדפן, עשה 'שמור בשם' לקובץ HTML, העלה אותו ל-Drive למשל ל: /content/drive/MyDrive/ligat_haal_project/data/raw/aveisr_offline.html
ואז הרץ:

import pandas as pd
tables = pd.read_html(f"{BASE}/data/raw/aveisr_offline.html", flavor="lxml")
# ואז המשך מהחלק של בחירת candidate והניקוי כפי שמופיע למעלה

דלגתי על המיזוג כי חסר קובץ: /content/drive/MyDrive/ligat_haal_project/data/interim/matches_2022_23_enriched.csv או /content/drive/MyDrive/ligat_haal_project/data/raw/efs_israel_attendance.csv


In [None]:
import pandas as pd

BASE = "/content/drive/MyDrive/ligat_haal_project"

# כתובת העמוד של עונת 2022/23
url = "https://www.transfermarkt.com/ligat-haal/besucherzahlen/wettbewerb/ISR1?saison_id=2022"

# קריאת כל הטבלאות מהעמוד
tables = pd.read_html(url)

print("מספר טבלאות שנמצאו:", len(tables))
for i, t in enumerate(tables):
    print(f"--- טבלה {i} ---")
    print(t.head())


HTTPError: HTTP Error 403: Forbidden