In [5]:
from pathlib import Path
# Show the kernel's working directory; the relative "../data_raw/..." paths
# used by later cells are resolved against it.
print(str(Path.cwd()))

c:\Users\ersan\OneDrive\Masaüstü\ders\ceng481\NBA-Game-Prediction-using-Artificial-Neural-Networks\notebooks


In [16]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path

def download_latest_injury_pdf(base_url: str, save_dir: Path) -> Path | None:
    """
    NBA official injury report sayfasƒ±ndaki en g√ºncel PDF'i bulur ve indirir.
    Returns: ƒ∞ndirilen PDF yolunu d√∂ner veya None
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    # Sayfa i√ßeriƒüini √ßek
    resp = requests.get(base_url, headers=headers)
    if resp.status_code != 200:
        print(f"‚ùå Sayfa alƒ±namadƒ±: {resp.status_code}")
        return None

    soup = BeautifulSoup(resp.content, "html.parser")
    links = soup.select("a[href*='Injury-Report_']")

    if not links:
        print("‚ö†Ô∏è Hi√ß PDF linki bulunamadƒ±.")
        return None

    # PDF linklerini √ßƒ±kar (tarih + saat bilgisiyle sƒ±ralayacaƒüƒ±z)
    pdf_links = []
    for link in links:
        href = link.get("href")
        if "Injury-Report_" in href and href.endswith(".pdf"):
            full_url = href if href.startswith("http") else f"https://ak-static.cms.nba.com{href}"
            timestamp = full_url.split("Injury-Report_")[1].replace(".pdf", "")
            pdf_links.append((timestamp, full_url))

    # Saat olarak en yeni PDF'i bul
    def parse_ts(ts):
        try:
            return datetime.strptime(ts, "%Y-%m-%d_%I%M%p")
        except:
            return datetime.min

    pdf_links.sort(key=lambda x: parse_ts(x[0]), reverse=True)
    latest_ts, latest_url = pdf_links[0]

    # PDF‚Äôi indir
    save_dir.mkdir(parents=True, exist_ok=True)
    filename = f"Injury-Report_{latest_ts}.pdf"
    save_path = save_dir / filename

    r = requests.get(latest_url, headers=headers)
    with open(save_path, "wb") as f:
        f.write(r.content)

    print(f"üìÑ En g√ºncel PDF kaydedildi: {save_path}")
    return save_path


In [None]:
from pathlib import Path

# Running this cell downloads the newest injury-report PDF into the raw-data folder.
download_latest_injury_pdf(
    save_dir=Path("../data_raw/injury_reports_raw"),
    base_url="https://official.nba.com/nba-injury-report-2025-26-season/",
)

📄 En güncel PDF kaydedildi: ..\data_raw\injury_reports_raw\Injury-Report_2025-11-16_12PM.pdf


WindowsPath('../data_raw/injury_reports_raw/Injury-Report_2025-11-16_12PM.pdf')

In [25]:
import pdfplumber
import pandas as pd
from pathlib import Path

def parse_injury_pdf_text(pdf_path: Path, csv_path: Path | None = None) -> pd.DataFrame | None:
    """
    Tablolu olmayan PDF'ten metin bazlƒ± injury satƒ±rlarƒ±nƒ± parse eder.
    """
    if not pdf_path.exists():
        print(f"‚ùå PDF bulunamadƒ±: {pdf_path}")
        return None

    players = []
    current_team = None
    current_game = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = page.extract_text().split("\n")
            for line in lines:
                if "@" in line and any(t in line for t in ["ET)", "AM", "PM"]):
                    current_game = line.strip()
                elif line.isupper() and " " in line and len(line.split()) <= 4:
                    current_team = line.strip()
                elif any(x in line for x in ["Out", "Questionable", "Probable", "Available", "Doubtful"]):
                    parts = line.strip().split(" ", 2)
                    if len(parts) >= 2:
                        player_name = parts[0] + " " + parts[1]
                        try:
                            status = parts[2].split(" ")[0]
                            reason = parts[2].replace(status, "").strip(" -‚Äì;:")
                        except:
                            status = "Unknown"
                            reason = ""
                        players.append({
                            "game": current_game,
                            "team": current_team,
                            "player_name": player_name,
                            "status": status,
                            "reason": reason
                        })

    df = pd.DataFrame(players)
    if csv_path is None:
        csv_path = pdf_path.with_suffix(".parsed.csv")
    df.to_csv(csv_path, index=False)
    print(f"‚úÖ Injury CSV kaydedildi: {csv_path}")

    return df


In [26]:
# Parse the report downloaded above; the CSV is written next to the PDF
# because no explicit csv_path is given.
pdf_file = Path("../data_raw/injury_reports_raw/Injury-Report_2025-11-16_12PM.pdf")
parse_injury_pdf_text(pdf_path=pdf_file)


✅ Injury CSV kaydedildi: ..\data_raw\injury_reports_raw\Injury-Report_2025-11-16_12PM.parsed.csv


Unnamed: 0,game,team,player_name,status,reason
0,"11/16/2025 03:30(ET) LAC@BOS LAClippers Beal,B...",,"Leonard,Kawhi Out",Injury/Illness-RightAnkle;Sprain,
1,"11/16/2025 03:30(ET) LAC@BOS LAClippers Beal,B...",,"Telfort,Jahmyl Out",GLeague-Two-Way,
2,"11/16/2025 03:30(ET) LAC@BOS LAClippers Beal,B...",,"BostonCeltics HarperJr.,Ron",Out,GLeague-Two-Way
3,"11/16/2025 03:30(ET) LAC@BOS LAClippers Beal,B...",,"Shulga,Max Out",GLeague-Two-Way,
4,"11/16/2025 03:30(ET) LAC@BOS LAClippers Beal,B...",,"Tatum,Jayson Out",Injury/Illness-RightAchilles;Repair,
...,...,...,...,...,...
64,"08:00(ET) ATL@PHX AtlantaHawks Dante,N'Faly Ou...",,"Hendricks,Taylor Out",Unknown,
65,"08:00(ET) ATL@PHX AtlantaHawks Dante,N'Faly Ou...",,"Kessler,Walker Out",Unknown,
66,"08:00(ET) ATL@PHX AtlantaHawks Dante,N'Faly Ou...",,"Niang,Georges Out",Unknown,
67,"08:00(ET) ATL@PHX AtlantaHawks Dante,N'Faly Ou...",,"Tonje,John Out",GLeague-Two-Way,
