<a href="https://colab.research.google.com/github/RockZeroAxl/Tsuki/blob/main/TNM_Staging_Logic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from typing import Tuple, Dict

class TNMExtractor:
    """Extract pathological TNM staging (pT / pN / pM) from free-text pathology reports.

    Two strategies are used, in order:

    1. A "summary line" pattern that reads T, N and M in one pass from a line
       introduced by a header such as ``Pathologic stage:`` or ``pTNM:``.
    2. Independent full-text searches for ``pT``, ``pN`` and ``pM``, used for
       every category the summary line did not provide.

    Demonstrates: Advanced Regex, Rule-based NLP, Clinical Logic
    """

    def __init__(self):
        # Stand-alone category patterns used by the full-text fallback.
        # Supported forms: pT1N0M0, pT2a, pN1mi, "pT: 2", "pN：1", etc.
        # All three accept an optional ASCII or full-width colon after the
        # prefix so that "pT: 2" is handled the same way as "pN: 1".
        self.pt_pattern = re.compile(r'\bpT\s*[:：]?\s*(?P<T>ISDC|IS|X|[0-4][A-Ca-c]?)', re.IGNORECASE)
        self.pn_pattern = re.compile(r'\bpN\s*[:：]?\s*(?P<N>X|0|[1-3](?:MI|[A-Ca-c])?)', re.IGNORECASE)
        self.pm_pattern = re.compile(r'\bpM\s*[:：]?\s*(?P<M>X|[01])', re.IGNORECASE)

        # Summary-line pattern: captures T (required) plus optional N and M
        # from a single line. `.` does not match newlines here, so N and M
        # must appear on the same line as T. The N/M prefix ([pc]) is
        # captured separately so clinical values (cN / cM) can be rejected.
        # NOTE(review): `pT?` requires a leading "p" and makes the "T"
        # optional; confirm this is intended rather than `p?T`.
        self.summary_pattern = re.compile(r"""
            (?:Pathologic(?:al)?\s*(?:TNM\s*)?stage|p?TNM)\s*[:：\-]\s*    # header
            pT?\s*(?P<T>ISDC|IS|X|[0-4][A-Ca-c]?)                         # T
            (?:.*?(?P<Npref>[pc]?)N(?P<Nval>X|0|[1-3](?:MI|[A-Ca-c])?))?  # N (optional)
            (?:.*?(?P<Mpref>[pc]?)M(?P<Mval>X|[01]))?                     # M (optional)
        """, re.IGNORECASE | re.VERBOSE)

    def extract(self, text: str) -> Dict[str, str]:
        """Return the pathological T, N and M categories found in ``text``.

        The summary line is tried first; any category it does not supply
        (including N/M values rejected because they carry a clinical ``c``
        prefix) is then looked up with the stand-alone full-text patterns.

        Args:
            text: Report text. ``None`` or an empty string yields an
                all-empty result.

        Returns:
            A dict with keys ``'pT'``, ``'pN'`` and ``'pM'``. Values are
            upper-cased category strings (e.g. ``'1C'``, ``'1MI'``, ``'X'``)
            or ``''`` when the category was not found.
        """
        result = {'pT': '', 'pN': '', 'pM': ''}
        if not text:
            return result

        # Strategy 1: read everything from the summary line in one pass.
        summary_match = self.summary_pattern.search(text)
        if summary_match:
            result['pT'] = (summary_match.group('T') or '').upper()

            # Skip N / M values explicitly marked as clinical (cN / cM).
            if (summary_match.group('Npref') or '').lower() != 'c':
                result['pN'] = (summary_match.group('Nval') or '').upper()
            if (summary_match.group('Mpref') or '').lower() != 'c':
                result['pM'] = (summary_match.group('Mval') or '').upper()

        # Strategy 2: full-text fallback for every category still missing.
        # There is no early return after the summary: a summary that gives
        # only T and N must still allow pM to be picked up elsewhere.
        fallbacks = (
            ('pT', self.pt_pattern, 'T'),
            ('pN', self.pn_pattern, 'N'),
            ('pM', self.pm_pattern, 'M'),
        )
        for key, pattern, group in fallbacks:
            if result[key]:
                continue
            found = pattern.search(text)
            if found:
                result[key] = found.group(group).upper()

        return result

    def validate_logic(self, tnm_dict: Dict[str, str]) -> Dict[str, str]:
        """Apply simple registry-style consistency checks and fill defaults.

        Checks (reported via ``print``, never raised):
            * In-situ disease (pT ``IS`` / ``ISDC``) with a pN other than
              ``0`` / ``X`` / empty.
            * In-situ disease with pM ``1``.

        Defaults: an empty or missing ``pT`` or ``pN`` is set to ``'X'``.
        ``pM`` is left untouched.

        Args:
            tnm_dict: Dict as produced by :meth:`extract`. It is modified
                in place.

        Returns:
            The same ``tnm_dict`` object, after defaults are applied.
        """
        if tnm_dict.get('pT') in ('IS', 'ISDC'):
            node = tnm_dict.get('pN')
            if node not in (None, '', '0', 'X'):
                print(f"Warning: Logic Conflict - In-Situ cancer should not have nodal involvement (pN{node}).")
            if tnm_dict.get('pM') == '1':
                print("Warning: Logic Conflict - In-Situ cancer should not have Metastasis (M1).")

        # Fill defaults for categories that could not be determined.
        if not tnm_dict.get('pT'):
            tnm_dict['pT'] = 'X'
        if not tnm_dict.get('pN'):
            tnm_dict['pN'] = 'X'

        return tnm_dict