In [None]:
!pip install langdetect spacy beautifulsoup4 lxml instructor pydantic
!python -m spacy download en_core_web_sm


In [35]:
import os, re, json, html, string, hashlib, math, unicodedata, random, time
from pathlib import Path
from datetime import datetime, timezone, date
import boto3
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from langdetect import detect, detect_langs, DetectorFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text as sk_text

# Fix langdetect's seed so repeated detections on the same text give the same answer.
DetectorFactory.seed = 42

# --- Runtime configuration: every value below can be overridden via an environment variable ---
AWS_REGION=os.getenv("AWS_REGION","us-west-2")
# Bedrock model ids tried (in this order) when Amazon Translate fails for a chunk.
# Note the env var names say PRIMARY/SECONDARY while the constants say FALLBACK_1/2.
MODEL_TRANSLATE_FALLBACK_1=os.getenv("BEDROCK_TRANSLATE_PRIMARY","amazon.nova-micro-v1:0")
MODEL_TRANSLATE_FALLBACK_2=os.getenv("BEDROCK_TRANSLATE_SECONDARY","anthropic.claude-3-7-sonnet-20250219-v1:0")
# Model used by _ai_split to cut a document into articles.
MODEL_SPLIT=os.getenv("BEDROCK_SPLIT_MODEL","anthropic.claude-3-7-sonnet-20250219-v1:0")
# MODEL_EXTRACT is not referenced in the visible part of the file; MODEL_EXTRACT_DOC is used
# by _extract_doc_fields for document-level metadata.
MODEL_EXTRACT=os.getenv("BEDROCK_EXTRACT_MODEL","amazon.nova-premier-v1:0")
MODEL_EXTRACT_DOC=os.getenv("BEDROCK_EXTRACT_DOC_MODEL","amazon.nova-premier-v1:0:1000k")

# INPUT_DIRS and ALLOWED_EXT are parsed as JSON arrays, so overrides must be valid JSON
# (e.g. INPUT_DIRS='["./a","./b"]'). Extensions are lower-cased and stripped.
INPUT_DIRS=json.loads(os.getenv("INPUT_DIRS","[\"./directives\"]"))
ALLOWED_EXT=set([x.lower().strip() for x in json.loads(os.getenv("ALLOWED_EXT","[\".html\",\".xml\"]"))])
# Accepts "1", "true" or "yes" (case-insensitive) as truthy; anything else is False.
RECURSIVE=os.getenv("RECURSIVE","true").strip().lower() in {"1","true","yes"}
OUT_DIR=os.getenv("OUT_DIR","out")
# Character caps: MAX_DOC_CHARS truncates documents sent to Bedrock; MIN_ART_LEN is the
# minimum article length used by _ai_split/_is_article. MAX_ART_CHARS is not used in the
# visible part of the file.
MAX_DOC_CHARS=int(os.getenv("MAX_DOC_CHARS","180000"))
MAX_ART_CHARS=int(os.getenv("MAX_ART_CHARS","18000"))
MIN_ART_LEN=int(os.getenv("MIN_ART_LEN","300"))
# Location of the persisted taxonomy JSON; defaults to a file inside OUT_DIR.
TAXONOMY_PATH=os.getenv("TAXONOMY_PATH", f"{OUT_DIR}/dynamic_taxonomy.json")

# AWS clients are created at import time, so credentials/region must be resolvable here.
translate=boto3.client("translate",region_name=AWS_REGION)
bedrock=boto3.client("bedrock-runtime",region_name=AWS_REGION)

# Legal boilerplate terms (structural headings, modal verbs, "hereof"-style words) that are
# merged into scikit-learn's English stop-word list below.
EXTRA_STOPWORDS={"section","sections","article","articles","annex","annexe","appendix","appendice","subtitle","title","chapter","chapitre","directive","regulation","regulations","law","act","union","paragraph","subparagraph","recital","dispositif","premier","1er","amended","shall","must","may","including","include","pursuant","accordance","specified","provide","provided","applicable","applicability","applying","applicant","applicants","applying","subject","subjects","thereof","hereof","therein","herein","thereby","hereby","whereas","hereunder","thereunder","among","between","within","without","preamble","scope","purpose","purposes","general","specific"}
# Combined English stop words used by _clean_tokens and the TF-IDF keyword extractor.
STOPWORDS_EN=set(sk_text.ENGLISH_STOP_WORDS)|EXTRA_STOPWORDS

# Japanese structural markers, particles and very common statute vocabulary.
# NOTE(review): not referenced in the visible part of the file — confirm where it is consumed.
STOPWORDS_JA={"第","条","項","号","章","節","款","目次","附則","総則","抄","同","前","又は","及び","並びに","その他","こと","もの","ため","者","うえ","上","下","について","に関する","に係る","する","される","した","して","すると","され","なる","ない","これ","それ","当該","各","同条","政府","国","内閣","大臣","本部","本部長","本法","本章","本条","次項","前項","人工知能","人工知能関連技術","技術","研究開発","活用","推進","計画","基本","基本計画","施策","規定","規範","方針","必要","措置","整備","確保","促進","国際","協力","教育","人材","情報","データ","等","など"}
# Multi-character labels blanked out by _strip_japanese_structural_tokens.
STRUCTURAL_LABEL_JA={"総則","附則","目次","人工知能基本計画","人工知能戦略本部"}

# Lower-case hint phrase -> canonical jurisdiction name. Keys mix country names, regulator
# acronyms and official-journal phrases in several languages.
# NOTE(review): the lookup code is not in the visible part of the file; very short keys such
# as "eu", "uk", "ap" or "mas" would need word-boundary matching to avoid false hits — confirm.
JURIS_HINTS={
    "european union":"European Union","eu":"European Union","gdpr":"European Union","european parliament":"European Union","cjeu":"European Union","commission":"European Union",
    "union européenne":"European Union","journal officiel de l’union européenne":"European Union","directive (ue)":"European Union","parlement européen":"European Union",
    "united states":"United States","usa":"United States","federal register":"United States","congress":"United States","sec. code":"United States",
    "canada":"Canada","crtc":"Canada","osfi":"Canada","privacy act rsc":"Canada","pipa alberta":"Canada","pipa bc":"Canada","qp la loi":"Canada",
    "united kingdom":"United Kingdom","uk":"United Kingdom","ico":"United Kingdom","ofcom":"United Kingdom","fca":"United Kingdom","pra":"United Kingdom",
    "germany":"Germany","bafin":"Germany","bsig":"Germany",
    "france":"France","cnil":"France","code monétaire":"France","code de la consommation":"France",
    "italy":"Italy","gazzetta ufficiale":"Italy","garante":"Italy",
    "spain":"Spain","boe":"Spain","aepd":"Spain",
    "netherlands":"Netherlands","ap":"Netherlands","autoriteit persoonsgegevens":"Netherlands",
    "switzerland":"Switzerland","finma":"Switzerland","revue fédérale":"Switzerland",
    "australia":"Australia","asic":"Australia","oaic":"Australia","acma":"Australia",
    "singapore":"Singapore","mas":"Singapore","pdpa":"Singapore",
    "hong kong":"Hong Kong","hkma":"Hong Kong","sfc":"Hong Kong","pdpo":"Hong Kong",
    "japan":"Japan","pipa japan":"Japan","fsa japan":"Japan","cabinet office order":"Japan",
    "china":"China","中华人民共和国":"China","国务院":"China","全国人民代表大会":"China","全国人大常委会":"China"
}

# Display names of regulators/institutions, one row per jurisdiction group.
# NOTE(review): how these are matched against text is not visible in this chunk — confirm
# whether matching is case-sensitive before adding acronyms.
REGULATOR_HINTS=[
    "European Commission","European Parliament","Council of the European Union","EDPB","EDPS","ESMA","EBA","EIOPA",
    "United States Congress","Federal Trade Commission","Securities and Exchange Commission","FDIC","OCC","CFPB","FINRA","NIST",
    "Canadian Radio-television and Telecommunications Commission","OSFI",
    "UK Information Commissioner's Office","Ofcom","Financial Conduct Authority","Prudential Regulation Authority",
    "BaFin","CNIL","Garante","AEPD","Autoriteit Persoonsgegevens","FINMA",
    "ASIC","OAIC","ACMA","Monetary Authority of Singapore",
    "Hong Kong Monetary Authority","Securities and Futures Commission","Privacy Commissioner for Personal Data",
    "Personal Information Protection Commission"
]

# (sector label, trigger keywords) pairs. All keywords are lower-case.
# NOTE(review): the consumer of this table is outside the visible chunk; presumably a keyword
# hit assigns the label. Short/generic triggers ("ai", "ml", "gas", "state", "fund") will
# over-match unless whole-word matching is used — verify against the caller.
SECTOR_RULES=[
    ("Financial Services",["bank","credit institution","securities","broker","insurer","insurance","reinsurance","fintech","payment","prudential","credit union","investment firm","portfolio","fund","asset manager","trading venue","trading facility","crypto","virtual asset","stablecoin","market abuse","mifid","psd","psd2","psd3","lending","microfinance","wealth management","custody","settlement","clearing"]),
    ("Telecom & Media",["telecom","telecommunications","spectrum","broadcast","streaming","carrier","sms","mms","voice","over-the-top","subscriber","satellite","internet service provider","isp","5g","fiber"]),
    ("Technology & Platforms",["platform","online platform","hosting","cloud","saas","paas","iaas","marketplace","intermediary","algorithmic","ai","artificial intelligence","ml","machine learning","foundation model","general purpose ai","gpai","search engine","app store","social network","content moderation"]),
    ("Healthcare & Pharma",["medical","health","pharma","medicinal","device","clinic","ehr","hipaa","biotech","clinical trial","hospital","telemedicine"]),
    ("Public Sector",["public authority","ministry","municipal","agency","government department","public administration","state","local authority"]),
    ("Energy & Utilities",["energy","electricity","gas","utility","grid","renewable","pipeline","oil","mining","nuclear","water","waste"]),
    ("Retail & Consumer",["retail","consumer","ecommerce","distance selling","consommateur","rétractation","marketplaces","buy now pay later","bnpl","loyalty"]),
    ("Transportation",["aviation","rail","maritime","shipping","road transport","logistics","vehicle","driver","autonomous vehicle","rideshare","drone"]),
    ("Education",["school","university","student","educational","edtech","curriculum"]),
    ("Manufacturing & Industry",["factory","industrial","manufacturing","supply chain","production","standards"]),
    ("Real Estate",["real estate","property","landlord","tenant","mortgage","construction"])
]

# (business activity label, trigger keywords) pairs, same shape as SECTOR_RULES.
# Some keywords appear under more than one label (e.g. "settlement", "supply chain",
# "foundation model"), so a single phrase can map to several labels.
# NOTE(review): consumer not visible in this chunk — confirm matching semantics.
ACTIVITY_RULES=[
    ("Data Processing",["process personal data","processing","controller","processor","data subject","consent","profiling","retention","pseudonymisation","anonymisation","data sharing","data transfer","cross-border","automated decision"]),
    ("Customer Onboarding & KYC",["know your customer","kyc","due diligence","customer due diligence","aml","ctf","sanctions","screening","identity verification","onboarding"]),
    ("Risk & Compliance",["risk management","governance","compliance","internal control","audit","reporting","supervisory","enforcement","penalty","fine","complaint handling","whistleblowing","internal policy"]),
    ("Payments & Transfers",["payment service","payment institution","e-money","remittance","transfer","settlement","instant payment","card","token","wallet"]),
    ("Cybersecurity",["security","breach","incident","vulnerability","encryption","authentication","cybersecurity","network and information security","nis","nis2","penetration test","incident response"]),
    ("AI Systems",["ai system","foundation model","general purpose ai","gpai","high-risk","risk management framework","human oversight","model transparency","dataset","hallucination","bias"]),
    ("Marketing & Communications",["marketing","direct marketing","electronic communications","cookies","telemarketing","consent for marketing","adtech","tracking"]),
    ("Operational Resilience",["business continuity","disaster recovery","outsourcing","third party","supply chain"]),
    ("Transparency & Disclosure",["disclosure","publish","transparency","public statement","explainability"]),
    ("Consumer Rights",["withdrawal","cooling-off","returns","refunds","fairness","terms and conditions"])
]

# (regulatory theme label, trigger keywords) pairs, same shape as SECTOR_RULES.
# NOTE(review): consumer not visible in this chunk — confirm matching semantics.
THEME_RULES=[
    ("Privacy & Data Protection",["gdpr","data protection","personal data","privacy","dpia","data subject rights","controller","processor","lawful basis","purpose limitation","data minimisation"]),
    ("Financial Conduct & Prudential",["market abuse","mifid","crr","crd","solvency","basel","capital requirements","liquidity","governance","stress test","prudential"]),
    ("Consumer Protection",["consumer","fairness","withdrawal","cooling-off","distance selling","complaint handling","dark patterns","information duties"]),
    ("Cybersecurity & Resilience",["nis","nis2","cybersecurity","incident response","business continuity","resilience","dora","operational resilience","critical infrastructure"]),
    ("AI & Automated Decisioning",["ai act","ai system","algorithmic","automated decision","training data","testing","validation","human oversight"]),
    ("Transparency & Reporting",["disclosure","report","publish","transparency","public statement","explainability","auditability"]),
    ("Sanctions & Financial Crime",["sanction","aml","ctf","terrorist financing","proliferation financing","screening","beneficial owner"]),
    ("Digital Markets & Platforms",["gatekeeper","platform","interoperability","self-preferencing","ranking"]),
    ("Data Governance & Sharing",["data governance","data sharing","data altruism","data intermediaries"])
]

# (impact type label, trigger phrases) pairs; phrases are given in English, Chinese and
# Japanese. "may" (Permission) is a substring of "may not" (Prohibition), and "sanction",
# "fine", "report" also occur in other tables.
# NOTE(review): consumer not visible in this chunk — confirm how overlapping hits are resolved.
IMPACT_RULES=[
    ("Obligation",["shall","must","required","obliged","duty","应当","必须","しなければならない","必要がある"]),
    ("Prohibition",["shall not","may not","prohibited","forbidden","不得","禁止","してはならない","禁ずる"]),
    ("Permission",["may","is permitted","allowed","可以","ことができる","許可"]),
    ("Enforcement",["penalty","fine","sanction","offence","offense","罰則","罚款","处罚","処罰"]),
    ("Reporting",["report","notify","notification","disclosure","報告","备案","届出"])
]

# Regexes applied in sequence by _prune_operative to strip non-operative text.

# Whole lines (MULTILINE) that open with a citation/recital formula such as "Having regard",
# "Whereas" or French "Vu". Because "in accordance with" and "pursuant to" are in the list,
# operative lines that merely begin with those words are removed as well.
PARLIAMENTARY=re.compile(r"^\s*(having regard|after transmission|after consulting|in accordance with|whereas|pursuant to|considering|vu(?:\s+la|(?:x|es)?)?)\b.*$",re.IGNORECASE|re.MULTILINE)
# From a recital keyword up to (not including) the first "Article 1"-style heading or an
# annex keyword. Compiled without MULTILINE, so `^` in the lookahead only matches at the very
# start of the string and headings are effectively found via the `\n` alternative.
# NOTE(review): in the lookahead the alternatives `annexe|appendix|appendice` are bare (only
# `\bannex` and `schedule\b` carry a word boundary) because `|` binds looser than `\b` —
# confirm this is intended.
RECITALS=re.compile(r"\b(whereas|considérant(?:\s+que)?|vu(?:\s+la|(?:x|es)?)?)\b.*?(?=(^|\n)\s*(article|art\.?|dispositif|chapitre|titre)\s*[^\n]*\b(1|premier|1er)\b|\bannex|annexe|appendix|appendice|schedule\b)",re.IGNORECASE|re.DOTALL)
# Signature block: with DOTALL, `.*$` consumes everything from the first occurrence of
# "Done at <place> <date>", "For the European Parliament" or "For the Council" to the end of
# the text. NOTE(review): these phrases are not anchored to a line start, so an occurrence in
# the body would truncate the document there — verify on real inputs.
TAIL=re.compile(r"(done at\s+[A-Za-z]+\s+\d{1,2}\s+[A-Za-z]+\s+\d{4}.*$|for the european parliament.*$|for the council.*$)",re.IGNORECASE|re.DOTALL)

# Heading patterns (English/French) that mark the start of an article-like unit. Every
# pattern begins with `(^|\n)\s*`, i.e. the match includes the preceding newline and any
# indentation, not just the heading text.
HEAD_PATTERNS=[
    r"(^|\n)\s*Article\s+(premier|1er|first|one|i|\d+[A-Za-z]?(?:\s*(?:bis|ter|quater))?)\b",
    r"(^|\n)\s*Art\.\s*\d+[A-Za-z]?\b",
    r"(^|\n)\s*Section\s+\d+[A-Za-z.-]*\b",
    r"(^|\n)\s*Sec\.\s*\d+[A-Za-z.-]*\b",
    r"(^|\n)\s*Subtitle\s+[A-Z]+\b",
    r"(^|\n)\s*Chapter\s+([IVXLCDM]+|\d+)\b",
    r"(^|\n)\s*Article\s+\d+\b",
    r"(^|\n)\s*Chapitre\s+([IVXLCDM]+|\d+)\b",
    r"(^|\n)\s*Titre\s+([IVXLCDM]+|\d+)\b",
    r"(^|\n)\s*Article\s+\d+\s*-\s*"
]
# Japanese/Chinese equivalents: "第N条" (article), "第N章" (chapter), "第N節" (section), with
# N written in kanji numerals or digits. The trailing `\b` requires the next character to be
# a non-word character (CJK ideographs count as word characters in Python's `re`).
CJK_HEAD_PATTERNS=[
    r"(^|\n)\s*第[一二三四五六七八九十百千\d]+条\b",
    r"(^|\n)\s*第[一二三四五六七八九十百千\d]+章\b",
    r"(^|\n)\s*第[一二三四五六七八九十百千\d]+節\b",
    r"(^|\n)\s*章\s*名",
    r"(^|\n)\s*条\s*文"
]
# Single alternation used by _split_regex. Compiled with IGNORECASE only (no MULTILINE), so
# `^` matches solely at the start of the text; all later headings match through `\n`.
ART_SPLIT=re.compile("|".join(HEAD_PATTERNS+CJK_HEAD_PATTERNS),re.IGNORECASE)
# Annex/appendix/schedule keywords in several languages; _is_article rejects any text in
# which this pattern is found.
ANNEX_HEAD=re.compile(r"\b(Annex|Annexe|Appendix|Appendice|Schedule|附則|附录|附件|附錄)\b",re.IGNORECASE)
# Western date forms: "12 March 2024", ISO "2024-03-12", and numeric "12/03/2024" or
# "12.03.24". The pattern has two groups, so findall() yields tuples whose first item is the
# full date string.
DATE_RX=re.compile(r"(\b\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{4}\b|\b\d{4}-\d{1,2}-\d{1,2}\b|\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b)",re.IGNORECASE)
# Gregorian CJK date "YYYY年M月D日"; groups are year, month, day.
CJK_DATE_RX=re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日")

# (era name, Gregorian year of that era's year 1); _parse_jp_era computes
# Gregorian year = base + era_year - 1.
JP_ERA=[("令和",2019),("平成",1989),("昭和",1926),("大正",1912),("明治",1868)]

# Regex fragments for Japanese statute phrases built around "施行" (entry into force).
# NOTE(review): not referenced in the visible part of the file; presumably used to locate
# effective-date clauses — confirm against the consumer.
JP_EFFECTIVE_PATTERNS=[
    r"この法律は、?公布の日から施行する",
    r"施行期日",
    r"公布の日から起算して[^\n。]{0,20}施行する",
    r"この法律は[^\n。]{0,40}施行する",
    r"施行[^\n。]{0,20}日は[^\n。]+"
]

def _strip_japanese_structural_tokens(s: str) -> str:
    """Blank out Japanese structural markers and return the whitespace-normalised remainder.

    The single characters 第/章/条/項/号/節/款 and every label in ``STRUCTURAL_LABEL_JA`` are
    replaced by a space; runs of whitespace are then collapsed and the ends trimmed.
    ``None`` is treated as an empty string.
    """
    cleaned = re.sub(r"[第章条項号節款]", " ", (s or "").strip())
    for label in STRUCTURAL_LABEL_JA:
        cleaned = cleaned.replace(label, " ")
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

def _resolve_input_dir():
    """Return the first path in ``INPUT_DIRS`` that exists, else create and return ``./directives``."""
    existing = next((Path(entry) for entry in INPUT_DIRS if Path(entry).exists()), None)
    if existing is not None:
        return existing
    # None of the configured locations exist: fall back to a freshly created default folder.
    fallback = Path("./directives")
    fallback.mkdir(parents=True, exist_ok=True)
    return fallback

def _canon_name(p: Path):
    stem=p.stem
    stem=re.sub(r"(?i)(^|[-_\s])(checkpoint|copy|copie|copiar)$","",stem).strip()
    stem=re.sub(r"(?i)[-_]checkpoint","",stem)
    stem=re.sub(r"\s*\(\d+\)$","",stem)
    stem=re.sub(r"\s+"," ",stem)
    return (stem.lower(), p.suffix.lower())

def _is_checkpoint(p: Path):
    name=p.name.lower()
    if ("-checkpoint" in name or name.endswith("_checkpoint.html") or "checkpoint.html" in name or name.endswith(".ipynb") or p.parent.name.lower()==".ipynb_checkpoints" or ".ipynb_checkpoints" in str(p.parent).lower()):
        return True
    return False

def _list_files(root,recursive=False):
    """List the input files under ``root``, one per canonical name, sorted by file name.

    A file is eligible when it is a regular file with an extension in ``ALLOWED_EXT``, is not
    a checkpoint artefact, and its name neither starts with "." nor ends with "~". Files that
    share a ``_canon_name`` key are reduced to one: a non-checkpoint beats a checkpoint,
    otherwise the most recently modified wins. The selection is printed before returning.
    """
    def _eligible(path):
        if _is_checkpoint(path) or path.parent.name == ".ipynb_checkpoints":
            return False
        if not path.is_file():
            return False
        if path.suffix.lower() not in ALLOWED_EXT:
            return False
        fname = path.name
        return not (fname.startswith(".") or fname.startswith("._") or fname.endswith("~"))

    walker = root.rglob("*") if recursive else root.iterdir()
    candidates = [path for path in walker if _eligible(path)]

    chosen = {}
    for path in candidates:
        key = _canon_name(path)
        current = chosen.get(key)
        if current is None:
            chosen[key] = path
            continue
        current_is_ckpt = _is_checkpoint(current)
        path_is_ckpt = _is_checkpoint(path)
        if current_is_ckpt and not path_is_ckpt:
            chosen[key] = path
        elif current_is_ckpt == path_is_ckpt and path.stat().st_mtime > current.stat().st_mtime:
            # Same kind of file: prefer the newer one.
            chosen[key] = path

    files = sorted(chosen.values(), key=lambda item: item.name.lower())
    print(f"[INPUT] {len(files)} files in {root} (recursive={recursive})")
    for item in files:
        print(" -",item.name)
    return files

def _save_per_input(df,input_path):
    """Write one input's results to ``OUT_DIR`` as ``<stem>.csv`` and, best effort, ``<stem>.parquet``.

    Args:
        df: DataFrame-like object exposing ``to_csv`` and ``to_parquet``.
        input_path: Path of the source document; only its ``stem`` is used for naming.

    The CSV write is mandatory and propagates errors. The Parquet write stays optional (it
    depends on an optional pandas engine), but a failure is now reported instead of being
    swallowed silently, so a missing ``.parquet`` file is explainable from the cell output.
    """
    Path(OUT_DIR).mkdir(parents=True,exist_ok=True)
    stem=input_path.stem
    csv_path=f"{OUT_DIR}/{stem}.csv"
    pq_path=f"{OUT_DIR}/{stem}.parquet"
    df.to_csv(csv_path,index=False)
    try:
        df.to_parquet(pq_path,index=False)
    except Exception as exc:
        # Keep going: the CSV is the primary artefact, Parquet is a convenience copy.
        print(f"[WARN] parquet export skipped for {pq_path}: {exc}")
    print(f"[SAVE] {csv_path}")

def _save_all(df):
    """Write the combined results to ``OUT_DIR`` as CSV and, best effort, Parquet.

    Args:
        df: DataFrame-like object exposing ``to_csv`` and ``to_parquet``.

    Output files are ``Regulatory_Article_Extraction_ALL.csv`` / ``.parquet``. The CSV write
    propagates errors; a Parquet failure (it depends on an optional pandas engine) is
    reported with a warning rather than swallowed silently.
    """
    Path(OUT_DIR).mkdir(parents=True,exist_ok=True)
    csv_path=f"{OUT_DIR}/Regulatory_Article_Extraction_ALL.csv"
    pq_path=f"{OUT_DIR}/Regulatory_Article_Extraction_ALL.parquet"
    df.to_csv(csv_path,index=False)
    try:
        df.to_parquet(pq_path,index=False)
    except Exception as exc:
        # Keep going: the CSV is the primary artefact, Parquet is a convenience copy.
        print(f"[WARN] parquet export skipped for {pq_path}: {exc}")
    print(f"[SAVE] {csv_path}")

def _read(path):
    return Path(path).read_text(encoding="utf-8",errors="ignore")

def _extract_main_container(soup):
    targets=["#innerDocument","main#contentsLaw","#docHtml","article","#content","body"]
    for sel in targets:
        el=soup.select_one(sel)
        if el and len(el.get_text(strip=True))>200:
            return el
    return soup

def _html_to_text(raw):
    """Convert raw HTML/XML markup to plain text.

    Returns:
        ``(text, soup)`` where ``text`` is the text of the container chosen by
        ``_extract_main_container`` and ``soup`` is the full parsed document. The parsed
        tree is modified in place by this function (tags removed or rewritten), so the
        returned ``soup`` no longer mirrors the original markup.
    """
    # Prefer lxml; fall back to the built-in parser if constructing the lxml tree raises.
    try:
        soup=BeautifulSoup(raw,"lxml")
    except Exception:
        soup=BeautifulSoup(raw,"html.parser")
    main=_extract_main_container(soup)
    # Drop page chrome and non-content elements from the chosen container.
    for tag in main(["script","style","nav","header","footer","noscript","aside","form"]):
        try: tag.extract()
        except Exception: pass
    # Turn explicit line/rule breaks into newline text nodes.
    for br in main.find_all(["br","hr"]):
        br.replace_with("\n")
    # Flatten each list item to a single "- text" string; assigning .string replaces the
    # item's children, so nested markup inside the <li> is discarded.
    for li in main.find_all("li"):
        txt=li.get_text(" ",strip=True)
        li.string=("\n- "+txt+"\n") if txt else "\n"
    # Same flattening for headings and bold runs so each becomes one text node.
    for th in main.find_all(["h1","h2","h3","h4","h5","h6","strong","b"]):
        t=th.get_text(" ",strip=True)
        th.string=("\n"+t+"\n") if t else "\n"
    # Join text nodes with newlines, decode any remaining HTML entities, and cap blank-line
    # runs at one empty line.
    # NOTE(review): get_text(strip=True) strips each text node, so the "\n" padding added
    # above may not survive into the output — confirm whether that padding is still needed.
    text=html.unescape(main.get_text("\n",strip=True))
    text=re.sub(r"\n{3,}","\n\n",text)
    return text,soup

def _has_cjk(s):
    return bool(re.search(r"[\u3400-\u4dbf\u4e00-\u9fff\u3040-\u30ff]",s or ""))

def _lang_probs(text):
    """Return langdetect's language candidates for ``text``, or ``[]`` if detection fails.

    ``detect_langs`` is tried first. If it raises, a single ``detect`` result is wrapped in
    an ad-hoc object exposing ``lang`` and ``prob`` (fixed at 1.0) so callers can treat both
    outcomes alike.
    """
    try:
        return detect_langs(text)
    except Exception:
        pass
    try:
        single = detect(text)
    except Exception:
        return []
    return [type("LP",(object,),{"lang":single,"prob":1.0})()]

def _english_confidence(text):
    """Score in [0, 1] of how likely ``text`` is English; empty input scores 1.0.

    Computed on the first 8000 characters as
    ``0.75 * P_langdetect(en) + 0.25 * (0.6 * ascii_letter_ratio + 0.4 * stopword_ratio)``.
    """
    stripped = (text or "").strip()
    if not stripped:
        return 1.0
    sample = stripped[:8000]

    # Share of alphabetic characters that are plain A-Z / a-z.
    alpha_total = 0
    ascii_total = 0
    for ch in sample:
        if ch.isalpha():
            alpha_total += 1
            if ("A"<=ch<="Z") or ("a"<=ch<="z"):
                ascii_total += 1
    ascii_ratio = (ascii_total / max(1, alpha_total)) if alpha_total else 0.0

    # Share of whitespace-separated tokens that are English stop words.
    words = [w.strip(string.punctuation).lower() for w in re.split(r"\s+", sample) if w]
    stop_words = sk_text.ENGLISH_STOP_WORDS
    stop_ratio = sum(1 for w in words if w in stop_words) / max(1, len(words))

    # Highest probability langdetect assigns to English.
    en_prob = 0.0
    for candidate in _lang_probs(sample):
        if getattr(candidate,"lang","")=="en":
            en_prob = max(en_prob, float(getattr(candidate,"prob",0.0)))

    score = 0.75*en_prob+0.25*(0.6*ascii_ratio+0.4*stop_ratio)
    return max(0.0, min(1.0, score))

def _is_english(text):
    """Decide whether ``text`` is English; empty or whitespace-only text counts as English.

    Text containing CJK characters, and text shorter than 160 characters, is judged by
    langdetect's single best guess (a detection error means "not English"). Longer non-CJK
    text must reach an ``_english_confidence`` of at least 0.55.
    """
    if not text or len(text.strip())==0:
        return True
    if _has_cjk(text) or len(text)<160:
        try:
            return detect(text)=="en"
        except Exception:
            return False
    return _english_confidence(text)>=0.55

def _chunk_for_translate(t,limit=4200):
    out=[]; i=0; n=len(t)
    seps=["\n\n","。\n","。\n\n","；","；\n","；\n\n","，","。\n—\n","\n- "]
    while i<n:
        j=min(i+limit,n)
        k=-1
        for sep in seps:
            ks=t.rfind(sep,i,j)
            if ks>k: k=ks+len(sep)
        if k<i+200: k=j
        piece=t[i:k].strip(); i=k
        if piece: out.append(piece)
    return out

def _translate_piece_bedrock(piece,model_id,system_prompt=None):
    """Translate one text chunk into English with a Bedrock model; return "" on any failure.

    Args:
        piece: Source text to translate.
        model_id: Bedrock model identifier. The request/response schema is chosen from the
            id: "anthropic" -> Anthropic Messages, "nova" -> Amazon Nova ``messages-v1``,
            anything else -> Titan-style ``inputText``.
        system_prompt: Optional extra instruction. It is sent as the system prompt for
            Anthropic and Nova models and prepended to the prompt otherwise. ``None``
            (the default) sends exactly the built-in instruction.

    Returns:
        The translated text, or an empty string when the call or the parsing fails.

    The previous implementation sent the Titan-style body to every non-Anthropic model,
    including the default Nova fallback, and read a top-level ``outputText`` that neither
    Nova nor Titan returns, so that path always produced "".
    """
    is_anthropic="anthropic" in model_id
    is_nova=(not is_anthropic) and ("nova" in model_id)
    if is_anthropic:
        body={"anthropic_version":"bedrock-2023-05-31","max_tokens":4000,"temperature":0.0,"messages":[{"role":"user","content":[{"type":"text","text":"Translate this to precise legal English. Keep headings, numbering, dates, entities verbatim. No commentary.\n\n"+piece}]}]}
        if system_prompt:
            body["system"]=system_prompt
    else:
        prompt="Translate the following text into precise legal English. Preserve headings, numbering, and dates. No commentary.\n\n"+piece
        if is_nova:
            # Nova's native InvokeModel schema: content items carry only "text", and the
            # generation limits live under "inferenceConfig".
            body={"schemaVersion":"messages-v1","messages":[{"role":"user","content":[{"text":prompt}]}],"inferenceConfig":{"maxTokens":4000,"temperature":0.0}}
            if system_prompt:
                body["system"]=[{"text":system_prompt}]
        else:
            if system_prompt:
                prompt=system_prompt+"\n\n"+prompt
            body={"inputText":prompt,"textGenerationConfig":{"maxTokenCount":4000,"temperature":0.0}}
    try:
        resp=bedrock.invoke_model(modelId=model_id,body=json.dumps(body))
        data=json.loads(resp["body"].read())
        if is_anthropic:
            return data.get("content",[{}])[0].get("text","") or ""
        if is_nova:
            return data["output"]["message"]["content"][0].get("text","") or ""
        # Titan-style responses nest the text under results[0]; keep the legacy top-level
        # key as a fallback for other providers.
        results=data.get("results") or []
        if results:
            return results[0].get("outputText","") or ""
        return data.get("outputText","") or ""
    except Exception:
        # Best effort by design: callers treat "" as "this translator failed".
        return ""

def _force_english(text):
    """Translate ``text`` to English chunk by chunk, trying three translators in order.

    For each chunk from ``_chunk_for_translate`` the order is: Amazon Translate, then
    ``MODEL_TRANSLATE_FALLBACK_1``, then ``MODEL_TRANSLATE_FALLBACK_2``. The first two are
    accepted only if their output passes ``_is_english``. Output of the last one that is
    still not English is passed once more through fallback 1.

    Returns:
        The chunks joined with newlines. Falsy input and text already judged English are
        returned unchanged.

    A chunk that no translator could handle is kept in its original language rather than
    dropped, so the result never silently loses part of the document (previously such
    chunks vanished from the output). A caller can re-run the function on the result to
    retry them.
    """
    if not text: return text
    if _is_english(text): return text
    out=[]
    for piece in _chunk_for_translate(text,limit=4000):
        translated=""
        try:
            r=translate.translate_text(Text=piece,SourceLanguageCode="auto",TargetLanguageCode="en")
            cand=r.get("TranslatedText","") or ""
            if cand.strip() and _is_english(cand):
                translated=cand
        except Exception:
            # Amazon Translate failed for this chunk; fall through to the Bedrock models.
            pass
        if not translated:
            cand=_translate_piece_bedrock(piece,MODEL_TRANSLATE_FALLBACK_1) or ""
            if cand.strip() and _is_english(cand):
                translated=cand
        if not translated:
            cand=_translate_piece_bedrock(piece,MODEL_TRANSLATE_FALLBACK_2) or ""
            if cand.strip():
                if not _is_english(cand):
                    # One more pass through the first fallback; keep the earlier candidate
                    # if that pass yields nothing.
                    retry=_translate_piece_bedrock(cand,MODEL_TRANSLATE_FALLBACK_1) or ""
                    if retry.strip():
                        cand=retry
                translated=cand
        # Never lose content: fall back to the untranslated chunk.
        out.append(translated if translated.strip() else piece)
    final="\n".join([c for c in out if c]).strip()
    return final if final else text

def _translate(text):
    """Return ``text`` in English, translating when needed and retrying once if the first pass fails.

    The input is stripped first; empty text and text already judged English are returned
    as is (stripped).
    """
    stripped = (text or "").strip()
    if not stripped or _is_english(stripped):
        return stripped
    first_pass = _force_english(stripped)
    if _is_english(first_pass):
        return first_pass
    # First pass still not English: run the translation once more on its output.
    return _force_english(first_pass)

def _extract_main_title(soup, fallback):
    try:
        t=soup.title.get_text(strip=True) if soup and soup.title else ""
    except Exception:
        t=""
    h1=""
    try:
        h=soup.find(["h1","h2"])
        h1=h.get_text(" ",strip=True) if h else ""
    except Exception:
        h1=""
    cand=next((x for x in [h1,t,fallback] if x),fallback)
    return cand

def _extract_main_container_text(raw):
    """Thin wrapper around ``_html_to_text``: return its ``(text, soup)`` pair for ``raw``."""
    plain_text, parsed = _html_to_text(raw)
    return (plain_text, parsed)

def _prune_operative(text_en):
    """Strip non-operative parts from an English legal text and return the trimmed remainder.

    Applied in order: ``PARLIAMENTARY`` (citation lines), ``RECITALS``, ``TAIL`` (signature
    block), then a "Table of Contents" block running up to the first Article 1 heading.
    """
    pruned = text_en
    for pattern in (PARLIAMENTARY, RECITALS, TAIL):
        pruned = pattern.sub("", pruned)
    pruned = re.sub(
        r"\bTable of Contents\b.*?(?=(^|\n)\s*Article\s+(1|premier|1er)\b)",
        "",
        pruned,
        flags=re.IGNORECASE|re.DOTALL,
    )
    return pruned.strip()

def _split_regex(clean):
    """Split cleaned text into article blocks at every ``ART_SPLIT`` heading match.

    Returns:
        A list of ``{"marker": heading_line, "text": block_text}`` dicts. When no heading
        is found the whole text is returned as a single block marked "Article 1". Text
        before the first heading is not included in any block.

    The heading patterns start with ``(^|\\n)\\s*``, so a match normally begins on the
    newline and indentation in front of the heading. The marker is therefore read from the
    first non-whitespace character of the match; searching for the line end from the raw
    match start found that leading newline immediately and produced an empty marker for
    every heading not at offset 0.
    """
    matches=list(ART_SPLIT.finditer(clean))
    if not matches:
        return [{"marker":"Article 1","text":clean.strip()}]
    out=[]
    bounds=[m.start() for m in matches]+[len(clean)]
    for i,m in enumerate(matches):
        seg_start=bounds[i]; seg_end=bounds[i+1]
        matched=m.group(0)
        # Skip the newline/indentation captured at the front of the match.
        head_start=m.start()+(len(matched)-len(matched.lstrip()))
        line_end=clean.find("\n",head_start,seg_end)
        if line_end==-1: line_end=m.end()
        marker=clean[head_start:line_end].strip()
        body=clean[seg_start:seg_end].strip()
        if not body.lower().startswith(marker.lower()): body=marker+"\n"+body
        out.append({"marker":marker,"text":body})
    return out

def _extract_article_blocks_jp(soup):
    """Build article blocks from markup that tags article titles with ``_div_ArticleTitle``.

    NOTE(review): the ``_div_*`` class names look like Japanese e-Gov law markup — confirm.

    Returns:
        A list of ``{"marker": heading, "text": heading + body}`` dicts, one per article
        title element that yields non-empty text; ``[]`` when ``soup`` is falsy.
    """
    if not soup:
        return []
    results=[]
    titles=soup.select("._div_ArticleTitle")
    for title in titles:
        marker_text=title.get_text("\n",strip=True)
        caption=None
        # Use a caption only when the caption element is the sibling directly before the title.
        prev=title.find_previous_sibling(lambda tag: tag.name and "_div_ArticleCaption" in " ".join(tag.get("class", [])))
        if prev and prev.find_next_sibling() is title:
            caption=prev.get_text("\n",strip=True)
        heading=f"{marker_text} {caption}" if caption else marker_text
        body_parts=[heading]
        # Walk the following siblings until the next article or chapter title.
        cur=title.next_sibling
        while cur:
            # Text nodes have no tag name and are skipped.
            if getattr(cur,"name",None):
                cls=" ".join(cur.get("class",[]))
                if "_div_ArticleTitle" in cls or "_div_ChapterTitle" in cls:
                    break
                # Only paragraph/item sentences and captions contribute to the body.
                if any(k in cls for k in ["_div_ParagraphSentence","_div_ItemSentence","_div_ArticleCaption"]):
                    body_parts.append(cur.get_text("\n",strip=True))
            cur=cur.next_sibling
        text="\n".join([p for p in body_parts if p]).strip()
        if text:
            results.append({"marker":heading,"text":text})
    return results

def _split_from_soup_sections(soup):
    """Derive article blocks from the parsed markup instead of from plain text.

    Strategy, in order:
      1. If the document has ``_div_ArticleTitle`` elements and
         ``_extract_article_blocks_jp`` yields results, return those.
      2. Otherwise collect every ``<section>`` that mentions an article heading or is
         longer than 400 characters, plus parent-level text around ``_div_ArticleTitle`` /
         ``_div_ParagraphSentence`` elements.

    Returns:
        A list of ``{"marker", "text"}`` dicts (possibly empty). All errors are swallowed;
        whatever was collected before the error is returned.
    """
    try:
        if soup.select("._div_ArticleTitle"):
            jp=_extract_article_blocks_jp(soup)
            if jp:
                return jp
    except Exception:
        pass
    secs=[]
    try:
        for sec in soup.find_all(["section"],recursive=True):
            txt=sec.get_text("\n",strip=True)
            if txt and (re.search(r"(第[一二三四五六七八九十百千\d]+条|Article\s+\d+|Art\.\s*\d+|Article\s+(premier|1er))",txt,flags=re.IGNORECASE) or len(txt)>400):
                # Default marker, replaced by the first article heading found in the section.
                marker="Section"
                m=re.search(r"(第[一二三四五六七八九十百千\d]+条|Article\s+\d+|Art\.\s*\d+|Article\s+(premier|1er))",txt,flags=re.IGNORECASE)
                if m: marker=m.group(0)
                secs.append({"marker":marker,"text":txt})
        # NOTE(review): for each matching div this appends the text of ALL direct children
        # of its parent, so several divs under one parent append the same text repeatedly —
        # confirm that callers de-duplicate.
        for div in soup.select("._div_ArticleTitle, ._div_ParagraphSentence"):
            block=div.get_text("\n",strip=True)
            if block:
                parent=div.parent
                grab=[]
                for sib in parent.find_all(recursive=False):
                    grab.append(sib.get_text("\n",strip=True))
                txt="\n".join([g for g in grab if g]).strip()
                if txt and len(txt)>150:
                    m=re.search(r"(第[一二三四五六七八九十百千\d]+条)",txt)
                    marker=m.group(1) if m else "Article"
                    secs.append({"marker":marker,"text":txt})
    except Exception:
        pass
    return secs

def _ai_split(text_en):
    """Ask the ``MODEL_SPLIT`` Bedrock model to return the operative articles of a document.

    The document is truncated to ``MAX_DOC_CHARS``. The model's reply is parsed as a JSON
    array; only string items whose stripped length exceeds ``MIN_ART_LEN`` are kept.
    Any failure (call, response shape, JSON parsing) yields ``[]``.
    """
    instructions=("Extract ONLY operative Articles/Sections with exact headings + full bodies. Exclude preambles, recitals, annexes, signatures, tables of contents. Return a JSON array of strings; each string is ONE complete Article/Section starting with its heading line (e.g., 'Article 14 Human oversight').")
    user_text=instructions+"\n\nDOCUMENT:\n"+text_en[:MAX_DOC_CHARS]
    request={
        "anthropic_version":"bedrock-2023-05-31",
        "max_tokens":12000,
        "messages":[{"role":"user","content":[{"type":"text","text":user_text}]}],
        "temperature":0.0,
    }
    try:
        resp=bedrock.invoke_model(modelId=MODEL_SPLIT,body=json.dumps(request))
        payload=json.loads(resp["body"].read())
        articles=json.loads(payload["content"][0]["text"])
        kept=[]
        for art in articles:
            if isinstance(art,str) and len(art.strip())>MIN_ART_LEN:
                kept.append(art)
        return kept
    except Exception:
        return []

def _has_heading(text_en):
    return bool(re.search(r"(^|\n)\s*(article\s+(premier|1er|first|one|i|\d+)|art\.\s*\d+|section\s+\d+|sec\.\s*\d+|subtitle\s+[A-Z]+|chapter\s+([IVXLCDM]+|\d+)|第[一二三四五六七八九十百千\d]+条)\b",text_en,re.IGNORECASE))

def _is_article(text_en):
    """Heuristically decide whether a text block is an operative article.

    A block is rejected when:
      * ``ANNEX_HEAD`` matches anywhere in it (annex/appendix/schedule keywords),
      * it has no article-style heading (``_has_heading``),
      * it is shorter than ``MIN_ART_LEN`` and contains no normative marker, or
      * it has fewer than 30 tokens (Latin words or single CJK characters).

    Normative markers are now matched in two passes. The stems ``penalt``/``obligat``/
    ``require`` accept any word ending (they used to be followed directly by ``\\b`` and
    so could not match "penalty", "obligation" or "required"). CJK markers are matched
    without word boundaries, because CJK characters are word characters in Python's ``re``
    and ``\\b`` does not occur between them in running text.
    """
    if ANNEX_HEAD.search(text_en): return False
    b=text_en.strip()
    if not _has_heading(b): return False
    latin_markers=r"\b(shall|must|may\s+not|is\s+prohibited|is\s+required|are\s+required|duty|penalt\w*|fine|obligat\w*|require\w*)\b"
    cjk_markers=r"(してはならない|しなければならない|必要がある|禁止|罚|应当|必须)"
    has_marker=bool(re.search(latin_markers,b,re.IGNORECASE) or re.search(cjk_markers,b))
    if not has_marker and len(b)<MIN_ART_LEN:
        return False
    toks=re.findall(r"[A-Za-z%/_-]+|[\u3400-\u9fff\u3040-\u30ff]",b)
    if len(toks)<30: return False
    return True

def _to_ymd(s):
    """Parse a free-form date string (month-first when ambiguous) into "YYYY-MM-DD"; "" on failure."""
    try:
        parsed = parse_date(s, fuzzy=True, dayfirst=False)
    except Exception:
        return ""
    try:
        return parsed.date().isoformat()
    except Exception:
        return ""

def _parse_cjk_date(s):
    """Return the first "YYYY年M月D日" date in ``s`` as "YYYY-MM-DD", or "" if absent or invalid."""
    found = CJK_DATE_RX.search(s)
    if found is None:
        return ""
    year, month, day = (int(part) for part in found.groups())
    try:
        return date(year, month, day).isoformat()
    except Exception:
        # e.g. month 13 or day 32
        return ""

def _parse_jp_era(s):
    """Convert the first Japanese era date in ``s`` (e.g. "令和5年6月16日") to "YYYY-MM-DD".

    The era year is either "元" (first year) or digits; the Gregorian year is the era's
    base year from ``JP_ERA`` plus the era year minus one.

    Returns:
        The ISO date string, or "" when no era date is present or it is not a valid date.

    The year group is ``(元|\\d+)``. The previous character class ``[元\\d]+`` also accepted
    mixed tokens such as "元1", which then raised ``ValueError`` in ``int()`` outside the
    ``try`` block. ``None`` input is treated as empty text.
    """
    m=re.search(r"(令和|平成|昭和|大正|明治)\s*(元|\d+)\s*年\s*([0-9]{1,2})\s*月\s*([0-9]{1,2})\s*日",s or "")
    if not m: return ""
    era,year_txt=m.group(1),m.group(2)
    try:
        base=dict(JP_ERA)[era]
        era_year=1 if year_txt=="元" else int(year_txt)
        return date(base+era_year-1,int(m.group(3)),int(m.group(4))).isoformat()
    except Exception:
        return ""

def _doc_date(raw_text,soup,fname):
    """Best-effort document date as "YYYY-MM-DD"; always returns a date string.

    Sources are tried in this order, first hit wins:
      1. Japanese era date in the header elements, then in the first 120000 chars of text.
      2. "YYYY年M月D日" date in the header elements, then in the text.
      3. Western-style date (``DATE_RX``) in the header elements, then in the text.
      4. ``YYYY-MM-DD`` / ``YYYY_MM_DD`` in the file name.
      5. A bare 19xx/20xx year in the file name, reported as 30 June of that year.
      6. The file's modification time (UTC), else today's date (UTC).
    """
    cands=[]
    try:
        # Text of elements that commonly carry a title/date ("#lawTitleNo" is listed twice).
        cands.extend(el.get_text(" ",strip=True) for el in soup.select("#lawTitleNo,#lawTitle,.oj-hd-date,.oj-doc-ti,.date,.document-date,.pubdate,.issued,.enacted,.approved,#lawTitleNo"))
    except Exception:
        pass
    head=" ".join(cands)
    jp=_parse_jp_era(head) or _parse_jp_era(raw_text[:120000])
    if jp: return jp
    cjk=_parse_cjk_date(head) or _parse_cjk_date(raw_text[:120000])
    if cjk: return cjk
    # DATE_RX has two groups, so findall() returns tuples; m[0] is the full date string.
    for m in DATE_RX.findall(head):
        d=_to_ymd(m[0])
        if d: return d
    for m in DATE_RX.findall(raw_text[:120000]):
        d=_to_ymd(m[0])
        if d: return d
    m=re.search(r"(\d{4})[-_](\d{1,2})[-_](\d{1,2})",Path(fname).name)
    if m:
        try:
            return datetime(int(m.group(1)),int(m.group(2)),int(m.group(3))).date().isoformat()
        except Exception:
            pass
    m=re.search(r"\b(19|20)\d{2}\b",Path(fname).name)
    if m:
        try:
            # Only the year is known: use mid-year as a placeholder day.
            return datetime(int(m.group(0)),6,30).date().isoformat()
        except Exception:
            pass
    try:
        ts=Path(fname).stat().st_mtime
        return datetime.fromtimestamp(ts,tz=timezone.utc).date().isoformat()
    except Exception:
        return datetime.now(timezone.utc).date().isoformat()

def _ensure_ascii_lower(s: str) -> str:
    s=(s or "").strip()
    s=unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s=re.sub(r"[^A-Za-z0-9%/_\-\s]", "", s)
    s=re.sub(r"\s+", " ", s).strip().lower()
    return s

def _normalize_label_list(items):
    """Normalise labels to unique lower-case ASCII strings, keeping first-seen order.

    A string argument is split on commas first. Labels that normalise to "" and purely
    structural words (article, section, annex, ...) are dropped.
    """
    if isinstance(items,str):
        items = [part.strip() for part in items.split(",") if part.strip()]
    structural = {"article","section","chapter","directive","regulation","annex","annexe","appendix","title","subtitle"}
    ordered = {}
    for raw in (items or []):
        label = _ensure_ascii_lower(raw)
        if label and label not in ordered:
            ordered[label] = None
    return [label for label in ordered if label not in structural]

def _load_taxonomy():
    """Load the taxonomy JSON from ``TAXONOMY_PATH``.

    The four category keys (sector, activity, regulatory_theme, impact_type) are guaranteed
    to be present. A missing or unreadable file yields a taxonomy with four empty categories.
    """
    categories = ["sector","activity","regulatory_theme","impact_type"]
    try:
        path = Path(TAXONOMY_PATH)
        if path.exists():
            with open(path,"r",encoding="utf-8") as handle:
                loaded = json.load(handle)
            for name in categories:
                loaded.setdefault(name,{})
            return loaded
    except Exception:
        pass
    return {"sector":{}, "activity":{}, "regulatory_theme":{}, "impact_type":{}}

def _save_taxonomy(tax):
    """Write the taxonomy dict to ``TAXONOMY_PATH`` as indented UTF-8 JSON.

    ``TAXONOMY_PATH`` can be overridden to live outside ``OUT_DIR``, so the target file's
    own parent directory is created as well. Creating only ``OUT_DIR`` made the write fail
    for such overrides.
    """
    target=Path(TAXONOMY_PATH)
    Path(OUT_DIR).mkdir(parents=True,exist_ok=True)
    target.parent.mkdir(parents=True,exist_ok=True)
    with open(target,"w",encoding="utf-8") as f:
        json.dump(tax, f, ensure_ascii=False, indent=2)

def _tokenize_terms(label: str):
    toks=[t for t in re.split(r"[^A-Za-z0-9%/_-]+", label) if t]
    out=set()
    for t in toks:
        if len(t) >= 3:
            out.add(t)
    for i in range(len(toks)-1):
        big=f"{toks[i]} {toks[i+1]}"
        if 3 <= len(big.replace(" ","")) <= 30:
            out.add(big)
    return sorted(out)

def _taxonomy_add_labels(tax, category: str, labels):
    """Register labels under ``tax[category]`` and return the same ``tax`` object.

    Each normalised label gets an entry ``{"terms": [...], "count": n}``: its terms are
    extended with the label's own tokens and its count is incremented by one.
    """
    bucket = tax.setdefault(category, {})
    for label in filter(None, _normalize_label_list(labels)):
        record = bucket.setdefault(label, {"terms":[], "count":0})
        known_terms = set(record.get("terms",[]))
        known_terms.update(_tokenize_terms(label))
        record["terms"] = sorted(known_terms)
        record["count"] = int(record.get("count",0)) + 1
        bucket[label] = record
    return tax

def _taxonomy_merge_model_keywords(tax, category: str, label: str, kws):
    """Add keyword-derived terms to ``tax[category][label]`` and return ``tax``.

    The label is normalised first (an empty result leaves ``tax`` untouched). The entry is
    created if missing; its ``count`` is not changed by this function.
    """
    label = _ensure_ascii_lower(label)
    if not label:
        return tax
    bucket = tax.setdefault(category,{})
    record = bucket.setdefault(label, {"terms":[], "count":0})
    merged = set(record.get("terms",[]))
    for keyword in _normalize_label_list(kws):
        merged.update(_tokenize_terms(keyword))
    record["terms"] = sorted(merged)
    bucket[label] = record
    return tax

def _guess_from_taxonomy(text_en, tax, category: str):
    """Return the labels of ``tax[category]`` that have at least one term present in the text.

    The text is normalised with ``_ensure_ascii_lower``; a term counts only when it is not
    flanked by letters, digits or ``_ / % -``. Labels are ordered by descending ``count``,
    ties keeping taxonomy order.
    """
    haystack = " "+_ensure_ascii_lower(text_en)+" "
    bucket = tax.get(category,{})

    def _mentions(term):
        pattern = r"(?<![A-Za-z0-9_/%-])"+re.escape(term)+r"(?![A-Za-z0-9_/%-])"
        return re.search(pattern, haystack) is not None

    matched = [
        label
        for label, meta in bucket.items()
        if any(_mentions(term) for term in meta.get("terms",[]))
    ]
    matched = list(dict.fromkeys(matched))
    matched.sort(key=lambda label: -int(bucket.get(label,{}).get("count",0)))
    return matched

def _extract_doc_fields(doc_title, doc_text_en):
    """Ask the document-level Bedrock model for metadata about a whole document.

    The request goes through the Bedrock Converse API, which uses one
    request/response shape for every model family. (The previous hand-built
    ``invoke_model`` body was Anthropic-shaped without ``anthropic_version``
    and could not be parsed for the default Amazon Nova model, so the call
    always fell through to the empty result.)

    Args:
        doc_title: Document title; ``None`` is treated as empty.
        doc_text_en: English document text; ``None`` is treated as empty.
            Only the first ``MAX_DOC_CHARS`` characters are sent.

    Returns:
        Dict with string values for ``jurisdiction``, ``sector``, ``activity``,
        ``regulatory_theme``, ``impact_type``, ``regulator``,
        ``company_country`` (lists are comma-joined) and
        ``default_effective_date``. All values are empty strings when the call
        or the JSON parsing fails (best effort; a warning is printed).
    """
    empty={"jurisdiction":"","sector":"","activity":"","regulatory_theme":"","impact_type":"","regulator":"","company_country":"","default_effective_date":""}
    # Guard None before slicing; the old `x[:n] or ""` sliced first.
    prompt=("Extract a single JSON object with keys: jurisdiction (string), regulator (list), sector (list), activity (list), regulatory_theme (list), impact_type (list), company_country (list), default_effective_date (YYYY-MM-DD or empty). Infer if implicit.\n\nTITLE:\n"+(doc_title or "")+"\n\nDOCUMENT:\n"+(doc_text_en or "")[:MAX_DOC_CHARS])
    try:
        resp=bedrock.converse(
            modelId=MODEL_EXTRACT_DOC,
            messages=[{"role":"user","content":[{"text":prompt}]}],
            inferenceConfig={"maxTokens":6000,"temperature":0.0},
        )
        parts=resp["output"]["message"]["content"]
        text_out="".join(p.get("text","") for p in parts if isinstance(p,dict))
        # Models often wrap the object in prose or code fences; keep only the
        # outermost {...} span before parsing.
        start,end=text_out.find("{"),text_out.rfind("}")
        data=json.loads(text_out[start:end+1]) if 0<=start<end else {}
        if not isinstance(data,dict):
            data={}
        def join(v): return ", ".join([str(x).strip() for x in (v or []) if str(x).strip()]) if isinstance(v,list) else str(v or "").strip()
        return {
            "jurisdiction":str(data.get("jurisdiction") or "").strip(),
            "sector":join(data.get("sector",[])),
            "activity":join(data.get("activity",[])),
            "regulatory_theme":join(data.get("regulatory_theme",[])),
            "impact_type":join(data.get("impact_type",[])),
            "regulator":join(data.get("regulator",[])),
            "company_country":join(data.get("company_country",[])),
            "default_effective_date":str(data.get("default_effective_date") or "").strip()
        }
    except Exception as exc:
        # Best effort: callers fall back to heuristics, but say why.
        print(f"[WARN] document-level field extraction failed: {exc}")
        return empty

def _clean_tokens(tokens):
    """Normalise tokens and drop the uninformative ones.

    Each token is ASCII-lowercased; it is kept unless it is empty, an English
    stopword, a plain number (integer or decimal) or at most 2 characters long.

    Returns:
        Kept tokens in input order (duplicates preserved).
    """
    def _keep(word):
        return (
            bool(word)
            and word not in STOPWORDS_EN
            and not re.fullmatch(r"\d+(\.\d+)?", word)
            and len(word) > 2
        )

    normalised = (_ensure_ascii_lower(raw) for raw in tokens)
    return [word for word in normalised if _keep(word)]

def _keywords_en(text_en,max_k=20):
    """Rank unigrams and bigrams of one English text by TF-IDF weight.

    The vectoriser is fitted on this single text only. Terms are taken in
    descending weight, ASCII-lowercased, and skipped when they are structural
    legal words, stopwords, plain numbers or at most 2 characters long.

    Returns:
        Up to ``max_k`` unique keywords, or ``[]`` if vectorisation fails
        (for example on text with no usable tokens).
    """
    structural={"article","articles","annex","section","chapter","law","act","directive","regulation","regulations","union","paragraph","recital","subparagraph","subtitle","title"}
    try:
        vectorizer=TfidfVectorizer(stop_words=list(STOPWORDS_EN),ngram_range=(1,2),min_df=1,token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z%/_-]+\b")
        weights=vectorizer.fit_transform([text_en]).toarray()[0]
        vocabulary=vectorizer.get_feature_names_out()
        ranked=sorted(zip(vocabulary,weights),key=lambda pair:-pair[1])
        chosen=[]
        for term,_weight in ranked:
            term=_ensure_ascii_lower(term)
            rejected=(
                term in structural
                or term in STOPWORDS_EN
                or re.fullmatch(r"\d+(\.\d+)?",term)
                or len(term)<=2
            )
            if rejected:
                continue
            if term not in chosen:
                chosen.append(term)
            if len(chosen)>=max_k:
                break
        return chosen
    except Exception:
        return []

def _keywords_cjk(raw,max_k=20):
    txt=re.sub(r"\s+","",raw)
    grams={}
    for n in (2,3):
        for i in range(len(txt)-n+1):
            g=txt[i:i+n]
            if re.search(r"[\u3400-\u9fff\u3040-\u30ff]",g) and not re.search(r"^[\d\W_]+$",g):
                grams[g]=grams.get(g,0)+1
    ordered=sorted(grams.items(),key=lambda x:-x[1])
    return [k for k,_ in ordered[:max_k]]

def _translate_list_to_english(items):
    """Translate a list of short strings to English in a single request.

    The non-empty items are joined with ", ", translated as one text and split
    again on commas/semicolons. Translators are tried in order: Amazon
    Translate, then the two Bedrock fallback models. The first one yielding at
    least one cleaned part wins; if none does, the original joined text is
    cleaned and returned instead.

    Cleaning strips accents and every character outside ``A-Za-z0-9%/_-`` and
    whitespace, collapses whitespace and de-duplicates case-insensitively.

    Returns:
        List of cleaned ASCII strings (possibly empty).
    """
    texts=[]
    for item in (items or []):
        as_text=str(item).strip()
        if as_text:
            texts.append(as_text)
    if not texts:
        return []
    joined=", ".join(texts)

    def _split_clean(s):
        cleaned=[]
        seen_lower=set()
        for part in re.split(r"[;,]", s):
            part=part.strip()
            if not part:
                continue
            part=unicodedata.normalize("NFKD", part).encode("ascii","ignore").decode("ascii")
            part=re.sub(r"[^A-Za-z0-9%/_\-\s]", "", part)
            part=re.sub(r"\s+"," ",part).strip()
            key=part.lower()
            if part and key not in seen_lower:
                seen_lower.add(key)
                cleaned.append(part)
        return cleaned

    def _via_amazon_translate():
        # Any failure here just means "no candidate"; the next translator runs.
        try:
            resp=translate.translate_text(Text=joined, SourceLanguageCode="auto", TargetLanguageCode="en")
            return (resp.get("TranslatedText") or "").strip()
        except Exception:
            return ""

    attempts=(
        _via_amazon_translate,
        lambda: _translate_piece_bedrock(joined, MODEL_TRANSLATE_FALLBACK_1),
        lambda: _translate_piece_bedrock(joined, MODEL_TRANSLATE_FALLBACK_2),
    )
    for attempt in attempts:
        candidate=attempt()
        if candidate and candidate.strip():
            result=_split_clean(candidate)
            if result:
                return result
    return _split_clean(joined)

def _ensure_keywords_english(kws):
    """Translate keywords to English and normalise them.

    After translation each keyword is reduced to ASCII, stripped of characters
    outside ``A-Za-z0-9%/_-`` and whitespace, whitespace-collapsed and
    lowercased. Empty strings, stopwords, plain numbers and keywords of at
    most 2 characters are dropped.

    Returns:
        Unique normalised keywords in first-seen order.
    """
    accepted=[]
    for word in _translate_list_to_english(kws):
        norm=unicodedata.normalize("NFKD", word).encode("ascii","ignore").decode("ascii")
        norm=re.sub(r"[^A-Za-z0-9%/_\-\s]", "", norm)
        norm=re.sub(r"\s+"," ", norm).strip().lower()
        usable=(
            bool(norm)
            and norm not in STOPWORDS_EN
            and not re.fullmatch(r"\d+(\.\d+)?",norm)
            and len(norm)>2
        )
        if usable and norm not in accepted:
            accepted.append(norm)
    return accepted

def _extract_fields(article_en):
    """Ask the article-level Bedrock model for structured fields of one article.

    The request goes through the Bedrock Converse API, which uses one
    request/response shape for every model family. (The previous hand-built
    ``invoke_model`` body was Anthropic-shaped without ``anthropic_version``
    and could not be parsed for the default Amazon Nova model, so the call
    always fell through to the empty result.)

    Args:
        article_en: English article text; ``None`` is treated as empty. Only
            the first ``MAX_ART_CHARS`` characters are sent.

    Returns:
        Dict with ``jurisdiction`` and ``effective_date`` as strings and
        ``sector``, ``activity``, ``regulatory_theme``, ``impact_type``,
        ``regulator``, ``keywords``, ``company_country`` as lists of strings.
        Everything is empty when the call or the JSON parsing fails (best
        effort; a warning is printed).
    """
    empty={"jurisdiction":"","sector":[],"activity":[],"regulatory_theme":[],"impact_type":[],"effective_date":"","regulator":[],"keywords":[],"company_country":[]}
    # Guard None before slicing; the old `x[:n] or ""` sliced first.
    prompt=("Extract JSON with keys: jurisdiction (string), sector (list), activity (list), regulatory_theme (list), impact_type (list), effective_date (YYYY-MM-DD or empty string), regulator (list), keywords (list of 5-12), company_country (list). Use only THIS article's text.\n\nARTICLE:\n"+(article_en or "")[:MAX_ART_CHARS])
    try:
        resp=bedrock.converse(
            modelId=MODEL_EXTRACT,
            messages=[{"role":"user","content":[{"text":prompt}]}],
            inferenceConfig={"maxTokens":6000,"temperature":0.0},
        )
        parts=resp["output"]["message"]["content"]
        text="".join(p.get("text","") for p in parts if isinstance(p,dict))
        # Models often wrap the object in prose or code fences; keep only the
        # outermost {...} span before parsing.
        start,end=text.find("{"),text.rfind("}")
        data=json.loads(text[start:end+1]) if 0<=start<end else {}
        if not isinstance(data,dict):
            data={}
        def norm_list(v):
            # Accept either a JSON list or a comma-separated string.
            if isinstance(v,list):
                return [str(x).strip() for x in v if str(x).strip()]
            s=str(v or "").strip()
            return [x.strip() for x in s.split(",") if x.strip()] if s else []
        return {
            "jurisdiction":str(data.get("jurisdiction") or "").strip(),
            "sector":norm_list(data.get("sector",[])),
            "activity":norm_list(data.get("activity",[])),
            "regulatory_theme":norm_list(data.get("regulatory_theme",[])),
            "impact_type":norm_list(data.get("impact_type",[])),
            "effective_date":str(data.get("effective_date") or "").strip(),
            "regulator":norm_list(data.get("regulator",[])),
            "keywords":norm_list(data.get("keywords",[])),
            "company_country":norm_list(data.get("company_country",[])),
        }
    except Exception as exc:
        # Best effort: callers fall back to heuristics, but say why.
        print(f"[WARN] article field extraction failed: {exc}")
        return empty

def _guess_jurisdiction(title, body):
    """Heuristically guess the jurisdiction of a document from its text.

    Checks, in order: substring hints from ``JURIS_HINTS`` in the lowercased
    title+body; CJK marker phrases for China and Japan; a fixed list of
    English country names.

    Args:
        title: Document title; ``None`` is treated as empty.
        body: Document or article text; ``None`` is treated as empty.

    Returns:
        A jurisdiction name, or ``"Global"`` when nothing matches.
    """
    title=title or ""
    body=body or ""
    combined=title+body
    s=(title+" "+body).lower()
    for k,v in JURIS_HINTS.items():
        if k in s:
            return v
    if _has_cjk(combined):
        # China first: "法律" is ordinary Chinese as well as Japanese, so
        # testing the Japanese markers first labelled Chinese laws as Japan.
        # The simplified-character markers below do not occur in Japanese.
        if "中华人民共和国" in combined or "国务院" in combined: return "China"
        if "内閣" in combined or "法律" in combined: return "Japan"
    m=re.search(r"\b(united states|united kingdom|canada|germany|france|italy|spain|netherlands|switzerland|australia|singapore|hong kong|japan|china)\b",s)
    if m:
        return m.group(1).title()
    return "Global"

def _guess_regulators(text_en):
    """List the regulators named in the text.

    Every entry of ``REGULATOR_HINTS`` that occurs case-insensitively in the
    text is returned. With no hit, a generic supervisory authority is reported
    when the text mentions "supervis" or "competent authority".

    Returns:
        Unique regulator names in hint order, or ``["General Regulator"]``.
    """
    lowered=text_en.lower()
    matches=[name for name in REGULATOR_HINTS if name.lower() in lowered]
    if not matches and ("supervis" in lowered or "competent authority" in lowered):
        matches.append("Competent Supervisory Authority")
    unique=list(dict.fromkeys(matches))
    return unique or ["General Regulator"]

def _guess_effective_date_jp(article_text, doc_date):
    """Find an effective date near a Japanese effective-date phrase.

    Only the first 3000 characters are searched. For each pattern in
    ``JP_EFFECTIVE_PATTERNS`` that matches, a window of 50 characters on both
    sides of the match is parsed as an era date, then a ``YYYY年M月D日`` date,
    then a Western date via ``DATE_RX``.

    Args:
        article_text: Raw article text.
        doc_date: Unused; kept for signature compatibility.

    Returns:
        The first date found, or ``""``.
    """
    lead=article_text[:3000]
    for pattern in JP_EFFECTIVE_PATTERNS:
        hit=re.search(pattern, lead)
        if hit is None:
            continue
        window_start=max(0,hit.start()-50)
        context=lead[window_start:hit.end()+50]
        found=_parse_jp_era(context) or _parse_cjk_date(context)
        if not found:
            western=DATE_RX.search(context)
            if western:
                found=_to_ymd(western.group(0))
        if found:
            return found
    return ""

def _guess_effective_date(article_en, doc_date):
    """Guess an article's effective date from its English text.

    Tried in order: the text after "applicable from" / "applicability from";
    the text after "enter(s/ed) into force on"; the first ``DATE_RX`` date in
    the first 8000 characters; the first ``YYYY年M月D日`` date in the same
    window.

    Returns:
        The first date that parses; otherwise ``doc_date`` if it is truthy;
        otherwise today's UTC date in ISO format.
    """
    phrase_patterns=(
        r"\bapplicab(le|ility)\s+from\s+([^\n.;]+)",
        r"\benter(s|ed)?\s+into\s+force\s+on\s+([^\n.;]+)",
    )
    for pattern in phrase_patterns:
        hit=re.search(pattern,article_en,flags=re.IGNORECASE)
        if hit:
            parsed=_to_ymd(hit.group(2))
            if parsed:
                return parsed
    window=article_en[:8000]
    first_date=DATE_RX.search(window)
    if first_date:
        parsed=_to_ymd(first_date[0])
        if parsed:
            return parsed
    cjk=_parse_cjk_date(window)
    if cjk:
        return cjk
    return doc_date or datetime.now(timezone.utc).date().isoformat()

def _guess_from_rules(text_en, rules):
    """Return the rule labels whose keywords occur in the text.

    Args:
        text_en: English text to scan.
        rules: Iterable of ``(label, keywords)`` pairs.

    A keyword matches when its lowercase form appears in the ASCII-lowercased
    text either surrounded by spaces or not directly adjacent to a character
    from ``A-Za-z0-9_/%-``.

    Returns:
        Unique ASCII-lowercased labels in rule order.
    """
    haystack=" "+_ensure_ascii_lower(text_en)+" "

    def _present(keyword):
        needle=keyword.lower()
        if f" {needle} " in haystack:
            return True
        return re.search(r"(?<![A-Za-z0-9_/%-])"+re.escape(needle)+r"(?![A-Za-z0-9_/%-])",haystack) is not None

    labels=[_ensure_ascii_lower(label) for label,keys in rules if any(_present(k) for k in keys)]
    return list(dict.fromkeys(labels))

def _merge_fill(fields,doc_date,article_en,doc_backfill=None, title=""):
    """Complete model-extracted article fields with fallbacks and update the taxonomy.

    Args:
        fields: Dict of model-extracted fields for one article (list or string values).
        doc_date: Document date passed on to ``_guess_effective_date``.
        article_en: English article text used by the heuristics.
        doc_backfill: Optional document-level fields; only ``jurisdiction`` and
            ``default_effective_date`` are read from it here.
        title: Document title used for the jurisdiction guess.

    Returns:
        A copy of ``fields`` in which the category, regulator and
        company_country values are comma-joined strings and jurisdiction and
        effective_date are filled.

    Side effects:
        Loads the taxonomy, records the final category labels in it and saves
        it again on every call.
    """
    tax=_load_taxonomy()
    out=dict(fields)
    title=title or ""
    body=article_en or ""
    # Jurisdiction: model value, then document-level value, then heuristic guess.
    if not (out.get("jurisdiction") or "").strip():
        out["jurisdiction"]=doc_backfill.get("jurisdiction","").strip() if doc_backfill else ""
        if not out["jurisdiction"]:
            out["jurisdiction"]=_guess_jurisdiction(title,body)
    # Normalise every multi-valued field to a de-duplicated, comma-joined string.
    for k in ["sector","activity","regulatory_theme","impact_type","regulator","company_country"]:
        v=out.get(k,[])
        if isinstance(v,str):
            v=[x.strip() for x in v.split(",") if x.strip()]
        if k in ["regulator","company_country"]:
            # Proper names: translated, then title-cased.
            v=_translate_list_to_english(v)
            v=[_ensure_ascii_lower(x).title() for x in v if _ensure_ascii_lower(x)]
        else:
            # Category labels: translated, lowercased and stopword-filtered.
            v=_ensure_keywords_english(v)
        out[k]=", ".join(dict.fromkeys(v))
    # Empty categories: taxonomy match first, then static rules, then a fixed default.
    for cat,rule_set in [("sector",SECTOR_RULES),("activity",ACTIVITY_RULES),("regulatory_theme",THEME_RULES),("impact_type",IMPACT_RULES)]:
        if not out.get(cat,"").strip():
            guesses=_guess_from_taxonomy(body, tax, cat)
            if not guesses:
                guesses=_guess_from_rules(body, rule_set)
            if guesses:
                out[cat]=", ".join(guesses[:5])
            else:
                out[cat]="Obligation" if cat=="impact_type" else "General"
    if not out.get("regulator","").strip():
        out["regulator"]=", ".join(_guess_regulators(body))
    if not out.get("company_country","").strip():
        # Both branches of this conditional yield the jurisdiction value.
        out["company_country"]=out["jurisdiction"] if out["jurisdiction"]!="Global" else "Global"
    # Effective date: model value, then document default, then text heuristics.
    eff=(out.get("effective_date","") or "").strip()
    if not eff and doc_backfill and (doc_backfill.get("default_effective_date","") or "").strip():
        eff=doc_backfill["default_effective_date"]
    if not eff:
        eff=_guess_effective_date(body,doc_date)
    out["effective_date"]=eff
    # Feed the final labels back into the taxonomy so later articles can match them.
    for cat in ["sector","activity","regulatory_theme","impact_type"]:
        labels=_normalize_label_list(out.get(cat,"").split(","))
        tax=_taxonomy_add_labels(tax, cat, labels)
        # Only when the model supplied this category as a list are its keywords
        # attached, and only to the first label.
        if cat in fields and isinstance(fields[cat], list):
            tax=_taxonomy_merge_model_keywords(tax, cat, labels[0] if labels else "", fields.get("keywords",[]))
    _save_taxonomy(tax)
    return out

def _normhash(t):
    x=re.sub(r"\s+"," ",(t or "")).strip().lower()
    x=re.sub(r"[^\w\s%/_-]+","",x)
    return hashlib.sha1(x.encode("utf-8")).hexdigest()

def _ensure_min_keywords(en_kws, raw_text, en_text, k=8):
    """Top a keyword list up to ``k`` entries, and cap it at ``k``.

    Args:
        en_kws: English keywords already collected (order is preserved).
        raw_text: Original-language text, used for CJK n-grams if needed.
        en_text: English text, used for TF-IDF keywords.
        k: Target size; the result never has more than ``k`` entries.

    Returns:
        De-duplicated keywords: ``en_kws`` first, then TF-IDF keywords of
        ``en_text``, then (for non-English CJK sources only) translated CJK
        n-grams, truncated to ``k``.
    """
    out=list(dict.fromkeys(en_kws))
    if len(out)>=k:
        return out[:k]
    more=_keywords_en(en_text, max_k=k*2)
    for w in more:
        if w not in out:
            out.append(w)
        if len(out)>=k:
            break
    if len(out)<k and _has_cjk(raw_text) and not _is_english(raw_text):
        cjk=_keywords_cjk(raw_text, max_k=k*2)
        # _ensure_keywords_english already translates its input; wrapping it
        # in another _translate_list_to_english call only paid for a second
        # translation round-trip on text that was already English.
        cjk_en=_ensure_keywords_english(cjk)
        for w in cjk_en:
            if w not in out:
                out.append(w)
            if len(out)>=k:
                break
    return out[:k]

def _split_candidates(en_text,soup):
    """Split a document into candidate articles.

    Strategies are tried in order and the first non-empty result wins:
    sections taken from the parsed HTML, the AI splitter on the English text,
    and finally the regex splitter.

    Returns:
        List of ``{"marker", "text"}`` dicts. For AI-split pieces the marker is
        the piece's first line, or ``"Article N"`` (1-based) when the piece
        has no line break.
    """
    sections=_split_from_soup_sections(soup)
    if sections:
        return sections
    pieces=_ai_split(en_text)
    if not pieces:
        return _split_regex(en_text)
    candidates=[]
    for number,piece in enumerate(pieces, start=1):
        if "\n" in piece:
            marker=piece.split("\n",1)[0].strip()
        else:
            marker=f"Article {number}"
        candidates.append({"marker": marker, "text": piece})
    return candidates

def _filter_jp_grams(grams):
    """Drop n-grams that are structural noise rather than content.

    A gram is removed when it contains any of the characters ``第章条項号節款``
    or is listed in ``STOPWORDS_JA``.

    Returns:
        Remaining grams in input order.
    """
    structural_chars="第章条項号節款"
    return [
        gram for gram in grams
        if not any(ch in structural_chars for ch in gram) and gram not in STOPWORDS_JA
    ]

def _keywords_pipeline(article_raw, article_en, model_kws):
    """Build the final English keyword list for one article.

    Model keywords come first, followed by statistical keywords: filtered CJK
    n-grams of the raw text when the source is non-English CJK, otherwise
    TF-IDF terms of the English text. The merged list is then passed through
    ``_ensure_min_keywords`` with ``k=10``.

    Returns:
        De-duplicated English keywords.
    """
    model_kws=_ensure_keywords_english(model_kws)
    if _has_cjk(article_raw) and not _is_english(article_raw):
        grams=_filter_jp_grams(_keywords_cjk(article_raw,20))
        # _ensure_keywords_english translates internally; the former extra
        # _translate_list_to_english wrapper caused a second, redundant
        # translation request for every article.
        tfidf=_ensure_keywords_english(grams)
    else:
        tfidf=_ensure_keywords_english(_keywords_en(article_en,20))
    merged=list(dict.fromkeys(w for w in model_kws+tfidf if w))
    merged=_ensure_min_keywords(merged, article_raw, article_en, k=10)
    # NOTE(review): _ensure_min_keywords truncates to k, so this cap of 20 can
    # never take effect and at most 10 keywords are returned. Confirm whether
    # 10 or 20 is the intended maximum before changing either side.
    return merged[:20]

def _post_ensure_english_fields(row):
    """Final pass that forces every text field of an output row into English.

    Args:
        row: Output row dict; it is modified in place.

    Returns:
        The same ``row``. ``sector``, ``activity`` and ``regulatory_theme``
        fall back to ``"general"``, ``impact_type`` to ``"obligation"``,
        ``regulator`` to ``"General Regulator"``, ``company_country`` to the
        jurisdiction and an empty ``jurisdiction`` to ``"Global"``.
    """
    def _parts(key):
        # Split a comma-joined field into trimmed, non-empty items.
        return [w.strip() for w in (row.get(key,"") or "").split(",") if w.strip()]

    row["article_text"]=_force_english(row.get("article_text","") or "")
    row["keywords"]=", ".join(_ensure_keywords_english(_parts("keywords")))

    juris_raw=(row.get("jurisdiction","") or "").strip()
    juris_en=_translate_list_to_english([juris_raw]) if juris_raw else []
    # The translation can come back empty (every translator failed and the
    # ASCII clean-up removed the text); indexing [0] unconditionally raised
    # IndexError. Keep the untranslated value rather than losing it.
    row["jurisdiction"]=juris_en[0] if juris_en else (juris_raw or "Global")

    # "general" is itself a stopword, so the keyword normaliser erases the
    # "General" default these columns receive upstream; restore it here.
    for cat in ("sector","activity","regulatory_theme"):
        row[cat]=", ".join(_ensure_keywords_english(_parts(cat))) or "general"
    row["impact_type"]=", ".join(_ensure_keywords_english(_parts("impact_type"))) or "obligation"
    row["regulator"]=", ".join(_translate_list_to_english(_parts("regulator"))) or "General Regulator"
    row["company_country"]=", ".join(_translate_list_to_english(_parts("company_country"))) or row["jurisdiction"]
    return row

def _process_file(path,start_id):
    """Convert one input document into per-article output rows.

    The document is read, translated to English, pruned and split into
    candidate articles. Each distinct article gets model-extracted fields,
    completed by ``_merge_fill``, plus a keyword list.

    Args:
        path: Path of the input file.
        start_id: ``article_id`` to assign to the first emitted row.

    Returns:
        ``(rows, next_id)``: the list of row dicts and the id following the
        last one used. ``rows`` is empty when no candidate passes
        ``_is_article``.
    """
    raw=_read(path)
    raw_txt,soup=_extract_main_container_text(raw)
    en_doc=_translate(raw_txt)
    if not _is_english(en_doc): en_doc=_force_english(en_doc)
    doc_date=_doc_date(raw_txt,soup,path)
    title_guess=_extract_main_title(soup, Path(path).stem)
    # Document-level fields serve as a backfill for articles lacking their own.
    doc_backfill=_extract_doc_fields(title_guess,en_doc)
    for k in ["sector","activity","regulatory_theme","impact_type","regulator","company_country","jurisdiction"]:
        v=doc_backfill.get(k,"")
        if isinstance(v,str):
            doc_backfill[k]=", ".join(_translate_list_to_english([w for w in v.split(",") if w.strip()]))
    if doc_backfill.get("jurisdiction",""):
        doc_backfill["jurisdiction"]=doc_backfill["jurisdiction"].strip()
    operative=_prune_operative(en_doc)
    articles=_split_candidates(operative,soup)
    seen=set(); rows=[]; cur=start_id
    for art in articles:
        raw_body=art["text"].strip()
        en_body=_translate(raw_body) if not _is_english(raw_body) else raw_body
        if not _is_article(en_body): continue
        # Skip articles whose normalised text was already emitted for this file.
        h=_normhash(en_body)
        if h in seen: continue
        seen.add(h)
        fields_raw=_extract_fields(en_body)
        # Remember whether the model itself supplied an article-level date.
        model_eff=str(fields_raw.get("effective_date") or "").strip()
        fields_raw["regulator"]=_translate_list_to_english(fields_raw.get("regulator",[]))
        fields_raw["keywords"]=_ensure_keywords_english(fields_raw.get("keywords",[]))
        fields=_merge_fill(fields_raw,doc_date,en_body,doc_backfill,title_guess)
        jp_eff=_guess_effective_date_jp(raw_body,doc_date) if _has_cjk(raw_body) else ""
        # _merge_fill always returns a non-empty effective_date (it falls back
        # to the document date or today), so testing the merged value left the
        # Japanese-specific date unused. Apply it whenever the model gave no
        # article-level date of its own.
        if jp_eff and not model_eff:
            fields["effective_date"]=jp_eff
        keywords_list=_keywords_pipeline(raw_body, en_body, fields_raw.get("keywords",[]))
        row={
            "article_id":cur,
            "jurisdiction":fields["jurisdiction"] or "Global",
            "sector":fields["sector"] or "General",
            "activity":fields["activity"] or "General",
            "regulatory_theme":fields["regulatory_theme"] or "General",
            "impact_type":fields["impact_type"] or "Obligation",
            "effective_date":fields["effective_date"],
            "regulator":fields["regulator"] or "General Regulator",
            "keywords":", ".join(keywords_list),
            "company_country":fields["company_country"] or fields["jurisdiction"],
            "_source_file":Path(path).name,
            "_article_marker":art["marker"],
            "article_text":en_body
        }
        row=_post_ensure_english_fields(row)
        rows.append(row)
        cur+=1
    return rows,cur

def process_all_documents():
    """Run the pipeline over every input file and collect all rows.

    Each file is processed with ``_process_file``. When that yields no
    articles, one document-level row (marker ``"Document"``) is built from the
    whole text instead. Per-file results are saved with ``_save_per_input``
    and the combined table with ``_save_all``.

    Returns:
        DataFrame with one row per article (or per document for the
        fallback); an empty DataFrame when there are no input files.
    """
    columns=[
        "article_id","jurisdiction","sector","activity","regulatory_theme","impact_type",
        "effective_date","regulator","keywords","company_country",
        "_source_file","_article_marker","article_text"
    ]
    categories=["sector","activity","regulatory_theme","impact_type"]
    root=_resolve_input_dir()
    files=_list_files(root,recursive=RECURSIVE)
    if not files:
        print("[WARN] no inputs"); return pd.DataFrame()
    global_counter=0
    all_rows=[]
    for f in files:
        per_file_rows,global_counter=_process_file(str(f),global_counter)
        if not per_file_rows:
            # Fallback: no article could be split out, so emit one row
            # describing the whole document.
            raw=_read(str(f))
            raw_txt,soup=_html_to_text(raw)
            en=_translate(raw_txt)
            if not _is_english(en): en=_force_english(en)
            title_guess=_extract_main_title(soup, Path(f).stem)
            back=_extract_doc_fields(title_guess,en)
            for k in categories+["regulator","company_country","jurisdiction"]:
                v=back.get(k,"")
                if isinstance(v,str):
                    back[k]=", ".join(_translate_list_to_english([w for w in v.split(",") if w.strip()]))
            dd=_doc_date(raw_txt,soup,str(f))
            # _ensure_keywords_english translates internally, so one call is
            # enough; the keywords used to be translated up to three times.
            if _has_cjk(raw_txt) and not _is_english(raw_txt):
                tfidf_kws=_ensure_keywords_english(_keywords_cjk(raw_txt,20))
            else:
                tfidf_kws=_ensure_keywords_english(_keywords_en(en,20))
            tax=_load_taxonomy()
            body=en
            rules_by_cat={"sector":SECTOR_RULES,"activity":ACTIVITY_RULES,"regulatory_theme":THEME_RULES,"impact_type":IMPACT_RULES}
            def guess(cat):
                # Taxonomy match first, then static rules, then a fixed default.
                g=_guess_from_taxonomy(body, tax, cat)
                if not g:
                    g=_guess_from_rules(body, rules_by_cat[cat])
                return ", ".join(g[:5]) if g else ("Obligation" if cat=="impact_type" else "General")
            # Computed once; it was previously evaluated twice inside a
            # conditional whose two branches were identical.
            juris_guess=_guess_jurisdiction(title_guess,en)
            per_file_rows=[{
                "article_id":global_counter,
                "jurisdiction":back.get("jurisdiction") or juris_guess,
                "sector":back.get("sector") or guess("sector"),
                "activity":back.get("activity") or guess("activity"),
                "regulatory_theme":back.get("regulatory_theme") or guess("regulatory_theme"),
                "impact_type":back.get("impact_type") or guess("impact_type"),
                "effective_date":back.get("default_effective_date") or dd or datetime.now(timezone.utc).date().isoformat(),
                "regulator":back.get("regulator") or ", ".join(_guess_regulators(en)),
                "keywords":", ".join(_ensure_min_keywords(tfidf_kws, raw_txt, en, k=10)),
                "company_country":back.get("company_country") or juris_guess,
                "_source_file":Path(f).name,
                "_article_marker":"Document",
                "article_text":en
            }]
            per_file_rows=[_post_ensure_english_fields(r) for r in per_file_rows]
            # Record the final labels so later files can match against them.
            tax=_load_taxonomy()
            for cat in categories:
                tax=_taxonomy_add_labels(tax,cat,_normalize_label_list(per_file_rows[0][cat].split(",")))
            _save_taxonomy(tax)
            global_counter+=1
        df_file=pd.DataFrame(per_file_rows,columns=columns)
        _save_per_input(df_file,Path(f))
        print(f"[FILE] {Path(f).name}: {len(per_file_rows)} rows; next_id={global_counter}")
        all_rows.extend(per_file_rows)
    df_all=pd.DataFrame(all_rows,columns=columns)
    if not df_all.empty: _save_all(df_all)
    print(f"[TOTAL] {len(df_all)} rows")
    return df_all

if __name__=="__main__":
    # Run the full pipeline, then preview the first rows of the result.
    df=process_all_documents()
    if df.empty:
        print("No results")
    else:
        print(df.head())


[INPUT] 5 files in directives (recursive=True)
 - 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.html
 - 3.H.R.5376 - Inflation Reduction Act of 2022.xml
 - 4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html
 - 5.中华人民共和国能源法__中国政府网.html
 - 6.人工知能関連技術の研究開発及び活用の推進に関する法律.html
[SAVE] out/1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.csv
[FILE] 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.html: 5 rows; next_id=5
[SAVE] out/3.H.R.5376 - Inflation Reduction Act of 2022.csv
[FILE] 3.H.R.5376 - Inflation Reduction Act of 2022.xml: 144 rows; next_id=149
[SAVE] out/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.csv
[FILE] 4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html: 74 rows; next_id=223
[SAVE] out/5.中华人民共和国能源法__中国政府网.csv
[FILE] 5.中华人民共和国能源法__中国政府网.html: 29 rows; next_id=252
[SAVE] out/6.人工知能関連技術の研究開発及び活用の推進に関する法律.csv
[FILE] 6.人工知能関連技術の研究開発及び活用の推進に関する法律.html: 6 rows

In [36]:
import os, re, json, html, string, unicodedata
from pathlib import Path
from datetime import datetime, timezone, date
import boto3
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from langdetect import detect, detect_langs, DetectorFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text as sk_text

# Fixed seed for langdetect (its documentation recommends setting this to get
# reproducible detection results).
DetectorFactory.seed = 42

# AWS region and Bedrock model ids; each can be overridden through the
# environment. Note the env var names say PRIMARY/SECONDARY while the
# constants are the first and second translation *fallbacks*.
AWS_REGION=os.getenv("AWS_REGION","us-west-2")
MODEL_TRANSLATE_FALLBACK_1=os.getenv("BEDROCK_TRANSLATE_PRIMARY","amazon.nova-micro-v1:0")
MODEL_TRANSLATE_FALLBACK_2=os.getenv("BEDROCK_TRANSLATE_SECONDARY","anthropic.claude-3-7-sonnet-20250219-v1:0")
MODEL_EXTRACT=os.getenv("BEDROCK_EXTRACT_MODEL","amazon.nova-premier-v1:0")
MODEL_EXTRACT_DOC=os.getenv("BEDROCK_EXTRACT_DOC_MODEL","amazon.nova-premier-v1:0:1000k")

# Input discovery and output settings. INPUT_DIRS and ALLOWED_EXT are parsed
# as JSON arrays; RECURSIVE accepts "1", "true" or "yes" (case-insensitive).
INPUT_DIRS=json.loads(os.getenv("INPUT_DIRS","[\"./directives\"]"))
ALLOWED_EXT=set([x.lower().strip() for x in json.loads(os.getenv("ALLOWED_EXT","[\".html\",\".xml\"]"))])
RECURSIVE=os.getenv("RECURSIVE","true").strip().lower() in {"1","true","yes"}
OUT_DIR=os.getenv("OUT_DIR","out")
# Character limits: MAX_DOC_CHARS for a whole document, MAX_CHUNK_CHARS as the
# default chunk size of _chunk_iter.
MAX_DOC_CHARS=int(os.getenv("MAX_DOC_CHARS","180000"))
MAX_CHUNK_CHARS=int(os.getenv("MAX_CHUNK_CHARS","18000"))

# Shared AWS clients, created once at import time.
translate=boto3.client("translate",region_name=AWS_REGION)
bedrock=boto3.client("bedrock-runtime",region_name=AWS_REGION)

# Legal boilerplate and structural words added on top of scikit-learn's
# English stopword list. "applying" is listed twice, which is harmless in a set.
EXTRA_STOPWORDS={"section","sections","article","articles","annex","annexe","appendix","appendice","subtitle","title","chapter","chapitre","directive","regulation","regulations","law","act","union","paragraph","subparagraph","recital","dispositif","premier","1er","amended","shall","must","may","including","include","pursuant","accordance","specified","provide","provided","applicable","applicability","applying","applicant","applicants","applying","subject","subjects","thereof","hereof","therein","herein","thereby","hereby","whereas","hereunder","thereunder","among","between","within","without","preamble","scope","purpose","purposes","general","specific"}
STOPWORDS_EN=set(sk_text.ENGLISH_STOP_WORDS)|EXTRA_STOPWORDS

# Japanese stopwords: structural markers, particles and function words, plus a
# number of content words (e.g. 人工知能, 研究開発).
# NOTE(review): the content words look tuned to one AI-promotion act and may
# over-filter other Japanese documents; confirm before reuse.
STOPWORDS_JA={"第","条","項","号","章","節","款","目次","附則","総則","抄","同","前","又は","及び","並びに","その他","こと","もの","ため","者","うえ","上","下","について","に関する","に係る","する","される","した","して","すると","され","なる","ない","これ","それ","当該","各","同条","政府","国","内閣","大臣","本部","本部長","本法","本章","本条","次項","前項","人工知能","人工知能関連技術","技術","研究開発","活用","推進","計画","基本","基本計画","施策","規定","規範","方針","必要","措置","整備","確保","促進","国際","協力","教育","人材","情報","データ","等","など"}
# Japanese labels presumably treated as structural headings rather than
# article content; not referenced elsewhere in this part of the file.
STRUCTURAL_LABEL_JA={"総則","附則","目次","人工知能基本計画","人工知能戦略本部"}

# Whole lines (MULTILINE) that begin with a preamble phrase such as
# "Having regard", "Whereas" or "Pursuant to".
PARLIAMENTARY=re.compile(r"^\s*(having regard|after transmission|after consulting|in accordance with|whereas|pursuant to|considering|vu(?:\s+la|(?:x|es)?)?)\b.*$",re.IGNORECASE|re.MULTILINE)
# Recital span: from "whereas" / "considérant" / "vu", lazily, up to (not
# including) a first-article style heading or an annex/appendix/schedule word.
RECITALS=re.compile(r"\b(whereas|considérant(?:\s+que)?|vu(?:\s+la|(?:x|es)?)?)\b.*?(?=(^|\n)\s*(article|art\.?|dispositif|chapitre|titre)\s*[^\n]*\b(1|premier|1er)\b|\bannex|annexe|appendix|appendice|schedule\b)",re.IGNORECASE|re.DOTALL)
# Closing formula: everything from "Done at <place> <day> <month> <year>",
# "For the European Parliament" or "For the Council" to the end of the text.
TAIL=re.compile(r"(done at\s+[A-Za-z]+\s+\d{1,2}\s+[A-Za-z]+\s+\d{4}.*$|for the european parliament.*$|for the council.*$)",re.IGNORECASE|re.DOTALL)
# Western dates: "1 Jan 2024" style, "YYYY-M-D", or "D/M/Y" and "D.M.Y".
DATE_RX=re.compile(r"(\b\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{4}\b|\b\d{4}-\d{1,2}-\d{1,2}\b|\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b)",re.IGNORECASE)
# CJK dates written as YYYY年M月D日 (groups: year, month, day).
CJK_DATE_RX=re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
# Japanese era names paired with a year, presumably the Gregorian year of the
# era's first year (TODO confirm against the era parser).
JP_ERA=[("令和",2019),("平成",1989),("昭和",1926),("大正",1912),("明治",1868)]

def _resolve_input_dir():
    """Return the first directory of ``INPUT_DIRS`` that exists.

    When none exists, ``./directives`` is created (including parents) and
    returned.
    """
    existing=next((Path(d) for d in INPUT_DIRS if Path(d).exists()), None)
    if existing is not None:
        return existing
    fallback=Path("./directives")
    fallback.mkdir(parents=True,exist_ok=True)
    return fallback

def _canon_name(p: Path):
    stem=p.stem
    stem=re.sub(r"(?i)(^|[-_\s])(checkpoint|copy|copie|copiar)$","",stem).strip()
    stem=re.sub(r"(?i)[-_]checkpoint","",stem)
    stem=re.sub(r"\s*\(\d+\)$","",stem)
    stem=re.sub(r"\s+"," ",stem)
    return (stem.lower(), p.suffix.lower())

def _is_checkpoint(p: Path):
    name=p.name.lower()
    return ("-checkpoint" in name or name.endswith("_checkpoint.html") or "checkpoint.html" in name or name.endswith(".ipynb") or p.parent.name.lower()==".ipynb_checkpoints" or ".ipynb_checkpoints" in str(p.parent).lower())

def _list_files(root,recursive=False):
    """List the input files under ``root``, keeping one file per canonical name.

    A path qualifies when it is a regular file with an extension in
    ``ALLOWED_EXT``, is not a checkpoint artefact, and its name neither starts
    with "." nor ends with "~". Files sharing a ``_canon_name`` key are
    reduced to one: a non-checkpoint beats a checkpoint, otherwise the most
    recently modified wins.

    Returns:
        Paths sorted by lowercased file name. The list is also printed.
    """
    def _eligible(path):
        if _is_checkpoint(path) or path.parent.name == ".ipynb_checkpoints":
            return False
        fname=path.name
        hidden_or_backup=fname.startswith(".") or fname.startswith("._") or fname.endswith("~")
        return path.is_file() and path.suffix.lower() in ALLOWED_EXT and not hidden_or_backup

    entries=root.rglob("*") if recursive else root.iterdir()
    candidates=[path for path in entries if _eligible(path)]
    chosen={}
    for path in candidates:
        key=_canon_name(path)
        current=chosen.get(key)
        if current is None:
            chosen[key]=path
            continue
        current_is_ckpt=_is_checkpoint(current)
        path_is_ckpt=_is_checkpoint(path)
        if current_is_ckpt and not path_is_ckpt:
            chosen[key]=path
        elif current_is_ckpt==path_is_ckpt and path.stat().st_mtime>current.stat().st_mtime:
            chosen[key]=path
    files=sorted(chosen.values(),key=lambda item:item.name.lower())
    print(f"[INPUT] {len(files)} files in {root} (recursive={recursive})")
    for item in files:
        print(" -",item.name)
    return files

def _read(path): return Path(path).read_text(encoding="utf-8",errors="ignore")

def _extract_main_container(soup):
    for sel in ["#innerDocument","main#contentsLaw","#docHtml","article","#content","body"]:
        el=soup.select_one(sel)
        if el and len(el.get_text(strip=True))>200: return el
    return soup

def _html_to_text(raw):
    """Convert raw HTML/XML markup to plain text.

    The markup is parsed with lxml (falling back to ``html.parser``), the main
    container is selected, and script/style/navigation-type tags are removed.
    ``br``/``hr`` become newlines, list items become "- " lines, and headings
    and bold text are put on lines of their own.

    Returns:
        ``(text, soup)``: the text with runs of three or more newlines reduced
        to two, and the parsed (now modified) soup.
    """
    try:
        soup=BeautifulSoup(raw,"lxml")
    except Exception:
        soup=BeautifulSoup(raw,"html.parser")
    container=_extract_main_container(soup)
    for junk in container(["script","style","nav","header","footer","noscript","aside","form"]):
        try:
            junk.extract()
        except Exception:
            pass
    for line_break in container.find_all(["br","hr"]):
        line_break.replace_with("\n")
    for item in container.find_all("li"):
        content=item.get_text(" ",strip=True)
        item.string=("\n- "+content+"\n") if content else "\n"
    for heading in container.find_all(["h1","h2","h3","h4","h5","h6","strong","b"]):
        caption=heading.get_text(" ",strip=True)
        heading.string=("\n"+caption+"\n") if caption else "\n"
    flattened=html.unescape(container.get_text("\n",strip=True))
    return re.sub(r"\n{3,}","\n\n",flattened),soup

def _extract_main_title(soup, fallback):
    try: t=soup.title.get_text(strip=True) if soup and soup.title else ""
    except Exception: t=""
    try: h=soup.find(["h1","h2"]); h1=h.get_text(" ",strip=True) if h else ""
    except Exception: h1=""
    return next((x for x in [h1,t,fallback] if x),fallback)

def _has_cjk(s): return bool(re.search(r"[\u3400-\u4dbf\u4e00-\u9fff\u3040-\u30ff]",s or ""))

def _lang_probs(text):
    """Return language candidates for the text as objects with ``lang`` and ``prob``.

    ``detect_langs`` is used when it works. If it raises, a single candidate
    with probability 1.0 is built from ``detect``; if that raises too, the
    result is ``[]``.
    """
    try:
        return detect_langs(text)
    except Exception:
        pass
    try:
        single=detect(text)
    except Exception:
        return []
    # Tiny ad-hoc object exposing the same two attributes as detect_langs items.
    return [type("LP",(object,),{"lang":single,"prob":1.0})()]

def _english_confidence(text):
    """Score from 0.0 to 1.0 for how likely the text is English.

    Computed on the first 8000 characters as
    ``0.75 * p_en + 0.25 * (0.6 * ascii_ratio + 0.4 * stop_ratio)`` where
    ``p_en`` is the language detector's probability for "en", ``ascii_ratio``
    the share of alphabetic characters that are ASCII letters, and
    ``stop_ratio`` the share of whitespace-separated tokens that are English
    stopwords. Empty text scores 1.0.
    """
    stripped=(text or "").strip()
    if not stripped:
        return 1.0
    sample=stripped[:8000]

    alpha_total=0
    alpha_ascii=0
    for ch in sample:
        if ch.isalpha():
            alpha_total+=1
            if ("A"<=ch<="Z") or ("a"<=ch<="z"):
                alpha_ascii+=1
    ascii_ratio=(alpha_ascii/max(1,alpha_total)) if alpha_total else 0.0

    tokens=[w.strip(string.punctuation).lower() for w in re.split(r"\s+",sample) if w]
    stop_hits=sum(1 for tok in tokens if tok in sk_text.ENGLISH_STOP_WORDS)
    stop_ratio=stop_hits/max(1,len(tokens))

    ld_prob=0.0
    for candidate in _lang_probs(sample):
        if getattr(candidate,"lang","")=="en":
            ld_prob=max(ld_prob,float(getattr(candidate,"prob",0.0)))

    score=0.75*ld_prob+0.25*(0.6*ascii_ratio+0.4*stop_ratio)
    return max(0.0,min(1.0,score))

def _is_english(text):
    """Decide whether the text is English.

    Empty or whitespace-only text counts as English. Text containing CJK
    characters, or shorter than 160 characters, is judged by the language
    detector alone (a detector error means "not English"). Longer text must
    reach an ``_english_confidence`` of at least 0.55.
    """
    if not text or len(text.strip())==0:
        return True
    if _has_cjk(text) or len(text)<160:
        try:
            return detect(text)=="en"
        except Exception:
            return False
    return _english_confidence(text)>=0.55

def _translate_piece_bedrock(piece,model_id,system_prompt=None):
    """Translate one piece of text to English with a Bedrock model.

    The request goes through the Bedrock Converse API, which uses one
    request/response shape for every model family. (The previous
    non-Anthropic branch sent the Titan ``inputText`` schema, which does not
    match the Amazon Nova model configured as the first fallback, so that
    fallback always returned "".)

    Args:
        piece: Text to translate; empty or ``None`` returns "" without a call.
        model_id: Bedrock model id.
        system_prompt: Optional system instruction; it was previously accepted
            but ignored and is now forwarded when given.

    Returns:
        The model's text output, or "" on any error (best effort).
    """
    if not piece:
        return ""
    # The wording sent to each model family is kept as before.
    if "anthropic" in model_id:
        instruction="Translate this to precise legal English. Keep headings, numbering, dates, entities verbatim. No commentary.\n\n"
    else:
        instruction="Translate the following text into precise legal English. Preserve headings, numbering, and dates. No commentary.\n\n"
    request={
        "modelId":model_id,
        "messages":[{"role":"user","content":[{"text":instruction+piece}]}],
        "inferenceConfig":{"maxTokens":4000,"temperature":0.0},
    }
    if system_prompt:
        request["system"]=[{"text":system_prompt}]
    try:
        resp=bedrock.converse(**request)
        parts=resp["output"]["message"]["content"]
        return "".join(p.get("text","") for p in parts if isinstance(p,dict))
    except Exception:
        return ""

def _chunk_for_translate(t,limit=4200):
    out=[]; i=0; n=len(t)
    seps=["\n\n","。\n","。\n\n","；","；\n","；\n\n","，","。\n—\n","\n- "]
    while i<n:
        j=min(i+limit,n)
        k=-1
        for sep in seps:
            ks=t.rfind(sep,i,j)
            if ks>k: k=ks+len(sep)
        if k<i+200: k=j
        piece=t[i:k].strip(); i=k
        if piece: out.append(piece)
    return out

def _force_english(text):
    if not text: return text
    if _is_english(text): return text
    chunks=_chunk_for_translate(text,limit=4000)
    out=[]
    for piece in chunks:
        ok=False
        try:
            r=translate.translate_text(Text=piece,SourceLanguageCode="auto",TargetLanguageCode="en")
            cand=r.get("TranslatedText","") or ""
            if cand.strip() and _is_english(cand): out.append(cand); ok=True
        except Exception: pass
        if not ok:
            cand=_translate_piece_bedrock(piece,MODEL_TRANSLATE_FALLBACK_1)
            if cand.strip() and _is_english(cand): out.append(cand); ok=True
        if not ok:
            cand=_translate_piece_bedrock(piece,MODEL_TRANSLATE_FALLBACK_2)
            if cand.strip():
                out.append(cand if _is_english(cand) else _translate_piece_bedrock(cand,MODEL_TRANSLATE_FALLBACK_1))
    final="\n".join([c for c in out if c]).strip()
    return final if final else text

def _translate(text):
    t=(text or "").strip()
    if not t: return t
    if _is_english(t): return t
    out=_force_english(t)
    if not _is_english(out): out=_force_english(out)
    return out

def _prune_operative(text_en):
    """Remove non-operative parts from an English legal text.

    Applies the PARLIAMENTARY, RECITALS and TAIL patterns in that order, then
    deletes a "Table of Contents" block that runs up to the first Article 1
    heading, and returns the stripped remainder.
    """
    pruned = text_en
    for pattern in (PARLIAMENTARY, RECITALS, TAIL):
        pruned = pattern.sub("", pruned)
    pruned = re.sub(
        r"\bTable of Contents\b.*?(?=(^|\n)\s*Article\s+(1|premier|1er)\b)",
        "",
        pruned,
        flags=re.IGNORECASE|re.DOTALL,
    )
    return pruned.strip()

def _chunk_iter(text,limit=MAX_CHUNK_CHARS):
    """Yield stripped pieces of ``text`` no longer than ``limit`` characters.

    A window that does not reach the end of the text is cut after the last
    separator (paragraph break, list bullet, "; " or ". ") found inside it,
    provided that cut leaves at least 200 characters; otherwise it is cut
    hard at ``limit``. A remainder that already fits in the window is yielded
    whole instead of being split again at its last separator.

    Args:
        text: Text to split.
        limit: Maximum window size in characters; must be >= 1.

    Yields:
        Non-empty, stripped pieces in document order.

    Raises:
        ValueError: On first iteration, if ``limit`` is not positive (the
            scan could never advance).
    """
    if limit<1:
        raise ValueError("limit must be a positive integer")
    seps=["\n\n","\n- ","; ",". "]
    i=0; n=len(text)
    while i<n:
        j=min(i+limit,n)
        k=j
        if j<n:
            # Track the furthest *end* position of any separator in the window.
            best=-1
            for sep in seps:
                ks=text.rfind(sep,i,j)
                if ks!=-1: best=max(best,ks+len(sep))
            if best>=i+200: k=best
        piece=text[i:k].strip(); i=k
        if piece: yield piece

def _to_ymd(s):
    try: return parse_date(s,fuzzy=True,dayfirst=False).date().isoformat()
    except Exception: return ""

def _parse_cjk_date(s):
    """Return the first CJK_DATE_RX match in ``s`` as YYYY-MM-DD, or "".

    Groups 1-3 of the pattern are read as year, month and day; an impossible
    calendar date yields "".
    """
    hit = CJK_DATE_RX.search(s)
    if hit is None:
        return ""
    year, month, day = (int(hit.group(idx)) for idx in (1, 2, 3))
    try:
        return date(year, month, day).isoformat()
    except Exception:
        return ""

def _parse_jp_era(s):
    m=re.search(r"(令和|平成|昭和|大正|明治)\s*([元\d]+)\s*年\s*([0-9]{1,2})\s*月\s*([0-9]{1,2})\s*日",s)
    if not m: return ""
    era=m.group(1); year=m.group(2); month=int(m.group(3)); day=int(m.group(4))
    base=dict(JP_ERA)[era]; y=1 if year=="元" else int(year)
    try: return date(base+y-1,month,day).isoformat()
    except Exception: return ""

def _doc_date(raw_text,soup,fname):
    """Best-effort document date as YYYY-MM-DD.

    Sources are tried in this order, and the first hit wins:
      1. Japanese era date in the header elements, then in the first 120000
         characters of ``raw_text``.
      2. CJK numeric date, same two places.
      3. DATE_RX matches in the header, then in the text window.
      4. A YYYY-MM-DD / YYYY_MM_DD pattern in the file name.
      5. A standalone 19xx/20xx year in the file name (mapped to 30 June).
      6. The file's modification time (UTC), else today's UTC date.
    """
    header_bits = []
    try:
        for el in soup.select("#lawTitleNo,#lawTitle,.oj-hd-date,.oj-doc-ti,.date,.document-date,.pubdate,.issued,.enacted,.approved,#lawTitleNo"):
            header_bits.append(el.get_text(" ",strip=True))
    except Exception:
        pass
    head = " ".join(header_bits)

    era_date = _parse_jp_era(head)
    if era_date:
        return era_date
    body_window = raw_text[:120000]
    era_date = _parse_jp_era(body_window)
    if era_date:
        return era_date

    cjk_date = _parse_cjk_date(head) or _parse_cjk_date(body_window)
    if cjk_date:
        return cjk_date

    for source in (head, body_window):
        for found in DATE_RX.findall(source):
            iso = _to_ymd(found[0])
            if iso:
                return iso

    base_name = Path(fname).name
    full = re.search(r"(\d{4})[-_](\d{1,2})[-_](\d{1,2})", base_name)
    if full:
        try:
            return datetime(int(full.group(1)), int(full.group(2)), int(full.group(3))).date().isoformat()
        except Exception:
            pass
    year_only = re.search(r"\b(19|20)\d{2}\b", base_name)
    if year_only:
        try:
            return datetime(int(year_only.group(0)), 6, 30).date().isoformat()
        except Exception:
            pass
    try:
        modified = Path(fname).stat().st_mtime
        return datetime.fromtimestamp(modified, tz=timezone.utc).date().isoformat()
    except Exception:
        return datetime.now(timezone.utc).date().isoformat()

def _ensure_ascii_lower(s: str):
    s=(s or "").strip()
    s=unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s=re.sub(r"[^A-Za-z0-9%/_\-\s]", "", s)
    s=re.sub(r"\s+"," ", s).strip().lower()
    return s

def _kw_from_text(text_en, max_k=40):
    try:
        vec=TfidfVectorizer(stop_words=list(STOPWORDS_EN),ngram_range=(1,2),min_df=1,token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z%/_-]+\b")
        X=vec.fit_transform([text_en]); scores=X.toarray()[0]; terms=vec.get_feature_names_out()
        pairs=sorted(zip(terms,scores),key=lambda x:-x[1])
        bad={"article","articles","annex","section","chapter","law","act","directive","regulation","regulations","union","paragraph","recital","subparagraph","subtitle","title"}
        out=[]
        for t,_ in pairs:
            tl=_ensure_ascii_lower(t)
            if tl in bad or tl in STOPWORDS_EN: continue
            if re.fullmatch(r"\d+(\.\d+)?",tl): continue
            if len(tl)<=2: continue
            if tl not in out: out.append(tl)
            if len(out)>=max_k: break
        return out
    except Exception:
        return []

def _bedrock_json_object(model_id, prompt, max_tokens):
    """Send ``prompt`` to a Bedrock model and return the JSON object it answers with.

    Uses the model-agnostic ``converse`` API so the same request works for
    Nova and Anthropic models alike. The previous hand-built ``invoke_model``
    body matched neither family's schema and read the reply from a field that
    Nova does not return. Tolerates prose or code fences around the JSON by
    falling back to the outermost ``{...}`` span.

    Raises:
        Exception: On transport errors or when no JSON object can be parsed.
    """
    resp=bedrock.converse(
        modelId=model_id,
        messages=[{"role":"user","content":[{"text":prompt}]}],
        inferenceConfig={"maxTokens":max_tokens,"temperature":0.0},
    )
    blocks=resp.get("output",{}).get("message",{}).get("content") or []
    text_out="".join(b.get("text","") or "" for b in blocks if isinstance(b,dict)).strip()
    try:
        data=json.loads(text_out)
    except ValueError:
        lo,hi=text_out.find("{"),text_out.rfind("}")
        if lo==-1 or hi<=lo: raise
        data=json.loads(text_out[lo:hi+1])
    if not isinstance(data,dict):
        raise ValueError("model did not return a JSON object")
    return data

def _as_label_list(v):
    """Coerce a model-supplied value into a list of non-blank, stripped strings."""
    if isinstance(v,list): return [str(x).strip() for x in v if str(x).strip()]
    if isinstance(v,str) and v.strip(): return [v.strip()]
    return []

def _shape_llm_fields(data, date_key):
    """Project a model JSON object onto the fixed extraction schema.

    ``date_key`` names the date field ("effective_date" for chunks,
    "default_effective_date" for whole documents).
    """
    return {
        "jurisdiction":str(data.get("jurisdiction","") or "").strip(),
        "sector":_as_label_list(data.get("sector",[])),
        "activity":_as_label_list(data.get("activity",[])),
        "regulatory_theme":_as_label_list(data.get("regulatory_theme",[])),
        "impact_type":_as_label_list(data.get("impact_type",[])),
        "regulator":_as_label_list(data.get("regulator",[])),
        "company_country":_as_label_list(data.get("company_country",[])),
        date_key:str(data.get(date_key,"") or "").strip()
    }

def _llm_extract_chunk(chunk_en):
    """Extract regulatory metadata from one English chunk with MODEL_EXTRACT.

    Returns:
        Dict with keys jurisdiction (str), sector, activity, regulatory_theme,
        impact_type, regulator, company_country (lists of str) and
        effective_date (str). All fields are empty when the model call or
        JSON parsing fails.
    """
    prompt=("Return a strict JSON object with keys exactly:\n"
            "jurisdiction (string), sector (array of strings), activity (array of strings), regulatory_theme (array of strings), impact_type (array of strings), regulator (array of strings), company_country (array of strings), effective_date (YYYY-MM-DD or \"\").\n"
            "Use only this text, be concise, lowercase labels except jurisdictions/regulators/countries (title case), deduplicate.\n\nTEXT:\n"+chunk_en[:MAX_CHUNK_CHARS])
    try:
        return _shape_llm_fields(_bedrock_json_object(MODEL_EXTRACT,prompt,3000),"effective_date")
    except Exception as exc:
        print(f"[WARN] chunk extraction failed: {exc}")
        return _shape_llm_fields({},"effective_date")

def _llm_extract_doc(title, text_en):
    """Extract document-level metadata from a title and English text with MODEL_EXTRACT_DOC.

    Only the first MAX_DOC_CHARS characters of ``text_en`` are sent; ``None``
    is treated as an empty document.

    Returns:
        Same shape as ``_llm_extract_chunk`` except that the date key is
        ``default_effective_date``. All fields are empty on failure.
    """
    prompt=("From the following title and document, infer a single JSON object with keys exactly:\n"
            "jurisdiction (string), sector (array), activity (array), regulatory_theme (array), impact_type (array), regulator (array), company_country (array), default_effective_date (YYYY-MM-DD or \"\").\n\n"
            "TITLE:\n"+(title or "")+"\n\nDOCUMENT:\n"+(text_en or "")[:MAX_DOC_CHARS])
    try:
        return _shape_llm_fields(_bedrock_json_object(MODEL_EXTRACT_DOC,prompt,6000),"default_effective_date")
    except Exception as exc:
        print(f"[WARN] document extraction failed: {exc}")
        return _shape_llm_fields({},"default_effective_date")

def _norm_titlecase(items): return sorted(list(dict.fromkeys([_to_title(x) for x in items if x.strip()])))
def _norm_lower(items): return sorted(list(dict.fromkeys([_ensure_ascii_lower(x) for x in items if _ensure_ascii_lower(x)])))

def _to_title(s):
    """Normalise ``s`` with ``_ensure_ascii_lower`` and capitalise each word."""
    words = _ensure_ascii_lower(s).split()
    return " ".join(map(str.capitalize, words))

def _aggregate_fields(doc_backfill, chunk_fields_list, corpus_text_en, doc_date_guess):
    """Merge document-level and chunk-level extractions into one output row.

    Document-level values seed every field; chunk values extend the list
    fields and fill jurisdiction / effective date only while those are still
    empty. A missing effective date falls back to the first DATE_RX match in
    the first 8000 characters of the text, then to ``doc_date_guess``.

    Fallbacks are applied after normalisation, so a label that normalises to
    nothing (for example a non-ASCII jurisdiction) triggers the default
    instead of leaking an empty string into the row.

    Args:
        doc_backfill: Result of the document-level extraction.
        chunk_fields_list: Results of the per-chunk extractions.
        corpus_text_en: English text used for the date fallback and keywords.
        doc_date_guess: Last-resort effective date.

    Returns:
        Dict with jurisdiction, sector, activity, regulatory_theme,
        impact_type, effective_date, regulator, keywords, company_country.
    """
    list_keys=("sector","activity","regulatory_theme","impact_type","regulator","company_country")
    merged={k:set(doc_backfill.get(k) or []) for k in list_keys}
    juris=(doc_backfill.get("jurisdiction") or "").strip()
    eff=(doc_backfill.get("default_effective_date") or "").strip()

    for cf in chunk_fields_list:
        if not juris: juris=(cf.get("jurisdiction") or "").strip()
        for k in list_keys: merged[k].update(cf.get(k) or [])
        if not eff: eff=(cf.get("effective_date") or "").strip()

    if not eff:
        m=DATE_RX.search(corpus_text_en[:8000])
        if m: eff=_to_ymd(m[0])
        if not eff: eff=doc_date_guess

    juris=_to_title(juris) or "Global"
    regulators=[r for r in _norm_titlecase(merged["regulator"]) if r]
    countries=[c for c in _norm_titlecase(merged["company_country"]) if c] or [juris]
    sectors=_norm_lower(merged["sector"]) or ["general"]
    activities=_norm_lower(merged["activity"]) or ["general"]
    themes=_norm_lower(merged["regulatory_theme"]) or ["general"]
    impacts=_norm_lower(merged["impact_type"]) or ["obligation"]

    kws=_kw_from_text(corpus_text_en, max_k=50)

    return {
        "jurisdiction": juris,
        "sector": sectors,
        "activity": activities,
        "regulatory_theme": themes,
        "impact_type": impacts,
        "effective_date": eff or "",
        "regulator": regulators or ["General Regulator"],
        "keywords": kws,
        "company_country": countries
    }

def _serialize_row(row):
    def J(x): return json.dumps(x, ensure_ascii=False)
    return {
        "jurisdiction": row["jurisdiction"],
        "sector": J(sorted(row["sector"])),
        "activity": J(sorted(row["activity"])),
        "regulatory_theme": J(sorted(row["regulatory_theme"])),
        "impact_type": J(sorted(row["impact_type"])),
        "effective_date": row["effective_date"],
        "regulator": J(sorted(row["regulator"])),
        "keywords": J(row["keywords"]),
        "company_country": J(sorted(row["company_country"]))
    }

def _save_per_input(row,input_path):
    """Write ``row`` as a one-line CSV named after the stem of ``input_path`` under OUT_DIR."""
    columns = ["jurisdiction","sector","activity","regulatory_theme","impact_type","effective_date","regulator","keywords","company_country"]
    Path(OUT_DIR).mkdir(parents=True,exist_ok=True)
    csv_path = "{}/{}.csv".format(OUT_DIR, input_path.stem)
    frame = pd.DataFrame([_serialize_row(row)], columns=columns)
    frame.to_csv(csv_path, index=False)
    print("[SAVE] " + csv_path)

def _save_all(rows):
    """Write every row to OUT_DIR/Regulatory_Extraction_ALL.csv, one line per input document."""
    columns = ["jurisdiction","sector","activity","regulatory_theme","impact_type","effective_date","regulator","keywords","company_country"]
    Path(OUT_DIR).mkdir(parents=True,exist_ok=True)
    csv_path = "{}/Regulatory_Extraction_ALL.csv".format(OUT_DIR)
    records = [_serialize_row(entry) for entry in rows]
    pd.DataFrame(records, columns=columns).to_csv(csv_path, index=False)
    print("[SAVE] " + csv_path)

def process_file(path: Path):
    """Run the full extraction pipeline on one input file and return its row.

    Steps: read and strip markup, translate to English, prune non-operative
    text, guess the document date, run the document-level extraction, run
    the chunk-level extraction on every chunk, then aggregate.
    """
    def _as_english(text):
        # Translate, then force once more if the result still is not English.
        candidate = _translate(text)
        return candidate if _is_english(candidate) else _force_english(candidate)

    raw = _read(path)
    raw_txt, soup = _html_to_text(raw)
    title = _extract_main_title(soup, path.stem)
    operative = _prune_operative(_as_english(raw_txt))
    doc_date_guess = _doc_date(raw_txt, soup, str(path))
    back = _llm_extract_doc(title, operative)

    chunk_fields = [
        _llm_extract_chunk(_as_english(chunk))
        for chunk in _chunk_iter(operative, limit=MAX_CHUNK_CHARS)
    ]

    fallback_date = back.get("default_effective_date") or doc_date_guess
    return _aggregate_fields(back, chunk_fields, operative, fallback_date)

def process_all_documents():
    """Process every input file, save per-file and combined CSVs, and return the table.

    Returns an empty DataFrame (after a warning) when no input files exist.
    """
    columns = ["jurisdiction","sector","activity","regulatory_theme","impact_type","effective_date","regulator","keywords","company_country"]
    input_root = _resolve_input_dir()
    inputs = _list_files(input_root, recursive=RECURSIVE)
    if not inputs:
        print("[WARN] no inputs")
        return pd.DataFrame()
    collected = []
    for source in inputs:
        extracted = process_file(source)
        _save_per_input(extracted, source)
        collected.append(extracted)
        print(f"[FILE] {Path(source).name}: 1 row")
    _save_all(collected)
    table = pd.DataFrame([_serialize_row(entry) for entry in collected], columns=columns)
    print(f"[TOTAL] {len(table)} rows")
    return table

# Run the whole pipeline when the cell or script is executed directly.
if __name__=="__main__":
    df = process_all_documents()
    if df.empty:
        print("No results")
    else:
        print(df.head())


[INPUT] 5 files in directives (recursive=True)
 - 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.html
 - 3.H.R.5376 - Inflation Reduction Act of 2022.xml
 - 4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html
 - 5.中华人民共和国能源法__中国政府网.html
 - 6.人工知能関連技術の研究開発及び活用の推進に関する法律.html
[SAVE] out/1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.csv
[FILE] 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.html: 1 row
[SAVE] out/3.H.R.5376 - Inflation Reduction Act of 2022.csv
[FILE] 3.H.R.5376 - Inflation Reduction Act of 2022.xml: 1 row
[SAVE] out/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.csv
[FILE] 4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html: 1 row
[SAVE] out/5.中华人民共和国能源法__中国政府网.csv
[FILE] 5.中华人民共和国能源法__中国政府网.html: 1 row
[SAVE] out/6.人工知能関連技術の研究開発及び活用の推進に関する法律.csv
[FILE] 6.人工知能関連技術の研究開発及び活用の推進に関する法律.html: 1 row
[SAVE] out/Regulatory_Extraction_ALL.csv
[TOTAL] 5 rows
  

# Load and translate

In [45]:
import json
from pathlib import Path
import boto3
from botocore.config import Config
from bs4 import BeautifulSoup

# --- Cell configuration -----------------------------------------------------
# NOTE: AWS_REGION, OUT_DIR and `bedrock` are also assigned by the first cell
# of this notebook; running this cell rebinds them for everything that follows.
AWS_REGION = "us-west-2"
# Translated .txt files are written here by process_file().
OUT_DIR = "out/translate"
# Default `limit` (in characters) used by chunk_text().
MAX_CHUNK_CHARS = 4000

# Bedrock runtime client with a 60 s read timeout and botocore retries (max_attempts=3).
bedrock = boto3.client("bedrock-runtime", region_name=AWS_REGION, config=Config(read_timeout=60, retries={"max_attempts": 3}))

# Model identifiers passed as `modelId` to invoke_model. translate_chunk() in
# this cell uses the Haiku entry first and the Sonnet 4.5 entry as fallback;
# "anthropic_sonnet_4" is not referenced by the code in this cell.
PROFILE_IDS = {
    "anthropic_haiku_4_5": "global.anthropic.claude-haiku-4-5-20251001-v1:0",
    "anthropic_sonnet_4_5": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
    "anthropic_sonnet_4": "global.anthropic.claude-sonnet-4-20250514-v1:0"
}

def get_files():
    """List the .html / .xml files directly inside ./directives (non-recursive).

    Prints a notice and returns an empty list when the folder does not exist.
    """
    source_dir = Path("./directives")
    if not source_dir.exists():
        print("directives folder not found")
        return []
    wanted = {".html", ".xml"}
    found = []
    for entry in source_dir.iterdir():
        if entry.is_file() and entry.suffix.lower() in wanted:
            found.append(entry)
    return found

def html_to_text(raw):
    """Return the visible text of ``raw`` markup as one space-joined string.

    script, style, nav, header and footer elements are removed first.
    NOTE(review): .xml inputs go through the same tag list; if an XML schema
    uses <header> for section headings, that text is removed too - confirm.
    """
    soup = BeautifulSoup(raw, "html.parser")
    for node in soup.find_all(["script", "style", "nav", "header", "footer"]):
        node.extract()
    return soup.get_text(" ", strip=True)

def chunk_text(text, limit=MAX_CHUNK_CHARS):
    """Split ``text`` into stripped chunks of at most ``limit`` characters.

    When a window does not reach the end of the text, it is shortened to end
    just after the last occurrence of the first separator type (". ", ".\\n",
    "! ", "? " - tried in that order) found more than 200 characters into the
    window. Blank chunks are omitted.
    """
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        stop = min(start + limit, total)
        if stop < total:
            floor = start + 200
            for marker in (". ", ".\n", "! ", "? "):
                pos = text.rfind(marker, start, stop)
                if pos > floor:
                    stop = pos + len(marker)
                    break
        piece = text[start:stop].strip()
        if piece:
            pieces.append(piece)
        start = stop
    return pieces

def invoke_anthropic_profile(profile_id, user_text, max_tokens=4000, temperature=0.0):
    """Send one user message to an Anthropic model on Bedrock and return its reply text.

    Returns the stripped ``text`` of the first content block ("" when that
    block has no text). Exceptions from the call or from response parsing
    propagate to the caller.
    """
    user_turn = {"role": "user", "content": [{"type": "text", "text": user_text}]}
    payload = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [user_turn],
    })
    response = bedrock.invoke_model(modelId=profile_id, body=payload)
    parsed = json.loads(response["body"].read())
    first_block = parsed.get("content", [{}])[0]
    reply = first_block.get("text") or ""
    return reply.strip()

def translate_chunk(text):
    """Translate ``text`` to legal English, trying Haiku first and Sonnet 4.5 second.

    The first non-empty model reply wins. If both models fail or reply with
    nothing, the original ``text`` is returned unchanged.
    """
    ladder = (
        ("Anthropic Haiku", "anthropic_haiku_4_5"),
        ("Anthropic Sonnet", "anthropic_sonnet_4_5"),
    )
    for label, key in ladder:
        try:
            request = f"Translate this text to precise legal English. Preserve headings, numbering, and dates. No commentary.\n\n{text}"
            reply = invoke_anthropic_profile(PROFILE_IDS[key], request)
            if reply:
                return reply
        except Exception as e:
            print(f"{label} failed: {e}")
    return text

def process_file(file_path):
    """Translate one input file chunk by chunk and save the result under OUT_DIR.

    The output file is ``<stem>.txt``; translated chunks are joined by blank
    lines.
    """
    print(f"Processing: {file_path.name}")
    markup = file_path.read_text(encoding="utf-8", errors="ignore")
    pieces = chunk_text(html_to_text(markup))
    total = len(pieces)
    rendered = []
    for position, piece in enumerate(pieces, start=1):
        print(f"  Translating chunk {position}/{total}")
        rendered.append(translate_chunk(piece))
    target_dir = Path(OUT_DIR)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{file_path.stem}.txt"
    target.write_text("\n\n".join(rendered), encoding="utf-8")
    print(f"Saved: {target.name}")

def main():
    """Translate every eligible file found in the directives folder."""
    inputs = get_files()
    print(f"Found {len(inputs)} files in directives folder")
    for source in inputs:
        process_file(source)

# Entry point when the cell or script is executed directly.
if __name__ == "__main__":
    main()


Found 5 files in directives folder
Processing: 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.html
  Translating chunk 1/28
  Translating chunk 2/28
  Translating chunk 3/28
  Translating chunk 4/28
  Translating chunk 5/28
  Translating chunk 6/28
  Translating chunk 7/28
  Translating chunk 8/28
  Translating chunk 9/28
  Translating chunk 10/28
  Translating chunk 11/28
  Translating chunk 12/28
  Translating chunk 13/28
  Translating chunk 14/28
  Translating chunk 15/28
  Translating chunk 16/28
  Translating chunk 17/28
  Translating chunk 18/28
  Translating chunk 19/28
  Translating chunk 20/28
  Translating chunk 21/28
  Translating chunk 22/28
  Translating chunk 23/28
  Translating chunk 24/28
  Translating chunk 25/28
  Translating chunk 26/28
  Translating chunk 27/28
  Translating chunk 28/28
Saved: 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL.txt
Processing: 3.H.R.5376 - Inflation Reduction Act of 2022.xml
  Translating chunk 1/200
  Trans

# Translation with English recognition

In [None]:
import json
from pathlib import Path
import boto3
from botocore.config import Config
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory

# --- Cell configuration -----------------------------------------------------
# NOTE: these names are also assigned by earlier cells; running this cell
# rebinds them for everything that follows.
AWS_REGION = "us-west-2"
# Translated (or passed-through) .txt files are written here by process_file().
OUT_DIR = "out/translate"
# Default `limit` (in characters) used by chunk_text().
MAX_CHUNK_CHARS = 4000

# Fix the langdetect seed so language detection is repeatable between runs.
DetectorFactory.seed = 42

# Bedrock runtime client with a 60 s read timeout and botocore retries (max_attempts=3).
bedrock = boto3.client("bedrock-runtime", region_name=AWS_REGION, config=Config(read_timeout=60, retries={"max_attempts": 3}))

# Model identifiers passed as `modelId` to invoke_model. This cell uses the
# Haiku entry first and the Sonnet 4.5 entry as fallback;
# "anthropic_sonnet_4" is not referenced by the code in this cell.
PROFILE_IDS = {
    "anthropic_haiku_4_5": "global.anthropic.claude-haiku-4-5-20251001-v1:0",
    "anthropic_sonnet_4_5": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
    "anthropic_sonnet_4": "global.anthropic.claude-sonnet-4-20250514-v1:0"
}

def log(msg):
    """Print ``msg`` with a "[LOG] " prefix."""
    line = "[LOG] {}".format(msg)
    print(line)

def get_files():
    """List the .html / .xml files directly inside ./directives (non-recursive).

    Logs the number of eligible files, or a notice (returning []) when the
    folder does not exist.
    """
    folder = Path("./directives")
    if not folder.exists():
        log("directives folder not found")
        return []
    eligible = []
    for entry in folder.iterdir():
        if entry.is_file() and entry.suffix.lower() in {".html", ".xml"}:
            eligible.append(entry)
    log(f"Detected {len(eligible)} eligible files in directives folder")
    return eligible

def html_to_text(raw):
    """Return the visible text of ``raw`` markup as one space-joined string.

    script, style, nav, header and footer elements are removed first, and the
    extracted length is logged.
    NOTE(review): .xml inputs go through the same tag list; if an XML schema
    uses <header> for section headings, that text is removed too - confirm.
    """
    document = BeautifulSoup(raw, "html.parser")
    for node in document.find_all(["script", "style", "nav", "header", "footer"]):
        node.extract()
    visible = document.get_text(" ", strip=True)
    log(f"Extracted {len(visible)} characters of text from HTML")
    return visible

def chunk_text(text, limit=MAX_CHUNK_CHARS):
    """Split ``text`` into stripped chunks of at most ``limit`` characters.

    When a window does not reach the end of the text, it is shortened to end
    just after the last occurrence of the first separator type (". ", ".\\n",
    "! ", "? " - tried in that order) found more than 200 characters into the
    window. Blank chunks are dropped, and the logged count reflects the
    chunks actually returned.

    Args:
        text: Text to split.
        limit: Maximum chunk size in characters; must be >= 1.

    Returns:
        List of non-empty chunks in document order.

    Raises:
        ValueError: If ``limit`` is not positive (the scan could never advance).
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")
    chunks, i = [], 0
    while i < len(text):
        end = min(i + limit, len(text))
        if end < len(text):
            for sep in [". ", ".\n", "! ", "? "]:
                k = text.rfind(sep, i, end)
                if k > i + 200:
                    end = k + len(sep)
                    break
        piece = text[i:end].strip()
        if piece:
            chunks.append(piece)
        i = end
    log(f"Split text into {len(chunks)} chunks of up to {limit} characters")
    return chunks

def invoke_anthropic_profile(profile_id, user_text, max_tokens=4000, temperature=0.0):
    """Send one user message to an Anthropic model on Bedrock and return its reply text.

    Returns the stripped ``text`` of the first content block ("" when that
    block has no text). Exceptions from the call or from response parsing
    propagate to the caller.
    """
    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": [{"type": "text", "text": user_text}]}],
    }
    raw_response = bedrock.invoke_model(modelId=profile_id, body=json.dumps(request))
    decoded = json.loads(raw_response["body"].read())
    leading_block = decoded.get("content", [{}])[0]
    return (leading_block.get("text") or "").strip()

def llm_is_english(sample_text):
    """Ask an LLM whether the first 2000 characters of ``sample_text`` are English.

    Haiku is asked first; Sonnet 4.5 is asked only if the Haiku call raises.
    The answer counts as English when the reply starts with "en". Returns
    False when both calls fail.
    """
    log("Checking language of text with LLM fallback...")
    question = "Answer with exactly 'en' if the text is English, otherwise 'non-en'. Text:\n\n" + sample_text[:2000]
    for label, key in (("Haiku", "anthropic_haiku_4_5"), ("Sonnet", "anthropic_sonnet_4_5")):
        try:
            answer = invoke_anthropic_profile(PROFILE_IDS[key], question, max_tokens=5)
            return answer.strip().lower().startswith("en")
        except Exception as err:
            log(f"LLM language check ({label}) failed: {err}")
    return False

def corpus_is_english(text):
    """Decide whether a whole document is already English.

    Blank text counts as English. langdetect is run on the first 10000
    characters; only if it does not answer "en" (or fails) is the first chunk
    of the text sent to ``llm_is_english`` for a verdict.
    """
    body = (text or "").strip()
    if not body:
        return True
    try:
        code = detect(body[:10000])
        log(f"Detected language via langdetect: {code}")
        detector_says_english = str(code).lower() == "en"
    except Exception as e:
        log(f"langdetect failed: {e}")
        detector_says_english = False
    if detector_says_english:
        return True
    pieces = chunk_text(body, limit=MAX_CHUNK_CHARS)
    if not pieces:
        return False
    verdict = llm_is_english(pieces[0])
    log("LLM language detection result: " + ("English" if verdict else "Non-English"))
    return verdict

def translate_chunk(text):
    """Translate ``text`` to legal English, trying Haiku first and Sonnet 4.5 second.

    The first non-empty model reply wins. If both models fail or reply with
    nothing, the original ``text`` is returned unchanged.
    """
    for label, key in (("Anthropic Haiku", "anthropic_haiku_4_5"), ("Anthropic Sonnet", "anthropic_sonnet_4_5")):
        try:
            model = PROFILE_IDS[key]
            request = f"Translate this text to precise legal English. Preserve headings, numbering, and dates. No commentary.\n\n{text}"
            reply = invoke_anthropic_profile(model, request)
            if reply:
                return reply
        except Exception as e:
            log(f"{label} translation failed: {e}")
    return text

def process_file(file_path):
    """Extract text from one file, translate it if it is not English, and save it.

    The result is written to OUT_DIR as ``<stem>.txt``. English documents are
    saved as extracted; others are translated chunk by chunk and joined by
    blank lines.
    """
    log(f"Processing: {file_path.name}")
    markup = file_path.read_text(encoding="utf-8", errors="ignore")
    text = html_to_text(markup)
    if not corpus_is_english(text):
        log("Document is NOT in English. Starting translation process...")
        pieces = chunk_text(text)
        total = len(pieces)
        rendered = []
        for position, piece in enumerate(pieces, start=1):
            log(f"Translating chunk {position}/{total}")
            rendered.append(translate_chunk(piece))
        final_text = "\n\n".join(rendered)
        log("Translation completed successfully.")
    else:
        log("Document is already in English. Skipping translation.")
        final_text = text
    target_dir = Path(OUT_DIR)
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{file_path.stem}.txt"
    target.write_text(final_text, encoding="utf-8")
    log(f"Saved translated text to {target.name}")

def main():
    """Process every eligible file in the directives folder, logging progress."""
    inputs = get_files()
    log(f"Found {len(inputs)} files to process.")
    for source in inputs:
        process_file(source)
    log("All files processed.")

# Entry point when the cell or script is executed directly.
if __name__ == "__main__":
    main()


# Chunk Cleaning

In [50]:
!pip install nltk



In [51]:
from pathlib import Path
from nltk.tokenize import wordpunct_tokenize

def clean_text(text):
    """Tokenise ``text`` and keep only tokens that contain a letter or digit.

    The surviving tokens are joined with single spaces, so punctuation-only
    tokens and the original line structure are removed.
    """
    kept = (token for token in wordpunct_tokenize(text) if any(map(str.isalnum, token)))
    return " ".join(kept)

def process_translations(in_dir="out/translate", out_dir="out/processed"):
    """Clean every .txt file in ``in_dir`` and write it under the same name to ``out_dir``."""
    source_dir = Path(in_dir)
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for txt_file in source_dir.glob("*.txt"):
        original = txt_file.read_text(encoding="utf-8", errors="ignore")
        target = target_dir / txt_file.name
        target.write_text(clean_text(original), encoding="utf-8")

# Entry point when the cell or script is executed directly.
if __name__ == "__main__":
    process_translations()


# Results

In [1]:
import json, time, re
from datetime import datetime
from pathlib import Path
import boto3
from botocore.config import Config

# --- Cell configuration -----------------------------------------------------
# NOTE: several of these names are also assigned by earlier cells; running
# this cell rebinds them for everything that follows.
AWS_REGION = "us-west-2"
# Same path as the default output directory of process_translations() in the
# chunk-cleaning cell.
IN_DIR = "out/processed"
OUT_DIR = "out/results"
# Default `limit` (in characters) used by chunk_text().
MAX_CHUNK_CHARS = 4000

# Model identifiers passed as `modelId` to invoke_model; call_json() tries
# them in the order Haiku 4.5, Sonnet 4.5, Sonnet 4.
PROFILE_IDS = {
    "anthropic_haiku_4_5": "global.anthropic.claude-haiku-4-5-20251001-v1:0",
    "anthropic_sonnet_4_5": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
    "anthropic_sonnet_4": "global.anthropic.claude-sonnet-4-20250514-v1:0"
}

# Bedrock runtime client with a 60 s read timeout and botocore retries (max_attempts=3).
bedrock = boto3.client("bedrock-runtime", region_name=AWS_REGION, config=Config(read_timeout=60, retries={"max_attempts": 3}))

def log(msg):
    """Print ``msg`` with a "[LOG] " prefix."""
    print("[LOG] {}".format(msg))

def pct(n, d):
    """Return ``n / d`` as a percentage rounded to one decimal, or 0 when ``d`` is 0."""
    if d == 0:
        return 0
    return round(100 * n / d, 1)

def chunk_text(text, limit=MAX_CHUNK_CHARS):
    """Split ``text`` into stripped chunks of at most ``limit`` characters.

    When a window does not reach the end of the text, it is shortened to end
    just after the last occurrence of the first separator type (". ", ".\\n",
    "! ", "? " - tried in that order) found more than 200 characters into the
    window. Blank chunks are omitted.
    """
    length = len(text)
    result = []
    cursor = 0
    while cursor < length:
        boundary = min(cursor + limit, length)
        if boundary < length:
            minimum = cursor + 200
            for marker in (". ", ".\n", "! ", "? "):
                found_at = text.rfind(marker, cursor, boundary)
                if found_at > minimum:
                    boundary = found_at + len(marker)
                    break
        segment = text[cursor:boundary].strip()
        if segment:
            result.append(segment)
        cursor = boundary
    return result

def invoke_anthropic_profile(profile_id, user_text, max_tokens=2000, temperature=0.0):
    """Send one user message to an Anthropic model on Bedrock, retrying on failure.

    Up to three attempts are made, with a growing pause between attempts (no
    pause after the last one). A successful response without any content
    block yields "" immediately instead of being retried as an error.

    Args:
        profile_id: Value passed as ``modelId`` to ``invoke_model``.
        user_text: Text of the single user message.
        max_tokens: Output token budget.
        temperature: Sampling temperature.

    Returns:
        The stripped text of the first content block, or "" when every
        attempt fails or the reply carries no text.
    """
    body = {"anthropic_version": "bedrock-2023-05-31","max_tokens": max_tokens,"temperature": temperature,"messages": [{"role": "user", "content": [{"type": "text", "text": user_text}]}]}
    attempts = 3
    for attempt in range(attempts):
        try:
            resp = bedrock.invoke_model(modelId=profile_id, body=json.dumps(body))
            data = json.loads(resp["body"].read())
            blocks = data.get("content") or []
            first = blocks[0] if blocks else {}
            return (first.get("text") or "").strip()
        except Exception as e:
            log(f"LLM call failed (attempt {attempt+1}): {e}")
            # Back off only when another attempt will follow.
            if attempt < attempts - 1:
                time.sleep(1 + attempt)
    return ""

def _normalize_json_text(s):
    if not s: return ""
    s = s.replace("\u2018","'").replace("\u2019","'").replace("\u201C",'"').replace("\u201D",'"').replace("\u00AB",'"').replace("\u00BB",'"').strip()
    s = re.sub(r"^```(?:json)?", "", s, flags=re.IGNORECASE).strip()
    s = re.sub(r"```$", "", s).strip()
    m = re.search(r"[\{\[]", s)
    return s[m.start():] if m else s

def _find_balanced_json(s):
    s = s.strip()
    if not s: return ""
    start = None
    for i,ch in enumerate(s):
        if ch in "{[":
            start = i
            break
    if start is None: return ""
    stack=[];in_str=False;esc=False
    for j in range(start,len(s)):
        ch=s[j]
        if ch=='"' and not esc: in_str=not in_str
        esc=(ch=='\\' and not esc) if in_str else False
        if in_str: continue
        if ch in "{[}": 
            if ch in "{[}": 
                if ch in "{[}": 
                    pass
        elif ch in "}]":
            if not stack: return ""
            top=stack.pop()
            if (top=="{" and ch!="}") or (top=="[" and ch!="]"): return ""
            if not stack: return s[start:j+1]
    return ""

def safe_load_json(raw, expect_object=True):
    """Parse JSON from raw model output, tolerating fences and surrounding prose.

    Attempts, in order:
      1. the text with only Markdown code fences removed - tried first so
         that typographic quotes or guillemets *inside* string values are not
         rewritten into plain quotes, which would break otherwise valid JSON;
      2. the text after ``_normalize_json_text`` (quote normalisation);
      3. the first balanced JSON span of each of those two texts.

    Args:
        raw: Model output.
        expect_object: When true, anything other than a JSON object is rejected.

    Returns:
        The parsed value, or None when nothing parses or the first value that
        parses has the wrong type.
    """
    if not raw: return None

    def _attempt(text):
        # Returns (parsed_ok, value); value is None when the type is rejected.
        if not text: return False, None
        try:
            obj=json.loads(text)
        except Exception:
            return False, None
        if expect_object and not isinstance(obj,dict): return True, None
        return True, obj

    unfenced=re.sub(r"^```(?:json)?","",raw.strip(),flags=re.IGNORECASE).strip()
    unfenced=re.sub(r"```$","",unfenced).strip()
    ok,obj=_attempt(unfenced)
    if ok: return obj
    txt=_normalize_json_text(raw)
    ok,obj=_attempt(txt)
    if ok: return obj
    for source in (unfenced,txt):
        ok,obj=_attempt(_find_balanced_json(source))
        if ok: return obj
    return None

def empty_state():
    """Return a fresh extraction state: ``date`` is None and every list field is empty."""
    list_fields = ("jurisdiction_country", "sector", "activity", "regulatory_domain", "impact_type", "regulator_entity")
    state = {"date": None}
    for name in list_fields:
        state[name] = []
    return state

def empty_date_info():
    """Return a fresh date-tracking record with no date, zero specificity and unlocked."""
    return dict(
        date=None,
        specificity=0,
        evidence_chunk="",
        locked=False,
        law_header="",
    )

def merge_state(state,update):
    """Union the list fields of ``update`` into ``state`` (in place) and return ``state``.

    Model output is treated defensively: a bare string is one value rather
    than a sequence of characters, non-string items are skipped, and anything
    that is not a list/tuple/set is ignored. Values are stripped,
    de-duplicated and sorted case-insensitively, with the exact spelling as a
    tie-breaker so the order is stable between runs.

    Args:
        state: Accumulated state; only its list fields are modified.
        update: Parsed model output, or None.

    Returns:
        The same ``state`` object.
    """
    if not isinstance(update,dict): update={}
    for k in ["jurisdiction_country","sector","activity","regulatory_domain","impact_type","regulator_entity"]:
        seen=set(state.get(k) or [])
        incoming=update.get(k) or []
        if isinstance(incoming,str): incoming=[incoming]
        elif not isinstance(incoming,(list,tuple,set)): incoming=[]
        for v in incoming:
            if not isinstance(v,str): continue
            v=v.strip()
            if v: seen.add(v)
        state[k]=sorted(seen,key=lambda x:(x.lower(),x))
    return state

def build_state_prompt(law_id,prior_state_json,current_date,date_evidence,law_header,chunk_text):
    """Build the prompt that asks the model to extend the extraction state from one chunk.

    The prompt has five blank-line-separated sections: instructions, the law
    header, the current state JSON, the current date context, and the chunk.
    A missing ``current_date`` or ``date_evidence`` is rendered as empty text.
    """
    shown_date = current_date or ""
    shown_evidence = date_evidence or ""
    instructions = "\n".join([
        'Return ONLY a JSON object with keys: "date","jurisdiction_country","sector","activity","regulatory_domain","impact_type","regulator_entity".',
        "Rules:",
        f"- law_id is {law_id} and must NOT appear in output.",
        '- Do not change "date". If you include "date", it MUST equal CURRENT_DATE or be a strictly more specific ISO refinement of the SAME year and month.',
        "- For list fields, add unique strings supported by this chunk; do not remove prior values.",
    ])
    sections = [
        instructions,
        f"LAW_HEADER:\n{law_header}",
        f"CURRENT_STATE:\n{prior_state_json}",
        f"CURRENT_DATE_CONTEXT:\ndate: {shown_date}\nevidence_chunk: {shown_evidence}",
        f"CHUNK:\n{chunk_text}",
    ]
    return "\n\n".join(sections)

def build_date_probe_prompt(law_id,law_header,current_date,current_specificity,chunk_text):
    """Build the prompt that asks the model whether a chunk carries a better date for this law.

    The prompt has three blank-line-separated sections: the JSON template
    with its rules, the law identity, and the current date followed by the
    chunk. A missing ``current_date`` is rendered as empty text.
    """
    known_date = current_date or ""
    instructions = "\n".join([
        'Return ONLY JSON: {"date":"","specificity":0,"is_stronger":false,"same_law":false,"confidence":0.0,"evidence":""}.',
        "- Consider ONLY dates that refer to THIS law (not citations to other instruments).",
        "- same_law: true only if the chunk clearly ties the date to THIS law identified by law_id and header.",
        "- confidence: 0..1 for that judgment.",
        "- specificity: 3=YYYY-MM-DD, 2=YYYY-MM, 1=YYYY, 0=unknown.",
        "- is_stronger: true only if same_law is true AND the candidate is more specific than CURRENT_DATE and same year.",
    ])
    identity = f"law_id: {law_id}\nlaw_header: {law_header}"
    payload = f"CURRENT_DATE: {known_date} (specificity={current_specificity})\nCHUNK:\n{chunk_text}"
    return "\n\n".join([instructions, identity, payload])

def call_json(prompt,expect_object=True,max_tokens=800):
    """Ask models in turn (Haiku 4.5, Sonnet 4.5, Sonnet 4) until one returns parseable JSON.

    Returns the first value accepted by ``safe_load_json``, or None when no
    model produces one.
    """
    ladder = [PROFILE_IDS[key] for key in ("anthropic_haiku_4_5", "anthropic_sonnet_4_5", "anthropic_sonnet_4")]
    for profile in ladder:
        reply = invoke_anthropic_profile(profile, prompt, max_tokens=max_tokens)
        parsed = safe_load_json(reply, expect_object=expect_object)
        if parsed is not None:
            return parsed
    return None

MONTHS={"january":1,"february":2,"march":3,"april":4,"may":5,"june":6,"july":7,"august":8,"september":9,"october":10,"november":11,"december":12}
HEADER_RE=re.compile(r"(DIRECTIVE|REGULATION|DECISION)[^\n]{0,300}?\bOF\b\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})",re.IGNORECASE|re.DOTALL)

def _to_iso_date(s):
    s=s.strip()
    if re.match(r"^\d{4}(-\d{2}(-\d{2})?)?$",s):
        y=int(s[:4])
        if 1950<=y<=datetime.utcnow().year+1: return s
        return ""
    m=re.match(r"^(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$",s)
    if m:
        d=int(m.group(1)); mon=MONTHS.get(m.group(2).lower()); y=int(m.group(3))
        if mon and 1<=d<=31 and 1950<=y<=datetime.utcnow().year+1: return f"{y:04d}-{mon:02d}-{d:02d}"
    return ""

def _header_date(text):
    """Look for an instrument header date in the first 2000 characters of ``text``.

    No-break spaces (U+00A0) and narrow no-break spaces (U+202F) are replaced
    by plain spaces before ``HEADER_RE`` is applied. The captured
    "<day> <month> <year>" text is converted by ``_to_iso_date``.

    Returns the converted date, or "" when no header matches or the
    conversion yields nothing.
    """
    window=text[:2000]
    for odd_space in ("\u00A0","\u202F"):
        window=window.replace(odd_space," ")
    found=HEADER_RE.search(window)
    if found is None:
        return ""
    iso=_to_iso_date(found.group(2))
    return iso if iso else ""

def extract_from_chunks(law_id,text):
    """Process one law chunk by chunk, accumulating extracted state and resolving its date.

    Date resolution:
      * A date found by ``_header_date`` is taken as final (specificity 3, locked).
      * Otherwise each chunk is probed with the model until a date is locked.
        A candidate is accepted only if it validates through ``_to_iso_date``,
        the model reports ``same_law`` and a confidence of at least 0.8.
        A first candidate is locked only when it is a full ``YYYY-MM-DD``
        date; a year or year-month candidate stays unlocked so that a later
        chunk can refine it to a full date with the same prefix.
      * Specificity is derived from the validated candidate's length rather
        than trusted from the model reply.

    State extraction: every chunk is sent with ``build_state_prompt`` and the
    reply merged via ``merge_state``. A differing ``date`` in that reply is
    only logged; the resolved date, if any, is written to ``state["date"]``
    at the end.

    Returns the merged state.
    """
    state=empty_state();date_info=empty_date_info()
    hd=_header_date(text)
    if hd:
        date_info["date"]=hd;date_info["specificity"]=3;date_info["evidence_chunk"]=text[:2000];date_info["locked"]=True
        log(f"Date chosen (header) for {law_id}: {hd}")
    else:
        log(f"No header date found for {law_id}")
    chunks=chunk_text(text)
    if chunks: date_info["law_header"]=chunks[0][:1000]
    total=len(chunks)
    for idx,ch in enumerate(chunks,1):
        log(f"Processing {law_id}: chunk {idx}/{total} ({pct(idx-1,total)}%)")
        if not date_info["locked"]:
            dprobe=call_json(build_date_probe_prompt(law_id,date_info["law_header"],date_info["date"],date_info["specificity"],ch),expect_object=True,max_tokens=320)
            if isinstance(dprobe,dict):
                # Model output is untrusted: coerce defensively so one odd reply cannot abort the run.
                cand=_to_iso_date(str(dprobe.get("date") or "").strip())
                # len 4 -> YYYY, 7 -> YYYY-MM, 10 -> YYYY-MM-DD
                spec={4:1,7:2,10:3}.get(len(cand),0)
                same_law=bool(dprobe.get("same_law"))
                try:
                    conf=float(dprobe.get("confidence") or 0.0)
                except (TypeError,ValueError):
                    conf=0.0
                if cand and same_law and conf>=0.8:
                    if date_info["date"] and len(cand)==10 and len(date_info["date"])<10 and cand.startswith(date_info["date"][:7]):
                        date_info["date"]=cand;date_info["specificity"]=spec;date_info["evidence_chunk"]=ch[:2000];date_info["locked"]=True
                        log(f"Date refined (model) for {law_id}: {cand} (conf {conf})")
                    elif not date_info["date"]:
                        # Lock only a full date; a partial one must stay open or the
                        # refinement branch above could never be reached.
                        date_info["date"]=cand;date_info["specificity"]=spec;date_info["evidence_chunk"]=ch[:2000];date_info["locked"]=len(cand)==10
                        log(f"Date chosen (model) for {law_id}: {cand} (conf {conf})")
        sprompt=build_state_prompt(law_id,json.dumps({**state,"date":date_info["date"]},ensure_ascii=False),date_info["date"],date_info["evidence_chunk"],date_info["law_header"],ch)
        supd=call_json(sprompt,expect_object=True,max_tokens=800)
        if isinstance(supd,dict):
            if supd.get("date") and supd.get("date")!=date_info["date"]:
                log(f"Ignored model date for {law_id}: {supd.get('date')} (kept {date_info['date']})")
            state=merge_state(state,supd)
        log(f"Progress {law_id}: {idx}/{total} ({pct(idx,total)}%)")
    if date_info["date"]:
        state["date"]=date_info["date"]
        log(f"Final date for {law_id}: {date_info['date']}")
    else:
        log(f"No date resolved for {law_id}")
    return state

def write_csv_row(path,row):
    """Write a one-record CSV file (header line plus one data line) to ``path``.

    Parameters:
        path: ``pathlib.Path`` of the output file; parent directories are
            created when missing and an existing file is overwritten.
        row: mapping holding the values for the fixed column set below.
            Missing or ``None`` values become empty cells; list/tuple values
            are joined with ";".

    Fields are emitted through ``csv.writer`` so that values containing
    commas, quotes or newlines are quoted instead of corrupting the row.
    Values without such characters produce the same text as plain
    comma-joining, with "\\n" line endings.
    """
    import csv  # local import keeps the block self-contained
    path.parent.mkdir(parents=True,exist_ok=True)
    headers=["law_id","date","jurisdiction_country","sector","activity","regulatory_domain","impact_type","regulator_entity"]
    def join(v):
        if v is None: return ""
        # str() per item so a non-string entry cannot break the join
        if isinstance(v,(list,tuple)): return ";".join(str(item) for item in v)
        return str(v)
    # newline="" is what the csv module expects; line endings come from lineterminator.
    with path.open("w",encoding="utf-8",newline="") as f:
        writer=csv.writer(f,lineterminator="\n")
        writer.writerow(headers)
        writer.writerow([join(row.get(name)) for name in headers])

def process_all():
    """Run the extraction over every ``*.txt`` file in ``IN_DIR``.

    Files are handled in sorted order. For each one the file stem serves as
    the law id, the text is read as UTF-8 (undecodable bytes ignored), passed
    to ``extract_from_chunks``, and the result is written to
    ``OUT_DIR/<law_id>.csv``. Progress is reported through ``log``.
    """
    source_dir=Path(IN_DIR)
    target_dir=Path(OUT_DIR)
    target_dir.mkdir(parents=True,exist_ok=True)
    files=sorted(source_dir.glob("*.txt"))
    total=len(files)
    log(f"Found {total} files in {IN_DIR}")
    for done,src in enumerate(files,start=1):
        law_id=src.stem
        log(f"Start processing {law_id}")
        text=src.read_text(encoding="utf-8",errors="ignore")
        state=extract_from_chunks(law_id,text)
        # law_id goes first; keys from the extracted state follow (and win on a clash).
        row={"law_id":law_id}
        row.update(state)
        write_csv_row(target_dir/f"{law_id}.csv",row)
        log(f"Completed {law_id} ({pct(done,total)}%)")
    log("All files processed.")

# Entry point: process every input file when this code runs as the main module.
if __name__ == "__main__":
    process_all()


[LOG] Found 5 files in out/processed
[LOG] Start processing 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL
[LOG] Date chosen (header) for 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: 2019-11-27
[LOG] Processing 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: chunk 1/23 (0.0%)
[LOG] Progress 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: 1/23 (4.3%)
[LOG] Processing 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: chunk 2/23 (4.3%)
[LOG] Progress 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: 2/23 (8.7%)
[LOG] Processing 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: chunk 3/23 (8.7%)
[LOG] Progress 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: 3/23 (13.0%)
[LOG] Processing 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: chunk 4/23 (13.0%)
[LOG] Progress 1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROPÉEN ET DU CONSEIL: 4/23 (17.4%)
[LOG] Processing 1

KeyboardInterrupt: 