In [79]:
!pip install sec-parser beautifulsoup4 pandas



In [135]:
import os, json, re, time, hashlib, datetime as dt
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
import pandas as pd
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from slugify import slugify
from rapidfuzz import fuzz, process as rprocess


import sec_parser  

# Dossiers
FILLINGS_PATH = '/home/sagemaker-user/shared/fillings'
CACHE_DIR     = '/home/sagemaker-user/shared/cache/sec_analysis'
os.makedirs(CACHE_DIR, exist_ok=True)

OUTPUT_COLUMNS = [
    "ticker","company","sector","headquarters_country","revenues_total_usd",
    "revenue_by_region_notes","region_exposure_US","region_exposure_Europe",
    "region_exposure_China","region_exposure_India","supply_chain_regions",
    "key_suppliers","key_customers","critical_dependencies","regulatory_dependencies",
    "sanctions_exposure","environmental_regulatory_risk","labor_regulatory_risk",
    "cybersecurity_regulatory_risk","ai_governance_risk","overall_regulatory_risk_score",
    "confidence","sources"
]

COLUMN_ALIASES = {
    "hq_country": "headquarters_country",
    "risk_score": "overall_regulatory_risk_score",
    "confidence_score": "confidence",
    "environment_risk": "environmental_regulatory_risk",
    "labour_risk": "labor_regulatory_risk",
    "it_cyber_risk": "cybersecurity_regulatory_risk",
    "ai_risk": "ai_governance_risk",
    "dependencies": "critical_dependencies",
    "regulatory_exposure": "regulatory_dependencies",
    "us_exposure": "region_exposure_US",
    "eu_exposure": "region_exposure_Europe",
    "china_exposure": "region_exposure_China",
    "india_exposure": "region_exposure_India",
}

def _normalize_keys(d: Dict[str, Any]) -> Dict[str, Any]:
    out = {}
    for k, v in d.items():
        out[COLUMN_ALIASES.get(k, k)] = v
    return out

def read_html_text(fp: str) -> str:
    with open(fp, 'r', encoding='utf-8', errors='ignore') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

def sha1(s: str) -> str:
    import hashlib
    return hashlib.sha1(s.encode('utf-8', errors='ignore')).hexdigest()


In [136]:
def list_companies(fillings_path: str) -> List[str]:
    return sorted([d for d in os.listdir(fillings_path) if os.path.isdir(os.path.join(fillings_path, d))])

def list_html_files(company_dir: str) -> List[str]:
    files = [f for f in os.listdir(company_dir) if f.lower().endswith('.html')]
    # Heuristique: traiter d’abord les fichiers contenant 10-K
    files = sorted(files, key=lambda x: (('10-k' not in x.lower()), x))
    return files

def parse_10k_sections(html_content: str) -> Dict[str, str]:
    try:
        elements = sec_parser.parse(html_content)
        flat = " ".join([getattr(e, "text", "") for e in elements if getattr(e, "text", "")])
        return {"full_text": flat}
    except Exception as e:
        # Fallback to BeautifulSoup if sec_parser fails
        soup = BeautifulSoup(html_content, 'html.parser')
        return {"full_text": soup.get_text(separator=' ', strip=True)}

import re
def split_items_by_regex(text: str) -> Dict[str, str]:
    keys = {
        "Item 1":  re.compile(r'\bitem\s*1\b(?!\s*a)', re.I),
        "Item 1A": re.compile(r'\bitem\s*1a\b', re.I),
        "Item 7": re.compile(r'\bitem\s*7\b', re.I),
    }
    # ancrages
    hits = []
    for k, rx in keys.items():
        for m in rx.finditer(text):
            hits.append((m.start(), k))
    if not hits:
        return {"FULL_TEXT_FALLBACK": text}
    hits.sort()
    out = {k:"" for k in keys}
    for i, (pos, label) in enumerate(hits):
        end = hits[i+1][0] if i+1 < len(hits) else len(text)
        chunk = text[pos:end].strip()
        if len(chunk) > len(out[label]):
            out[label] = chunk
    return out


def load_company_primary_10k(filepath: str) -> Tuple[Dict[str, str], str]:
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        html = f.read()
    sections = parse_10k_sections(html)
    return sections, html


In [137]:
import boto3
import json
import re, os
from slugify import slugify

def _extract_text(resp: dict) -> str:
    msg = resp["output"]["message"]
    pieces = msg.get("content", [])
    text = ""
    for p in pieces:
        if "text" in p:
            text += p["text"]
    return text.strip()

def _strip_code_fences(s: str) -> str:
    s = re.sub(r"^\s*```(?:json)?\s*", "", s)
    s = re.sub(r"\s*```\s*$", "", s)
    return s.strip()

def _best_json_slice(s: str) -> str:
    start = s.find("{")
    if start == -1:
        return s
    stack = 0
    end = -1
    for i, ch in enumerate(s[start:], start=start):
        if ch == "{": stack += 1
        elif ch == "}":
            stack -= 1
            if stack == 0:
                end = i
                break
    return s[start:end+1] if end != -1 else s[start:]

def _sanitize_json_text(s: str) -> str:
    s = _strip_code_fences(s)
    s = _best_json_slice(s)
    s = s.replace("“","\"").replace("”","\"").replace("’","'")
    s = s.replace("`","")
    s = re.sub(r",\s*([\]}])", r"\1", s)                       # virgules traînantes
    s = s.replace("None","null").replace("True","true").replace("False","false")
    s = re.sub(r"\bNaN\b","null", s); s = re.sub(r"\bInfinity\b","null", s, flags=re.I)
    s = re.sub(r"\b-?inf(?:inity)?\b","null", s, flags=re.I)
    return s.strip()

# Configure si besoin (SageMaker a souvent les rôles configurés)
BEDROCK_REGION = os.environ.get("AWS_REGION", "us-west-2")
MODEL_ID       = "anthropic.claude-3-5-sonnet-20240620-v1:0"  # ajuste si nécessaire

brt = boto3.client("bedrock-runtime", region_name=BEDROCK_REGION)

SYSTEM_PROMPT = """You are a senior financial analyst.
From 10-K sections (Item 1 Business, Item 1A Risk Factors, Item 7 MD&A),
produce a compact JSON with this schema (text or numeric fields, use Unknown if unknown):

{
  "ticker": "...",
  "company": "...",
  "sector": "GICS or inferred if present",
  "headquarters_country": "...",
  "revenues_total_usd": 1234567890,
  "revenue_by_region_notes": "Brief text citing regions/shares if available",
  "region_exposure_US": "low/medium/high",
  "region_exposure_Europe": "...",
  "region_exposure_China": "...",
  "region_exposure_India": "...",
  "supply_chain_regions": ["list of key regions/countries if mentioned"],
  "key_suppliers": ["..."],
  "key_customers": ["..."],
  "critical_dependencies": ["e.g., semiconductors, cloud, rare earths, data centers, lithium..."],
  "regulatory_dependencies": ["e.g., GDPR, CSRD, IRA, export controls, data localization..."],
  "sanctions_exposure": "short text if mentioned (countries/sectors)",
  "environmental_regulatory_risk": "low/medium/high",
  "labor_regulatory_risk": "low/medium/high",
  "cybersecurity_regulatory_risk": "low/medium/high",
  "ai_governance_risk": "low/medium/high",
  "overall_regulatory_risk_score": 0-100,
  "confidence": 0-1,
  "sources": ["Item 1", "Item 1A", "Item 7", ...]
}

Remember: be factual; cite only what is discernible in the provided text.
Respond STRICTLY with valid JSON—no prose.
"""

def build_user_prompt(tkr: str, company: str, snippets: Dict[str, str]) -> str:
    parts = [f"TICKER: {tkr}", f"COMPANY: {company}", "EXTRACTS:"]
    for k, v in snippets.items():
        if not v: 
            continue
        # Tronquer au besoin pour contenir le contexte
        vv = v[:20000]  # garde large, ajuste si tu as des erreurs de tokens
        parts.append(f"\n=== {k} ===\n{vv}\n")
    return "\n".join(parts)

@retry(
    reraise=True,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    retry=retry_if_exception_type(Exception)
)

@retry(reraise=True, stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20),
       retry=retry_if_exception_type(Exception))
def call_bedrock_json(prompt: str, debug_tag: str = "unknown") -> Dict[str, Any]:
    response = brt.converse(
        modelId=MODEL_ID,
        messages=[{"role":"user","content":[{"text":prompt}]}],
        system=[{"text": SYSTEM_PROMPT + "\n\nRègle de sortie: réponds UNIQUEMENT par un JSON valide MINIFIÉ, sans markdown."}],
        inferenceConfig={"maxTokens": 2000, "temperature": 0.2, "topP": 0.9}
    )
    txt = _extract_text(response)
    try:
        return json.loads(txt)
    except Exception:
        sani = _sanitize_json_text(txt)
        try:
            return json.loads(sani)
        except Exception:
            # sauvegarde pour debug
            raw1 = os.path.join(CACHE_DIR, f"raw_{slugify(debug_tag)}.txt")
            with open(raw1, "w", encoding="utf-8") as f:
                f.write(txt)
            # 2e tour: on redemande la même réponse, au format JSON strict
            response2 = brt.converse(
                modelId=MODEL_ID,
                messages=[
                    {"role":"user","content":[{"text":prompt}]},
                    {"role":"assistant","content":[{"text":txt[:4000]}]},
                    {"role":"user","content":[{"text":"Reformate exactement la même réponse en JSON valide MINIFIÉ, une seule ligne, sans texte autour."}]},
                ],
                system=[{"text": SYSTEM_PROMPT}],
                inferenceConfig={"maxTokens": 5000, "temperature": 0.1, "topP": 0.9}
            )
            txt2 = _extract_text(response2)
            try:
                return json.loads(txt2)
            except Exception:
                sani2 = _sanitize_json_text(txt2)
                return json.loads(sani2)  # laisser lever si encore invalide



In [138]:
def infer_company_ticker_from_filename(company_dir: str, file: str) -> Tuple[str,str]:
    """
    Essaie de déduire (ticker, company) à partir du chemin.
    - company_dir est le nom de dossier (souvent le ticker, mais pas toujours).
    - le nom de fichier peut contenir la date-form-...-ticker.html
    """
    tkr = company_dir
    comp = company_dir
    base = file.replace('.html','')
    parts = base.split('-')
    if len(parts) >= 5:
        tkr = parts[4] or tkr
    return tkr, comp

def cache_path_for(company: str, ticker: str, filename: str) -> str:
    key = f"{company}|{ticker}|{filename}"
    return os.path.join(CACHE_DIR, f"{slugify(company)}_{slugify(ticker)}_{sha1(key)}.json")

def normalize_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    rec = _normalize_keys(rec)
    # forcer toutes les colonnes attendues, remplir par NA
    out = {c: rec.get(c, None) for c in OUTPUT_COLUMNS}
    # normalisations simples
    if out.get("overall_regulatory_risk_score") is not None:
        try:
            out["overall_regulatory_risk_score"] = float(out["overall_regulatory_risk_score"])
        except:
            pass
    if out.get("confidence") is not None:
        try:
            out["confidence"] = float(out["confidence"])
        except:
            pass
    # sources liste -> string join pour CSV
    if isinstance(out.get("sources"), list):
        out["sources"] = "; ".join(out["sources"])
    # listes -> string
    for listy in ["supply_chain_regions","key_suppliers","key_customers","critical_dependencies","regulatory_dependencies"]:
        if isinstance(out.get(listy), list):
            out[listy] = "; ".join([str(x) for x in out[listy]])
    return out

def process_company_dir(base_dir: str, company: str) -> Optional[Dict[str, Any]]:
    company_path = os.path.join(base_dir, company)
    files = list_html_files(company_path)
    if not files:
        return None
    # On traite le premier 10-K prioritaire
    file = files[0]
    fp   = os.path.join(company_path, file)
    ticker, comp_guess = infer_company_ticker_from_filename(company, file)

    cpath = cache_path_for(company, ticker, file)
    if os.path.exists(cpath):
        with open(cpath, 'r', encoding='utf-8') as f:
            return json.load(f)

    # Charger et parser
    with open(fp, 'r', encoding='utf-8', errors='ignore') as f:
        html_content = f.read()
    sections = parse_10k_sections(html_content)
    # Si pas de sections, fallback texte brut (déjà fait dans parse_10k_sections)

    user_prompt = build_user_prompt(ticker, company, sections)
    try:
        result = call_bedrock_json(user_prompt, debug_tag=f"{ticker}_{company}")
    except Exception:
        # dernier recours: snippet business ou full
        snippet = sections.get("Item 1 - Business") or sections.get("FULL_TEXT_FALLBACK", "")
        try:
            result = call_bedrock_json(
                build_user_prompt(ticker, company, {"Item 1 - Business": snippet[:15000]}),
                debug_tag=f"{ticker}_{company}_fallback"
            )
        except Exception:
            # ⚠️ NE PAS PERDRE LA LIGNE — on écrit une sentinelle
            fail_row = {c: None for c in OUTPUT_COLUMNS}
            fail_row.update({
                "ticker": ticker,
                "company": company,
                "confidence": 0.0,
                "sources": "PARSE_OR_JSON_ERROR"
            })
            with open(cpath, 'w', encoding='utf-8') as f:
                json.dump(fail_row, f, ensure_ascii=False, indent=2)
            return fail_row

    record = normalize_record(result)
    # Remplissage minimal si Claude n'a pas renvoyé ticker/company
    record["ticker"]  = record.get("ticker")  or ticker
    record["company"] = record.get("company") or company

    with open(cpath, 'w', encoding='utf-8') as f:
        json.dump(record, f, ensure_ascii=False, indent=2)
    return record


In [139]:
companies = list_companies(FILLINGS_PATH)
print(f"Compagnies détectées: {len(companies)}")
print("Exemples:", companies[:10])

# ⚠️ Pendant le dev, commence par un petit batch pour valider
SAMPLE = companies[:500]   # élargis ensuite

rows = []
failed = []
for i, company in enumerate(SAMPLE, 1):
    try:
        rec = process_company_dir(FILLINGS_PATH, company)
        rows.append(rec)
        status = "OK" if rec.get("sources") != "PARSE_OR_JSON_ERROR" else "ERROR"
        print(f"[{i}/{len(SAMPLE)}] {company} -> {status}")
        if status == "ERROR":
            failed.append(company)
    except Exception as e:
        print(f"[{i}/{len(SAMPLE)}] {company} -> EXC: {e}")
        failed.append(company)

print("Failed companies:", failed)

df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
print(f"Total lignes: {len(df)}")

# Sauvegardes
out_csv = "/home/sagemaker-user/shared/outputs/sec_matrix.csv"
out_parquet = "/home/sagemaker-user/shared/outputs/sec_matrix.parquet"
Path("/home/sagemaker-user/shared/outputs").mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=False)
df.to_parquet(out_parquet, index=False)
df.head(10)

Compagnies détectées: 500
Exemples: ['A', 'AAPL', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM']
[1/500] A -> OK
[2/500] AAPL -> OK
[3/500] ABBV -> OK
[4/500] ABNB -> OK
[5/500] ABT -> OK
[6/500] ACGL -> OK
[7/500] ACN -> OK
[8/500] ADBE -> OK
[9/500] ADI -> OK
[10/500] ADM -> OK
[11/500] ADP -> OK
[12/500] ADSK -> OK
[13/500] AEE -> OK
[14/500] AEP -> OK
[15/500] AES -> OK
[16/500] AFL -> OK
[17/500] AIG -> OK
[18/500] AIZ -> OK
[19/500] AJG -> OK
[20/500] AKAM -> OK
[21/500] ALB -> OK
[22/500] ALGN -> OK
[23/500] ALL -> OK
[24/500] ALLE -> OK
[25/500] AMAT -> OK
[26/500] AMCR -> OK
[27/500] AMD -> OK
[28/500] AME -> OK
[29/500] AMGN -> OK
[30/500] AMP -> OK
[31/500] AMT -> OK
[32/500] AMZN -> OK
[33/500] ANET -> OK
[34/500] AON -> OK
[35/500] AOS -> OK
[36/500] APA -> OK
[37/500] APD -> OK
[38/500] APH -> OK
[39/500] APO -> OK
[40/500] APTV -> OK
[41/500] ARE -> OK
[42/500] ATO -> OK
[43/500] AVB -> OK
[44/500] AVGO -> OK
[45/500] AVY -> OK
[46/500] AWK -> OK
[47/500] AX

ArrowInvalid: ("Could not convert 'Unknown' with type str: tried to convert to int64", 'Conversion failed for column revenues_total_usd with type object')