In [27]:
# Canada–Chile FTA rules-of-origin annex scraper (fixed)
# pip install requests beautifulsoup4 lxml pandas

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString
from pathlib import Path

URL = "https://www.international.gc.ca/trade-commerce/trade-agreements-accords-commerciaux/agr-acc/chile-chili/fta-ale/annex-d-annexe.aspx?lang=eng"

OUT_RULES_CSV = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo.csv"
OUT_NOTES_CSV = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/annex301_notes.csv"
Path(OUT_RULES_CSV).parent.mkdir(parents=True, exist_ok=True)
Path(OUT_NOTES_CSV).parent.mkdir(parents=True, exist_ok=True)

# ---------------- helpers ----------------
def clean(s: str) -> str:
    """
    Tidy a single text fragment before it is parsed.

    Non-breaking spaces become regular spaces, bullet characters become
    newlines, the 'Top of page' navigation crumb is removed (case-insensitive),
    and surrounding whitespace is trimmed. Falsy input yields an empty string.
    """
    if s:
        # U+00A0 (NBSP) -> space, U+2022 (bullet) -> newline, in one pass.
        swapped = s.translate({0xA0: " ", 0x2022: "\n"})
        return re.sub(r"\bTop of page\b", "", swapped, flags=re.I).strip()
    return ""

def insert_block_newlines(root: BeautifulSoup):
    """
    Mutate ``root`` in place so that text extraction keeps structural breaks.

    Every <br> is replaced by a newline string. For each block-level tag in the
    set below, a newline is inserted immediately before the tag and another is
    appended as the tag's last child. Inline tags are not touched.
    """
    block_tags = {
        "p", "li",
        "tr", "thead", "tbody", "tfoot", "table",
        "div", "section", "article", "header", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "dt", "dd",
    }

    # Explicit line breaks first.
    for line_break in root.find_all("br"):
        line_break.replace_with("\n")

    # Then surround each block element with newline markers.
    for block in root.find_all(block_tags):
        leading = NavigableString("\n")
        trailing = NavigableString("\n")
        block.insert_before(leading)
        block.append(trailing)

def node_text_with_breaks(node):
    """
    Return the text of ``node`` with newline-separated fragments.

    The node is first passed through ``insert_block_newlines`` (which mutates
    it), then its strings are joined with a newline separator.
    """
    insert_block_newlines(node)
    separator = "\n"
    return node.get_text(separator)

def strip_footnotes_from_dom(dom: BeautifulSoup):
    """
    Delete footnote markup from ``dom`` in place.

    Two passes: first every element matching one of the footnote CSS selectors
    is decomposed; then any remaining <a>, <span> or <sup> whose whole text is
    a 1-3 digit number is decomposed as well.
    """
    footnote_selectors = (
        "sup",
        'a[role="doc-noteref"]',
        'a[href*="#fn"]',
        'a[href*="#footnote"]',
        ".footnote",
        ".footnotes",
        ".noteref",
        ".note-ref",
        ".xref",
    )
    for css in footnote_selectors:
        # Snapshot the matches before mutating the tree.
        for hit in [node for node in dom.select(css)]:
            if isinstance(hit, Tag):
                hit.decompose()

    # Second pass: bare numeric bubbles such as <a>12</a>.
    digits_only = re.compile(r"\d{1,3}")
    for candidate in list(dom.find_all(["a", "span", "sup"])):
        if not isinstance(candidate, Tag):
            continue
        label = candidate.get_text(strip=True) or ""
        if digits_only.fullmatch(label):
            candidate.decompose()

def strip_inline_footnote_numbers(text: str) -> str:
    """
    Remove inline footnote numerals from ``text`` without harming HS patterns.

    Four substitutions are applied in order:
      1. Unicode superscript digits directly after a word character. This
         covers U+2070-U+2079 and the Latin-1 superscripts U+00B9, U+00B2 and
         U+00B3 (the glyphs for 1, 2 and 3), which live outside that range.
      2. A parenthesised 1-3 digit number after a word character, e.g. ``word (12)``.
      3. A bracketed 1-3 digit number after a word character, e.g. ``word [12]``.
      4. A standalone 1-3 digit number that follows a single-letter word
         (e.g. ``A 12 change``); the number and its leading whitespace collapse
         to one space. Numbers after longer words (``Chapter 1``) are left alone.

    Returns the cleaned string.
    """
    # U+00B2/U+00B3/U+00B9 are listed explicitly: they are NOT part of U+2070-U+2079.
    text = re.sub(r"(?<=\w)[\u00b2\u00b3\u00b9\u2070-\u2079]+", "", text)  # superscripts
    text = re.sub(r"(?<=\w)\s*\((\d{1,3})\)", "", text)              # (...) after word
    text = re.sub(r"(?<=\w)\s*\[(\d{1,3})\]", "", text)              # [...] after word
    text = re.sub(r"(?<=\b[A-Za-z])\s+(?<!\.)\b(\d{1,3})\b(?![\d\.])", " ", text)  # loose small number
    return text

def normalize_hs_notation(s: str) -> str:
    """
    Rewrite HS codes in ``s`` into dotted form (e.g. 01.01, 03.05.71,
    11.03.11-11.03.13, 03.04-03.06).

    The substitutions below run strictly in the listed order:
      1. ``dddd.dd - dddd.dd`` ranges        -> ``dd.dd.dd-dd.dd.dd``
      2. ``0ddd.dd - 0ddd.dd`` ranges        -> ``0d.dd.dd-0d.dd.dd``
      3. ``0ddd - 0ddd`` ranges              -> ``0d.dd-0d.dd``
      4. single ``dddd.dd``                  -> ``dd.dd.dd``
      5. single ``0ddd.dd``                  -> ``0d.dd.dd``
      6. single zero-padded ``0ddd`` heading -> ``0d.dd``
    Undotted four-digit numbers that do not start with 0 are left unchanged.
    """
    def _six_digit(heading: str, sub: str) -> str:
        # '8523', '52' -> '85.23.52'
        return f"{heading[:2]}.{heading[2:]}.{sub}"

    substitutions = (
        (
            r"\b(\d{4})\.(\d{2})\s*-\s*(\d{4})\.(\d{2})\b",
            lambda m: f"{_six_digit(m.group(1), m.group(2))}-{_six_digit(m.group(3), m.group(4))}",
        ),
        (r"\b(0\d)(\d{2})\.(\d{2})\s*-\s*(0\d)(\d{2})\.(\d{2})\b", r"\1.\2.\3-\4.\5.\6"),
        (r"\b(0\d)(\d{2})\s*-\s*(0\d)(\d{2})\b", r"\1.\2-\3.\4"),
        (r"\b(\d{4})\.(\d{2})\b", lambda m: _six_digit(m.group(1), m.group(2))),
        (r"\b(0\d)(\d{2})\.(\d{2})\b", r"\1.\2.\3"),
        (r"\b(0\d)(\d{2})\b", r"\1.\2"),
    )
    result = s
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

# ---------------- structure regex (FIXED) ----------------
# Allow “Section I Live Animals; …” with or without a dash/colon
# group(1) = Roman numeral, group(2) = remaining title text (may be empty).
RE_SECTION   = re.compile(r"^Section\s+([IVXLC]+)(?:\s*[-–—:]\s*)?(.*)$", re.I)
# Allow “Chapter 1: Live Animals / Chapter 1 - Live Animals / Chapter 1 Live Animals”
# group(1) = optional "Ex " prefix, group(2) = chapter number, group(3) = chapter title.
RE_CHAPTER   = re.compile(r"^(Ex\s+)?Chapter\s+(\d+)\s*[:.\-]?\s*(.+)$", re.I)

# Supports extended item codes such as 1901.10.aa or 1901.10.aa-1901.10.ab.
# Matches a standard dotted HS code with an optional item suffix (.aa, .bb, .01a, etc.),
# optionally followed by "-<second code>", then whitespace and the rule text.
# Named groups: "hs" (code or range) and "rule" (rest of the line).
RE_RULE = re.compile(
    r"^(?P<hs>(\d{2}\.\d{2}(?:\.\d{2})?(?:\.[A-Za-z0-9]{1,4})?)(\s*-\s*\d{2}\.\d{2}(?:\.\d{2})?(?:\.[A-Za-z0-9]{1,4})?)?)\s+(?P<rule>.+?)\s*$",
    re.I,
)

# Same HS code/range shape as RE_RULE, but the line contains nothing else.
RE_HS_ONLY = re.compile(
    r"^(?P<hs>(\d{2}\.\d{2}(?:\.\d{2})?(?:\.[A-Za-z0-9]{1,4})?)(\s*-\s*\d{2}\.\d{2}(?:\.\d{2})?(?:\.[A-Za-z0-9]{1,4})?)?)\s*$",
    re.I,
)


# A line that begins a rule sentence ("A change ..." / "A change to ...").
RE_RULE_TEXT = re.compile(r"^A\s+change(?:\s+to)?\b.+$", re.I)
# "Note:" or "Note 3:"; group(1) = optional number, group(2) = text after the colon.
RE_NOTE      = re.compile(r"^Note\s*(\d+)?\s*:\s*(.*)$", re.I)

# ---------------- fetch & parse ----------------
# Download the page (HTTP errors raise) and pick <main> when present,
# otherwise fall back to the whole document.
resp = requests.get(URL, timeout=60)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")
main = soup.select_one("main")
if not main:
    main = soup

# Step 1: remove footnote markup from the DOM before it is flattened to text.
strip_footnotes_from_dom(main)

# Step 2: flatten to text with structural line breaks, collapsing runs of newlines.
text_all = re.sub(r"\n{2,}", "\n", node_text_with_breaks(main))

# Step 3: clean every line and keep only those with visible content.
lines = []
for fragment in text_all.split("\n"):
    cleaned_fragment = clean(strip_inline_footnote_numbers(fragment))
    if cleaned_fragment.strip():
        lines.append(cleaned_fragment)

# ---------------- parse state ----------------
# Module-level state shared by the flush helpers and the main parsing loop.
rules_rows = []   # one dict per output row of the rules CSV (chapter header rows included)
notes_rows = []   # one dict per output row of the notes CSV

# Most recent structural context seen while scanning lines.
current_section_title = ""
current_ch_label = ""
current_ch_no = ""   # e.g., "61"
current_ch_name = ""

# Chapter-level Note accumulation (numbered notes under a chapter header)
chapter_note_buf = []           # text fragments of the note being collected
waiting_chapter_note = False    # True while lines are being absorbed into chapter_note_buf
chapter_note_num = None  # e.g., "1","2"

# Item-level Note accumulation (notes attached to a specific HS line)
item_note_buf = []              # text fragments of the note being collected
waiting_item_note = False       # True while lines are being absorbed into item_note_buf
item_note_hs = ""               # HS code/range the buffered item note belongs to

# Markers for the last rule/hs we appended, to support continuation lines
last_rule_idx = None   # index into rules_rows of the most recently appended rule row
last_rule_hs = None    # HS code of that rule row
last_hs_seen = None    # most recent HS code encountered (with or without a rule)

# Tracks if we just saw a standalone 'or.' line needing to be joined
pending_or_continuation = False

# Deduplicate chapter header insertions (some pages repeat headers)
seen_chapter_headers = set()

def flush_chapter_note():
    """
    Write the buffered chapter-level note to ``notes_rows`` and empty the buffer.

    A row is emitted only when a chapter label is set and the buffer is
    non-empty; the buffer is emptied in either case.
    """
    if current_ch_label and chapter_note_buf:
        joined_text = " ".join(chapter_note_buf).strip()
        record = dict(
            scope_type="chapter",
            scope_id=current_ch_label,        # e.g., "Chapter 61"
            chapter_name=current_ch_name,
            note_number=chapter_note_num or "",
            note_text=joined_text,
        )
        notes_rows.append(record)
    del chapter_note_buf[:]

def flush_item_note():
    """
    Write the buffered HS-level note to ``notes_rows`` and empty the buffer.

    A row is emitted only when both the buffer and ``item_note_hs`` are
    non-empty; the buffer is emptied in either case.
    """
    if item_note_buf and item_note_hs:
        joined_text = " ".join(item_note_buf).strip()
        record = dict(
            scope_type="item",
            scope_id=item_note_hs,           # e.g., "62.05.20-62.05.30"
            chapter_name=current_ch_name,
            note_number="",                  # item notes carry no number here
            note_text=joined_text,
        )
        notes_rows.append(record)
    del item_note_buf[:]

# Single pass over the cleaned lines. Each line is offered to the handlers below
# in priority order; a handler that consumes the line ends the iteration with
# `continue`, otherwise the line falls through to the next handler.
#
# Changes relative to the previous version:
#   * "Ex Chapter NN" labels are built with a space (they used to read "ExChapter NN").
#   * Starting a new Note flushes the note currently being buffered, so consecutive
#     "Note 1: ... Note 2: ..." blocks become separate rows instead of one merged row
#     carrying only the last note number.
#   * Section/Chapter headers reset the note-accumulation flags, so plain text after a
#     header is no longer absorbed into a note that has already been flushed.
#   * A no-op guard and two unused locals were removed.
for raw in lines:
    line = normalize_hs_notation(raw)

    # ---- Section header ----
    m = RE_SECTION.match(line)
    if m:
        flush_item_note()
        flush_chapter_note()
        # A header terminates any note being accumulated.
        waiting_chapter_note = False
        waiting_item_note = False
        current_section_title = f"Section {m.group(1)} - {m.group(2).strip()}" if m.group(2) else f"Section {m.group(1)}"
        last_rule_idx = None
        pending_or_continuation = False
        continue

    # ---- Chapter header ----
    m = RE_CHAPTER.match(line)
    if m:
        flush_item_note()
        flush_chapter_note()
        # A header terminates any note being accumulated.
        waiting_chapter_note = False
        waiting_item_note = False
        chapter_note_num = None
        current_ch_no = m.group(2).strip().zfill(2)
        current_ch_name = m.group(3).strip()
        # Keep a space between the optional "Ex" prefix and "Chapter".
        current_ch_label = f"{'Ex ' if m.group(1) else ''}Chapter {current_ch_no}"
        header_key = (
            re.sub(r"\s+", " ", current_ch_label.lower()),
            re.sub(r"\s+", " ", current_ch_name.lower())
        )
        # Insert one structural row per chapter to retain chapter boundaries.
        if header_key not in seen_chapter_headers:
            rules_rows.append({
                "Chapter/heading/ sub-heading": current_ch_label,
                "Description": current_ch_name,
                "Product Specific Rule of Origin": "",
                "needs_manual_check": 0
            })
            seen_chapter_headers.add(header_key)
        last_rule_idx = None
        last_rule_hs = None
        last_hs_seen = None
        pending_or_continuation = False
        continue

    # ---- Chapter Note start (numbered Notes right after a chapter header) ----
    m = RE_NOTE.match(line)
    if m and current_ch_label and not last_hs_seen:
        # Commit the previous note (if any) under its own number before starting a new one.
        flush_chapter_note()
        waiting_chapter_note = True
        chapter_note_num = m.group(1)
        chapter_note_buf.append(m.group(2).strip())
        continue

    # ---- Chapter Note accumulation mode ----
    if waiting_chapter_note:
        # A new structural header or another Note ends the current note.
        if RE_SECTION.match(line) or RE_CHAPTER.match(line) or RE_NOTE.match(line):
            flush_chapter_note()
            waiting_chapter_note = False
            # fall through to handle this line again
        else:
            # If the probe is an HS within the current chapter, we close the note.
            m_probe = RE_RULE.match(line) or RE_HS_ONLY.match(line)
            if m_probe:
                hs_probe = m_probe.group("hs")
                if hs_probe[:2] == current_ch_no:
                    flush_chapter_note()
                    waiting_chapter_note = False
                else:
                    # HS code from another chapter: treat it as part of the note text.
                    chapter_note_buf.append(line.strip())
                    continue
            else:
                chapter_note_buf.append(line.strip())
                continue

    # ---- HS-only line (rule text comes on a subsequent line) ----
    m = RE_HS_ONLY.match(line)
    if m:
        flush_item_note()
        last_hs_seen = m.group("hs").replace(" ", "")
        last_rule_hs = None
        last_rule_idx = None
        continue

    # ---- Item-level Note start (a Note immediately following an HS-only line) ----
    m_item = RE_NOTE.match(line)
    if m_item and last_hs_seen:
        # Commit any item note already buffered (under its own HS) before starting a new one.
        flush_item_note()
        waiting_item_note = True
        item_note_hs = last_hs_seen
        item_note_buf.append(m_item.group(2).strip())
        continue

    # ---- Item-level Note accumulation mode ----
    if waiting_item_note:
        # A new structural header or Note terminates the current item note.
        if RE_SECTION.match(line) or RE_CHAPTER.match(line) or RE_NOTE.match(line):
            flush_item_note()
            waiting_item_note = False
        else:
            # If the next line is the actual rule text for this HS, commit it and mark as manual check.
            if RE_RULE_TEXT.match(line) and item_note_hs:
                flush_item_note()
                waiting_item_note = False
                rule_text = line.strip().rstrip(".")
                rules_rows.append({
                    "Chapter/heading/ sub-heading": item_note_hs,
                    "Description": current_ch_name,
                    "Product Specific Rule of Origin": rule_text,
                    "needs_manual_check": 1
                })
                last_rule_idx = len(rules_rows) - 1
                last_rule_hs = item_note_hs
                continue
            # If we are moving on to a new HS or a combined HS+rule line, close the item note.
            m_next_hs = RE_HS_ONLY.match(line) or RE_RULE.match(line)
            if m_next_hs:
                flush_item_note()
                waiting_item_note = False
            else:
                # Otherwise keep accumulating the item note.
                item_note_buf.append(line.strip())
                continue

    # ---- Standalone "or." line: append to the last rule's text ----
    if re.fullmatch(r"or\.?", line, flags=re.I) and last_rule_idx is not None:
        rules_rows[last_rule_idx]["Product Specific Rule of Origin"] += " or"
        pending_or_continuation = True
        continue

    # ---- Continuation right after an "or." line ----
    if pending_or_continuation:
        # Only treat as continuation if the line is not a new structural/HS/rule start.
        if not (RE_RULE.match(line) or RE_HS_ONLY.match(line) or RE_SECTION.match(line) or RE_CHAPTER.match(line) or RE_NOTE.match(line)):
            rules_rows[last_rule_idx]["Product Specific Rule of Origin"] += " " + line.strip()
            pending_or_continuation = False
            continue
        else:
            pending_or_continuation = False

    # ---- If we just appended a rule, absorb plain text until the next structure token ----
    if last_rule_idx is not None:
        if not (RE_RULE.match(line) or RE_HS_ONLY.match(line) or RE_SECTION.match(line) or RE_CHAPTER.match(line) or RE_NOTE.match(line)):
            rules_rows[last_rule_idx]["Product Specific Rule of Origin"] += " " + line.strip()
            continue

    # ---- Rule line: HS and rule present on the same line ----
    m = RE_RULE.match(line)
    if m:
        flush_item_note()
        hs = m.group("hs").replace(" ", "")
        rule_text = m.group("rule").strip().rstrip(".")
        rules_rows.append({
            "Chapter/heading/ sub-heading": hs,
            "Description": current_ch_name,
            "Product Specific Rule of Origin": rule_text,
            "needs_manual_check": 0
        })
        last_rule_idx = len(rules_rows) - 1
        last_rule_hs = hs
        last_hs_seen = hs
        continue

    # ---- Rule text line with no HS (typically follows an HS-only line) ----
    if RE_RULE_TEXT.match(line) and last_hs_seen:
        flush_item_note()
        rule_text = line.strip().rstrip(".")
        rules_rows.append({
            "Chapter/heading/ sub-heading": last_hs_seen,
            "Description": current_ch_name,
            "Product Specific Rule of Origin": rule_text,
            "needs_manual_check": 0
        })
        last_rule_idx = len(rules_rows) - 1
        last_rule_hs = last_hs_seen
        continue

# Flush whatever note text is still buffered once the input is exhausted
# (item note first, then chapter note, so row order is preserved).
flush_item_note()
flush_chapter_note()

# ---- Build DataFrames and persist CSVs ----
# Guarantee the manual-check flag exists on every rule row.
for r in rules_rows:
    if "needs_manual_check" not in r:
        r["needs_manual_check"] = 0

rules_df = pd.DataFrame(
    rules_rows,
    columns=[
        "Chapter/heading/ sub-heading",
        "Description",
        "Product Specific Rule of Origin",
        "needs_manual_check",
    ],
)
notes_df = pd.DataFrame(
    notes_rows,
    columns=[
        "scope_type",       # 'chapter' or 'item'
        "scope_id",         # chapter label or HS code/range
        "chapter_name",     # title of the chapter the note was found in
        "note_number",      # set for numbered chapter notes; empty string for item notes
        "note_text",
    ],
)

# Write both tables without the pandas index column.
for frame, destination in ((rules_df, OUT_RULES_CSV), (notes_df, OUT_NOTES_CSV)):
    frame.to_csv(destination, index=False)
print(f"Saved rules: {OUT_RULES_CSV}  ({len(rules_df)} rows)")



Saved rules: /Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo.csv  (1182 rows)


In [2]:
# Extract the notes into a separate file
# -*- coding: utf-8 -*-
# Extract ALL notes from the Canada–Chile FTA rules-of-origin annex with correct scope binding
# pip install requests beautifulsoup4 lxml pandas

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup, Tag
from pathlib import Path

URL = "https://www.international.gc.ca/trade-commerce/trade-agreements-accords-commerciaux/agr-acc/chile-chili/fta-ale/annex-d-annexe.aspx?lang=eng"
OUT_NOTES_CSV = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/notes.csv"
Path(OUT_NOTES_CSV).parent.mkdir(parents=True, exist_ok=True)

# ---------- Regex ----------
# All patterns tolerate leading whitespace and are case-insensitive unless noted.
# Section header: group(1) = Roman numeral, group(2) = remaining title text (may be empty).
RE_SECTION   = re.compile(r"^\s*Section\s+([IVXLC]+)\b(?:\s*[-–—:]\s*)?(.*)$", re.I)
# Chapter header: optional "Ex " prefix is matched but not captured;
# group(1) = chapter number, group(2) = chapter title.
RE_CHAPTER   = re.compile(r"^\s*(?:Ex\s+)?Chapter\s+(\d+)\s*[:.\-]?\s*(.+)$", re.I)
# "Note:" or "Note 3:"; group(1) = optional number, group(2) = text after the colon.
RE_NOTE      = re.compile(r"^\s*Note\s*(\d+)?\s*:\s*(.*)$", re.I)

# HS header patterns (support 01.01 / 01.01.10 / 01.01-01.06 / 01.01.10-01.01.90)
# group(1) = first code, group(2) = optional end of range. Matches only at the start of the text.
RE_HS_HEAD   = re.compile(r"^\s*(\d{2}\.\d{2}(?:\.\d{2})?)(?:\s*-\s*(\d{2}\.\d{2}(?:\.\d{2})?))?\b")
# Rule-text start (used to stop absorbing lines into a Note block)
RE_RULE_TEXT = re.compile(r"^\s*A\s+change(?:\s+to)?\b", re.I)

# Tags considered as block-level when linearizing content
BLOCK_TAGS = ("h1","h2","h3","h4","h5","p","li","div","dt","dd")

# ---------- Helpers ----------
def strip_footnotes_in_dom(root: Tag):
    """
    Delete footnote markup from ``root`` in place.

    First every element matching one of the footnote CSS selectors is
    decomposed; afterwards any remaining <a>, <span> or <sup> whose entire
    text is a 1-3 digit number is decomposed too.
    """
    footnote_selectors = (
        "sup",
        'a[role="doc-noteref"]',
        'a[href*="#fn"]',
        'a[href*="#footnote"]',
        ".footnote",
        ".footnotes",
        ".noteref",
        ".note-ref",
        ".xref",
    )
    for css in footnote_selectors:
        # Snapshot matches before mutating the tree.
        doomed = list(root.select(css))
        for node in doomed:
            node.decompose()

    # Numeric-only bubbles such as <a>12</a> or <span>3</span>.
    digits_only = re.compile(r"\d{1,3}")
    for node in list(root.find_all(["a", "span", "sup"])):
        label = node.get_text(strip=True) or ""
        if digits_only.fullmatch(label):
            node.decompose()

def block_text(el: Tag) -> str:
    """
    Return the normalized text of one block element.

    Side effect: <br> tags inside ``el`` are replaced by newline strings.
    The extracted text then has NBSP mapped to a space and bullets mapped to
    newlines, the 'Top of page' crumb removed, runs of spaces/tabs collapsed
    to one space, runs of newlines collapsed to one newline, and outer
    whitespace trimmed.
    """
    for line_break in el.find_all("br"):
        line_break.replace_with("\n")

    raw = el.get_text("\n", strip=True)
    # U+00A0 (NBSP) -> space, U+2022 (bullet) -> newline.
    result = raw.translate({0xA0: " ", 0x2022: "\n"})
    result = re.sub(r"\bTop of page\b", "", result, flags=re.I)
    for pattern, replacement in ((r"[ \t]+", " "), (r"\n{2,}", "\n")):
        result = re.sub(pattern, replacement, result)
    return result.strip()

# ---------- Fetch ----------
# Fail fast on HTTP errors (4xx/5xx) instead of silently parsing an error page,
# which would produce an empty or misleading notes CSV.
resp = requests.get(URL, timeout=60)
resp.raise_for_status()
html = resp.text
soup = BeautifulSoup(html, "lxml")
# Prefer the <main> element; fall back to the whole document when it is absent.
main = soup.select_one("main") or soup
strip_footnotes_in_dom(main)

# ---------- Linearize content as blocks ----------
# We extract block-level text segments to preserve structural boundaries.
# NOTE(review): find_all returns nested matches too (e.g. a <div> and the <p>
# elements inside it), so the same text may appear in more than one block —
# confirm against the page structure.
blocks = []
for el in main.find_all(BLOCK_TAGS):
    t = block_text(el)
    if t:
        blocks.append(t)

# ---------- Traverse & bind notes ----------
# Output accumulator: one dict per note, appended by emit_note().
notes = []

# Current context (used to bind a Note to Section/Chapter/Item scope)
current_section_label = ""          # e.g. "Section II"
current_section_title = ""          # text following the section numeral, if any
seen_something_in_section = False   # set once any chapter/HS/rule/plain block follows a section header

current_chapter_label = ""          # e.g. "Chapter 05"
current_chapter_name  = ""          # chapter title text
seen_hs_in_chapter    = False       # True once an HS header has been seen in the current chapter

last_hs_range = ""                  # most recent HS code or range, "" when none in scope
last_token_type = ""  # 'section' | 'chapter' | 'hs' | 'rule' | 'note' | 'other'

def normalize_hs_range(start_hs: str, end_hs: str | None) -> str:
    """Return a consistent HS range string, e.g., '05.01-05.11' or '05.01'."""
    return f"{start_hs}-{end_hs}" if end_hs else start_hs

def emit_note(scope_type, scope_id, scope_title, note_no, note_text, hs_range=""):
    """
    Append one note record to the module-level ``notes`` list.

    The record combines the arguments with the current section/chapter context
    read from module globals at call time. ``note_no`` may be None or empty;
    it and ``note_text`` are stored stripped of surrounding whitespace.
    """
    number = (note_no or "").strip()    # empty for unnumbered notes
    record = dict(
        scope_type=scope_type,                  # section | chapter | item
        scope_id=scope_id,                      # e.g., Section II / Chapter 05 / 05.01-05.11
        scope_title=scope_title,                # section or chapter title, if present
        section_label=current_section_label,    # enclosing section at call time
        chapter_label=current_chapter_label,    # enclosing chapter at call time
        chapter_name=current_chapter_name,      # chapter title at call time
        hs_range=hs_range,                      # filled by callers for item-scope notes
        note_number=number,
        note_text=note_text.strip(),
    )
    notes.append(record)

# Re-initialise the chapter label before traversal (it is also initialised with the other state above).
current_chapter_label = ""

# Walk the linearized blocks with an explicit index so that a Note can consume
# several following blocks and then jump past them (i = j).
# Each block is classified by the first pattern that matches, in this order:
# Section -> Chapter -> HS header -> rule text -> Note -> plain text.
i = 0
while i < len(blocks):
    line = blocks[i]

    # Section header
    m = RE_SECTION.match(line)
    if m:
        current_section_label = f"Section {m.group(1)}"
        current_section_title = (m.group(2) or "").strip()
        seen_something_in_section = False

        # Reset Chapter/HS context when entering a new Section
        current_chapter_label = ""
        current_chapter_name  = ""
        seen_hs_in_chapter    = False
        last_hs_range         = ""
        last_token_type       = "section"
        i += 1
        continue

    # Chapter header
    m = RE_CHAPTER.match(line)
    if m:
        ch_no  = m.group(1).zfill(2)  # zero-pad single-digit chapter numbers ("5" -> "05")
        current_chapter_label = f"Chapter {ch_no}"
        current_chapter_name  = m.group(2).strip()
        seen_something_in_section = True
        seen_hs_in_chapter = False
        last_hs_range = ""  # HS context does not carry over into a new chapter
        last_token_type = "chapter"
        i += 1
        continue

    # HS header (records the most recent HS range context)
    m = RE_HS_HEAD.match(line)
    if m:
        start_hs = m.group(1)
        end_hs   = m.group(2)  # None when the block holds a single code rather than a range
        last_hs_range = normalize_hs_range(start_hs, end_hs)
        seen_something_in_section = True
        seen_hs_in_chapter = True
        last_token_type = "hs"
        i += 1
        continue

    # Rule line (used only as a boundary so Notes don't consume rule paragraphs)
    if RE_RULE_TEXT.match(line):
        seen_something_in_section = True
        last_token_type = "rule"
        i += 1
        continue

    # Note block (may span multiple following lines until next structural token)
    m = RE_NOTE.match(line)
    if m:
        note_no = m.group(1) or ""
        first = (m.group(2) or "").strip()
        buf = [first] if first else []

        # Consume subsequent lines into this note until a new structure boundary is encountered.
        j = i + 1
        while j < len(blocks):
            probe = blocks[j]
            if (RE_NOTE.match(probe) or RE_SECTION.match(probe) or RE_CHAPTER.match(probe) 
                or RE_HS_HEAD.match(probe) or RE_RULE_TEXT.match(probe)):
                break
            buf.append(probe.strip())
            j += 1

        # Fragments are joined with single spaces; block boundaries are not preserved in the note text.
        note_text = " ".join(buf).strip()

        # Scope binding priority:
        #   1) Section-level: immediately after a Section, before any Chapter/HS
        #   2) Chapter-level: first notes after a Chapter, before any HS
        #   3) Item-level: adjacent to the most recent HS
        #   4) Fallback: Chapter if available, else Section
        if last_token_type == "section" and not current_chapter_label and not last_hs_range:
            emit_note(
                scope_type="section",
                scope_id=current_section_label,
                scope_title=current_section_title,
                note_no=note_no,
                note_text=note_text
            )

        elif last_token_type == "chapter" and not seen_hs_in_chapter:
            emit_note(
                scope_type="chapter",
                scope_id=current_chapter_label,
                scope_title=current_chapter_name,
                note_no=note_no,
                note_text=note_text
            )

        elif last_hs_range and last_token_type in ("hs", "note"):
            # Treat as item-level when the closest anchor is an HS (or we just handled a note near HS)
            emit_note(
                scope_type="item",
                scope_id=last_hs_range,
                scope_title=current_chapter_name,
                note_no=note_no,
                note_text=note_text,
                hs_range=last_hs_range
            )

        else:
            # Conservative fallback: prefer Chapter if available, else Section.
            # This branch is also reached when the previous token was a rule or plain text.
            if current_chapter_label:
                emit_note(
                    scope_type="chapter",
                    scope_id=current_chapter_label,
                    scope_title=current_chapter_name,
                    note_no=note_no,
                    note_text=note_text
                )
            else:
                emit_note(
                    scope_type="section",
                    scope_id=current_section_label,
                    scope_title=current_section_title,
                    note_no=note_no,
                    note_text=note_text
                )

        last_token_type = "note"
        i = j  # resume at the block that terminated the note
        continue

    # Plain text (ignored for note-extraction; used only to mark activity)
    seen_something_in_section = True
    last_token_type = "other"
    i += 1

# ---------- Save ----------
# Assemble the collected note records into a table with a fixed column order,
# write it without the index, and report the row count.
df = pd.DataFrame(
    notes,
    columns=[
        "scope_type",
        "scope_id",
        "scope_title",
        "section_label",
        "chapter_label",
        "chapter_name",
        "hs_range",
        "note_number",
        "note_text",
    ],
)
df.to_csv(OUT_NOTES_CSV, index=False)
print(f"✅ Saved notes: {OUT_NOTES_CSV}  ({len(df)} rows)")


✅ Saved notes: /Users/tianqixu/Downloads/PTA-task/Canada-Chile/notes.csv  (45 rows)


In [41]:
# ==============================================================
# 1️⃣ Improved rule-splitting utilities (avoid splitting list items)
# ==============================================================

import re
import pandas as pd

# Separator between alternative rules. A match is one of:
#   * '; or' at the end of a line,
#   * '; or' followed by the start of a new full rule sentence,
#   * a standalone 'or' + newline(s) followed by the start of a new full rule sentence.
SPLIT_PATTERN = re.compile(
    r"""
    ;\s*or\s*(?=\n)                                              # line ends with '; or' + newline
    |;\s*or\s+(?=(A\s+change|No\s+required\s+change|Provided\b|A\s+regional\s+value\s+content\b))  # '; or' + next full rule
    |\bor\s*\n+(?=(A\s+change|No\s+required\s+change|Provided\b|A\s+regional\s+value\s+content\b))           # standalone 'or' + next full rule
    """,
    re.IGNORECASE | re.VERBOSE
)

def tidy(s: str) -> str:
    """
    Normalize whitespace and newlines for stable pattern matching.

    CRLF/CR become LF, runs of spaces/tabs collapse to one space, runs of
    newlines collapse to one, and leading/trailing spaces, newlines and
    semicolons are stripped. Non-string input yields an empty string.
    """
    if not isinstance(s, str):
        return ""
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{2,}", "\n", s)
    return s.strip(" \n;")

def split_rule_text(text: str, max_splits: int = 3):
    """
    Split one rule string into alternative rules.

    Parameters
    ----------
    text : the raw rule text; non-string or blank input yields all-empty parts.
    max_splits : maximum number of non-empty parts to produce (default 3:
        main_rule, alt_rule, alt_rule_2). Text beyond the last allowed split
        stays attached to the final part, so nothing is discarded.

    Returns
    -------
    list[str] of length ``max(3, max_splits)``, padded with "" on the right.
    With the default ``max_splits`` this is always exactly three elements.

    Logic
    -----
    - Split only where SPLIT_PATTERN matches, i.e. '; or' / standalone 'or'
      introducing a new full rule ('A change', 'No required change',
      'Provided', 'A regional value content') or ending a line.
    - Do NOT split when the separator is followed by a list marker such as
      '(a)', '(b)', '(c)'.
    """
    # Always return at least three slots so callers that unpack three values keep working.
    width = max(3, max_splits)
    if not isinstance(text, str) or not text.strip():
        return [""] * width
    t = tidy(text)

    # sequentially find split points
    parts = []
    last_end = 0
    for m in SPLIT_PATTERN.finditer(t):
        # Enforce the cap BEFORE splitting so max_splits=1 means "do not split"
        # and the remainder is kept whole in the tail.
        if len(parts) >= max_splits - 1:
            break
        # Look ahead context: if '; or (a)' etc., skip splitting
        after = t[m.end():m.end()+5].strip()
        if re.match(r"^\([a-z]\)", after, flags=re.I):  # skip list items
            continue
        head = tidy(t[last_end:m.start()])
        if head:
            parts.append(head)
        last_end = m.end()
    tail = tidy(t[last_end:])
    if tail:
        parts.append(tail)
    # Pad to the fixed width; len(parts) never exceeds max_splits, so nothing is truncated.
    parts.extend([""] * (width - len(parts)))
    return parts


In [42]:
# ==============================================================
# 2️⃣  Apply rule splitting to the existing CSV
# ==============================================================

IN_CSV  = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo.csv"   # original scraped file
OUT_CSV = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo（1）.csv" # intermediate output

MAIN_COL = "Product Specific Rule of Origin"

# --- Load dataset and make sure the rule column is there ---
df = pd.read_csv(IN_CSV)
if MAIN_COL not in df.columns:
    raise ValueError(f"Column not found: {MAIN_COL}\nAvailable columns: {list(df.columns)}")

# --- Split every rule into (main, alt, alt2) ---
# Missing values are treated as empty strings before splitting.
rules = df[MAIN_COL].fillna("").astype(str).tolist()
split_main, split_alt, split_alt2 = [], [], []
for txt in rules:
    r1, r2, r3 = split_rule_text(txt)
    for bucket, piece in ((split_main, r1), (split_alt, r2), (split_alt2, r3)):
        bucket.append(piece)

# --- Attach the three parts as new columns ---
df["main_rule"] = split_main
df["alt_rule"] = split_alt
df["alt_rule_2"] = split_alt2

# 1 when either alternative column holds text, else 0.
has_first_alt = df["alt_rule"].str.len() > 0
has_second_alt = df["alt_rule_2"].str.len() > 0
df["has_alt_rule"] = (has_first_alt | has_second_alt).astype(int)

# --- Persist and report ---
df.to_csv(OUT_CSV, index=False)
print(f"Saved: {OUT_CSV}  (rows={len(df)})")

# Distribution of rows with and without alternatives.
print(df["has_alt_rule"].value_counts(dropna=False))


Saved: /Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo（1）.csv  (rows=1182)
has_alt_rule
0    871
1    311
Name: count, dtype: int64


In [43]:
# ==============================================================
# 3️⃣  Final cleanup and column renaming
# ==============================================================

OUT_CLEAN = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo（1）.csv"

# Helper columns that should not appear in the released file.
cols_to_drop = ["Product Specific Rule of Origin", "needs_manual_check", "has_alt_rule"]
# Drop whichever of them are present; absent names are ignored.
df = df.drop(columns=cols_to_drop, errors="ignore")

# Identity mapping: the three rule columns keep their current names.
df = df.rename(columns={name: name for name in ("main_rule", "alt_rule", "alt_rule_2")})

# Write the cleaned table (same path as the intermediate output, so it is overwritten).
df.to_csv(OUT_CLEAN, index=False)

print(f"\nCleaned file saved to: {OUT_CLEAN}  ({len(df)} rows)")
print("Final columns:", list(df.columns))



Cleaned file saved to: /Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo（1）.csv  (1182 rows)
Final columns: ['Chapter/heading/ sub-heading', 'Description', 'main_rule', 'alt_rule', 'alt_rule_2']


In [44]:
# === Flag rows ONLY where alt_rule_2 has content ===

import pandas as pd

# Read the cleaned file back from disk.
file_path = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo（1）.csv"
df = pd.read_csv(file_path)

# flag = 1 when alt_rule_2 contains non-whitespace text, otherwise 0
# (missing values count as empty).
alt2_text = df["alt_rule_2"].fillna("").str.strip()
df["flag"] = alt2_text.ne("").astype(int)

# Report how many rows fall in each class.
print("Flag summary (1 = alt_rule_2 has content, 0 = empty):")
print(df["flag"].value_counts(dropna=False))

# Show up to ten flagged rows, if any exist.
if df["flag"].sum() <= 0:
    print("\nNo rows have non-empty alt_rule_2.")
else:
    print("\nSample rows with flag = 1 (non-empty alt_rule_2):")
    preview_columns = ["Chapter/heading/ sub-heading", "main_rule", "alt_rule", "alt_rule_2", "flag"]
    display(df.loc[df["flag"] == 1, preview_columns].head(10))

# Write the flagged table to its own file.
out_path = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo_flagged.csv"
df.to_csv(out_path, index=False)
print(f"\nFlagged file saved to: {out_path}")


Flag summary (1 = alt_rule_2 has content, 0 = empty):
flag
0    1179
1       3
Name: count, dtype: int64

Sample rows with flag = 1 (non-empty alt_rule_2):


Unnamed: 0,Chapter/heading/ sub-heading,main_rule,alt_rule,alt_rule_2,flag
930,85.23.52,A change to any other “smart card” of subheadi...,A change to any other “smart card” of subheadi...,No required change in tariff classification to...,1
1014,87.08.50,"A change to a drive-axle with differential, wh...",A change to any other non-driving axle and par...,No required change in tariff classification to...,1
1148,95.03,A change to a doll representing only a human b...,A change to a doll representing only a human b...,A change to any other good of heading 95.03 fr...,1



Flagged file saved to: /Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo_flagged.csv


In [45]:
# ==== Remove redundant columns and rename ====

# Helper columns that must not appear in the final output.
cols_to_drop = ["Product Specific Rule of Origin", "needs_manual_check", "has_alt_rule"]

# Remove whichever of them are present; names that are absent are ignored.
df = df.drop(columns=cols_to_drop, errors="ignore")

# If a column named 'rule' exists, rename it to 'main_rule'; otherwise this is a no-op.
df = df.rename(columns=dict(rule="main_rule"))

# ==== Save cleaned output ====
OUT_CLEAN = "/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo_flagged.csv"
df.to_csv(OUT_CLEAN, index=False)

print(f" Cleaned file saved to:\n{OUT_CLEAN} ({len(df)} rows)")
print("Columns now:", list(df.columns))


 Cleaned file saved to:
/Users/tianqixu/Downloads/PTA-task/Canada-Chile/roo_flagged.csv (1182 rows)
Columns now: ['Chapter/heading/ sub-heading', 'Description', 'main_rule', 'alt_rule', 'alt_rule_2', 'flag']
