# Abrivation Extractor Logic for Main Pipeline
This script mines high-quality abbreviation → expansion mappings from a corpus of 3GPP DOCX files (Releases 15–19) and saves a cleaned master dictionary to JSON. The flow is:

Scan DOCX corpus (Rel-15 … Rel-19):
Recursively loads .docx files under /mnt/data/TSpec-LLM/3GPP-clean/, reading paragraph text and concatenating it per file. Errors are caught and logged so processing continues.

Regex-based extraction (per file):
Finds candidate definitions in two common patterns:

Long Form (ABBR) — e.g., Access Stratum (AS)

ABBR (Long Form) — e.g., AS (Access Stratum)
Matches are accumulated into a global map: ABBR → {set of candidate long forms}.

Smart validation & cleaning:
For each abbreviation:

Structural checks: reject overly long/dirty strings, one-word all-caps tokens, boilerplate starts (“this/the/a”), and generic/bad keywords (e.g., parameter, indicator).

Initials alignment: using a sliding window over the long form, ensure the initial letters align with the abbreviation (e.g., AS ↔ Access Stratum). Returns the best matching subphrase when present.

Selection of best expansion:
From each abbreviation’s candidate set, the first valid, initials-aligned expansion is chosen, normalized to Title Case, and stored as final_map[ABBR] = "Long Form".

Output:
The cleaned dictionary is saved to abbreviation_master_map.json, and a summary count of saved mappings is printed.

In [1]:
from pathlib import Path
from docx import Document
from collections import defaultdict
import json
import re
from itertools import islice

# Regex-based abbreviation extractor
def extract_abbreviation_defs(text):
    pattern1 = re.findall(r'\b([A-Z][a-zA-Z\s]{2,})\s+\((\b[A-Z]{2,6}\b)\)', text)
    pattern2 = re.findall(r'\b(\b[A-Z]{2,6}\b)\s+\(([A-Z][a-zA-Z\s]{2,})\)', text)

    abbrev_map = defaultdict(set)
    for long, short in pattern1:
        abbrev_map[short].add(long.strip())
    for short, long in pattern2:
        abbrev_map[short].add(long.strip())
    return abbrev_map

# Extract best matching subphrase for abbreviation
def extract_best_subphrase_for_abbr(abbr, expansion):
    abbr = abbr.upper()
    words = expansion.split()
    n = len(words)
    k = len(abbr)

    for i in range(n - k + 1):
        window = words[i:i + k]
        initials = [w[0].upper() for w in window]
        if initials == list(abbr):
            return ' '.join(window)
    return None

# Smart validation and cleaning
def is_valid_expansion(exp, abbr=None):
    if not exp:
        return False
    exp = exp.strip()

    if len(exp.split()) > 20 or any(c in exp for c in ['\n', '\t']):
        return False
    if exp.lower().startswith(("this", "the ", "a ")):
        return False
    if len(exp.split()) == 1 and (exp.isupper() or len(exp) <= 5):
        return False

    bad_keywords = {"request", "answer", "code", "function", "indicator", "parameter", "information"}
    if any(bad in exp.lower() for bad in bad_keywords):
        return False

    if abbr and len(abbr) >= 2:
        best_subphrase = extract_best_subphrase_for_abbr(abbr, exp)
        if not best_subphrase:
            return False
        return best_subphrase  # Return cleaned version

    return exp

# Main logic: scan first 100 DOCX files
global_abbrev_map = defaultdict(set)
docx_dir = Path("/mnt/data/TSpec-LLM/3GPP-clean/")

# Only process files under Rel-15 to Rel-19 directories
rel_folders = {"Rel-15", "Rel-16", "Rel-17", "Rel-18", "Rel-19"}

docx_files = [
    f for f in docx_dir.rglob("*.docx")
    if any(rel in f.parts for rel in rel_folders)
]

for file_path in docx_files:
    try:
        doc = Document(file_path)
        paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        full_text = "\n".join(paragraphs)
        local_map = extract_abbreviation_defs(full_text)
        for abbr, expansions in local_map.items():
            global_abbrev_map[abbr].update(expansions)
    except Exception as e:
        print(f" Failed to read {file_path.name}: {e}")

# Final filtering and saving
final_map = {}

for abbr, expansions in global_abbrev_map.items():
    best = None
    for e in expansions:
        clean = is_valid_expansion(e, abbr)
        if clean:
            best = clean.strip().title()
            break
    if best:
        final_map[abbr.strip()] = best

# Save cleaned abbreviation map
with open("abbreviation_master_map.json", "w") as f:
    json.dump(final_map, f, indent=2)

print(f" Saved {len(final_map)} clean abbreviation mappings to abbreviation_master_map.json")

❌ Failed to read ._38.533 4.5.1.9+6.5.1.9 TT.docx: File is not a zip file
❌ Failed to read 22839-i10.docx: "no relationship of type 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' in collection"
❌ Failed to read RS+LDPC.docx: Opening and ending tag mismatch: rPr line 2 and ins, line 2, column 115185 (<string>, line 2)
❌ Failed to read Results for information from ASR vendors.docx: Opening and ending tag mismatch: rPr line 2 and ins, line 2, column 243668 (<string>, line 2)
❌ Failed to read ._38.533 4.5.1.9+6.5.1.9 TT.docx: File is not a zip file
❌ Failed to read RS+LDPC.docx: Opening and ending tag mismatch: rPr line 2 and ins, line 2, column 115185 (<string>, line 2)
❌ Failed to read Results for information from ASR vendors.docx: Opening and ending tag mismatch: rPr line 2 and ins, line 2, column 243668 (<string>, line 2)
❌ Failed to read 23792-120_rm.docx: Opening and ending tag mismatch: rPr line 20 and ins, line 20, column 35073 (<string>, line 