In [4]:
import spacy
from docx import Document

def create_nlp_pipeline():
    """Build the spaCy pipeline used for anonymization.

    Loads the ``en_core_web_sm`` model and inserts an ``entity_ruler``
    component *before* the statistical ``ner`` component, then registers
    token-level regex patterns for identifiers that the stock model does not
    label (phone numbers, card numbers, e-mail addresses, ...).

    Each dict inside a ``"pattern"`` list describes one token; the regex is
    applied to that token's text.  Entries with ``"OP": "?"`` are optional.

    Returns:
        The loaded ``nlp`` object with the extra patterns registered.
    """
    nlp = spacy.load("en_core_web_sm")
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    def tok(regex, optional=False):
        # One token whose text must satisfy `regex`; optionally skippable.
        spec = {"TEXT": {"REGEX": regex}}
        if optional:
            spec["OP"] = "?"
        return spec

    def sep():
        # Optional separator token made only of '-', '.' or whitespace.
        return tok(r"^[-.\s]+$", optional=True)

    # Example: minimal patterns. Add or remove as needed.
    patterns = [
        {
            "label": "PHONE_NUMBER",
            "pattern": [
                tok(r"^\+?\d{1,3}$", optional=True),  # optional country code
                sep(),
                tok(r"^\d{3,4}$"),
                sep(),
                tok(r"^\d{3}$"),
                sep(),
                tok(r"^\d{3,4}$"),
            ],
        },
        {
            "label": "POST_ADDRESS_HU",
            "pattern": [
                tok(r"^H-\d{4}$"),  # e.g. "H-1051"
                tok(r"^[A-Za-zÁÉÍÓÖŐÚÜŰáéíóöőúüű]+$"),  # one-word settlement name
            ],
        },
        {
            "label": "SALARY_HU",
            "pattern": [
                # Amount with comma thousands separators ("300,000") or plain
                # digits ("300000").  The previous regex only accepted the
                # comma-grouped form although "300000" was a documented example.
                tok(r"^(\d{1,3}(,\d{3})*|\d+)$"),
                tok(r"^(Ft|HUF)$"),
            ],
        },
        {
            "label": "DATE",
            "pattern": [tok(r"^\d{1,2}/\d{1,2}/\d{4}$")],
        },
        {
            "label": "ACCOUNT_NUMBER",
            # 6-12 digits, except exactly 11: that length is reserved for
            # ID_NUMBER below.  When both patterns matched the same single
            # token, the two equal-length matches competed and the resulting
            # label was arbitrary.
            "pattern": [tok(r"^(\d{6,10}|\d{12})$")],
        },
        {
            "label": "CREDIT_CARD",
            "pattern": [
                tok(r"^\d{4}$"),
                sep(),
                tok(r"^\d{4}$"),
                sep(),
                tok(r"^\d{4}$"),
                sep(),
                tok(r"^\d{4}$"),
            ],
        },
        {
            "label": "EMAIL",
            # Deliberately unanchored: any token containing an address matches.
            "pattern": [tok(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")],
        },
        {
            "label": "ID_NUMBER",
            "pattern": [tok(r"^\d{11}$")],
        },
        {
            "label": "COORDINATES",
            # "(" number "," number ")"
            "pattern": [
                tok(r"^\($"),
                tok(r"^-?\d+(\.\d+)?$"),
                tok(r"^,$"),
                tok(r"^-?\d+(\.\d+)?$"),
                tok(r"^\)$"),
            ],
        },
        {
            "label": "ADDRESS",
            # house number, one-word street name, street type, optional
            # comma, one-word trailing name
            "pattern": [
                tok(r"^\d+$"),
                tok(r"^[A-Za-z]+$"),
                tok(r"^(Street|St|Road|Rd|Ave|Avenue|Blvd|Boulevard|Lane|Ln)$"),
                tok(r"^,$", optional=True),
                tok(r"^[A-Za-z]+$"),
            ],
        },
    ]

    ruler.add_patterns(patterns)
    return nlp

def anonymize_text(text, nlp, allowed_labels):
    """Return ``text`` with selected entities replaced by ``[LABEL]`` tags.

    Args:
        text: The string to anonymize.
        nlp: Callable that turns ``text`` into an object exposing ``.ents``;
            each entity must provide ``label_``, ``start_char`` and
            ``end_char``.
        allowed_labels: Container of label names to replace.  Entities with
            any other label are left untouched.

    Returns:
        The rebuilt string.  Assumes the entities arrive in document order and
        do not overlap, since the text is stitched together left to right.
    """
    pieces = []
    cursor = 0  # offset in `text` just past the last replaced entity

    for entity in nlp(text).ents:
        label = entity.label_
        if label not in allowed_labels:
            continue
        # Keep the untouched text before this entity, then emit the placeholder.
        pieces += [text[cursor:entity.start_char], "[" + label + "]"]
        cursor = entity.end_char

    # Whatever follows the last replaced entity is copied verbatim.
    pieces.append(text[cursor:])
    return "".join(pieces)

def _iter_container_paragraphs(container, seen_cells):
    """Yield every paragraph of ``container``, descending into nested tables."""
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                # A merged cell is reported once per grid position it spans.
                # Track the underlying XML element so its text is processed
                # only once (re-running NER on already inserted placeholders
                # could mangle them).
                if cell._tc in seen_cells:
                    continue
                seen_cells.add(cell._tc)
                yield from _iter_container_paragraphs(cell, seen_cells)


def anonymize_docx(input_path, output_path, nlp, allowed_labels):
    """Anonymize a .docx file and save the result to ``output_path``.

    Processes body paragraphs, paragraphs inside tables (including nested
    tables), and the primary header and footer of each section.  Previously
    only top-level body paragraphs were visited, so sensitive text in tables,
    headers and footers was written to the output unchanged.

    Args:
        input_path: Path of the document to read.
        output_path: Path the anonymized copy is written to.
        nlp: Pipeline passed through to ``anonymize_text``.
        allowed_labels: Entity labels to replace, passed through to
            ``anonymize_text``.
    """
    doc = Document(input_path)

    containers = [doc]
    for section in doc.sections:
        for part in (section.header, section.footer):
            # A linked header/footer has no content of its own: it either
            # reuses the previous section's (already collected) or does not
            # exist.  Skipping it also avoids python-docx creating an empty
            # definition just because we looked at it.
            if not part.is_linked_to_previous:
                containers.append(part)

    seen_cells = set()
    for container in containers:
        for paragraph in _iter_container_paragraphs(container, seen_cells):
            original_text = paragraph.text
            if not original_text:
                continue
            anonymized = anonymize_text(original_text, nlp, allowed_labels)
            # Assigning paragraph.text collapses the paragraph into a single
            # run and drops run-level formatting, so only do it when something
            # was actually replaced.
            if anonymized != original_text:
                paragraph.text = anonymized

    # Save the updated docx
    doc.save(output_path)

def main(
    input_file="C:/Users/Mike/Downloads/FROGS.docx",
    output_file="C:/Users/Mike/Desktop/AN_DOCX/FROGS_AN.docx",
):
    """Anonymize one .docx file and report where the result was written.

    Args:
        input_file: Path of the document to anonymize.  Defaults to the
            location originally hard-coded in this notebook.
        output_file: Path for the anonymized copy.  Its parent directory is
            created if it does not exist yet.
    """
    from pathlib import Path

    nlp = create_nlp_pipeline()

    # Labels that should be replaced by a placeholder.  The custom
    # POST_ADDRESS_HU and SALARY_HU labels must be listed as well: the entity
    # ruler runs before the statistical NER, so spans it claims keep the
    # ruler's label, and a label missing here would leave that text in clear.
    allowed_labels = {
        "PERSON", "GPE", "ORG", "DATE", "MONEY", "CARDINAL",
        "NORP", "LOC", "EMAIL", "PHONE_NUMBER", "CREDIT_CARD",
        "ACCOUNT_NUMBER", "ID_NUMBER", "COORDINATES", "ADDRESS",
        "POST_ADDRESS_HU", "SALARY_HU",
    }

    # Saving fails if the target directory is missing, so create it up front.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Anonymize the DOCX
    anonymize_docx(input_file, output_file, nlp, allowed_labels)
    print(f"Anonymized docx saved as: {output_file}")

# Entry point: run the anonymization only when this code is executed directly
# (e.g. as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Anonymized docx saved as: C:/Users/Mike/Desktop/AN_DOCX/FROGS_AN.docx
