In [9]:
import spacy
from docx import Document

def read_docx(file_path):
    """Return the text of a .docx file's body paragraphs, one paragraph per line.

    Args:
        file_path: Path (or file-like object) handed straight to ``Document``.

    Returns:
        A single string with the ``text`` of each entry in
        ``document.paragraphs`` joined by newline characters.
    """
    return "\n".join(para.text for para in Document(file_path).paragraphs)

def create_nlp_pipeline(model_name="hu_core_news_lg"):
    """Load a spaCy pipeline and register rule-based patterns for sensitive data.

    An ``entity_ruler`` component is added before the model's ``ner``
    component and filled with token patterns for the custom labels ADDRESS,
    COORDINATES, CREDIT_CARD, PHONE_NUMBER, EMAIL, ACCOUNT_NUMBER and
    ID_NUMBER.

    Args:
        model_name: Name of the installed spaCy pipeline to load. Defaults to
            the Hungarian ``hu_core_news_lg`` model this cell was written for.

    Returns:
        The loaded ``nlp`` object with the populated ``entity_ruler`` attached.
    """
    nlp = spacy.load(model_name)
    # Insert EntityRuler before the built-in NER
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    #  ADDRESS Pattern
    #    Matches: <number> <streetName> <streetSuffix> [optional comma] <city>
    #    e.g. "101 Frog Street, Bogtown"
    address_pattern = [
        {
            "label": "ADDRESS",
            "pattern": [
                # Building number
                {"TEXT": {"REGEX": r"^\d+$"}},
                # One-word street name (letters only)
                {"TEXT": {"REGEX": r"^[A-Za-z]+$"}},
                # Street suffix (common examples)
                {"TEXT": {"REGEX": r"^(Street|St|Road|Rd|Ave|Avenue|Blvd|Boulevard|Lane|Ln)$"}},
                # Optional comma
                {"TEXT": {"REGEX": r"^,$"}, "OP": "?"},
                # One-word city name (letters only)
                {"TEXT": {"REGEX": r"^[A-Za-z]+$"}}
            ]
        }
    ]

    # Coordinates Pattern (COORDINATES)
    #    Example: (40.7128, -74.0060)
    coordinates_pattern = [
        {
            "label": "COORDINATES",
            "pattern": [
                {"TEXT": {"REGEX": r"^\($"}},
                {"TEXT": {"REGEX": r"^-?\d+(\.\d+)?$"}},
                {"TEXT": {"REGEX": r"^,$"}},
                {"TEXT": {"REGEX": r"^-?\d+(\.\d+)?$"}},
                {"TEXT": {"REGEX": r"^\)$"}}
            ]
        }
    ]

    #  Multi-token pattern for 16-digit credit card with optional dashes/spaces
    #    Example: "4111-1111-1111-1111" or "4111 1111 1111 1111"
    credit_card_pattern = [
        {
            "label": "CREDIT_CARD",
            "pattern": [
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}}
            ]
        }
    ]

    # Multi-token pattern for phone numbers:
    #    [country code] [sep] 3-4 digits [sep] 3 digits [sep] 3-4 digits
    phone_pattern = [
        {
            "label": "PHONE_NUMBER",
            "pattern": [
                {"TEXT": {"REGEX": r"^\+?\d{1,3}$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3,4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3,4}$"}}
            ]
        }
    ]

    # Other sensitive data patterns (all single-token)
    other_patterns = [
        {
            "label": "EMAIL",
            "pattern": [{"TEXT": {"REGEX": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"}}]
        },
        {
            # 6-12 digits, except exactly 11, which is reserved for ID_NUMBER
            # below. With a plain 6-12 digit range both single-token patterns
            # matched every 11-digit token and competed for the same span.
            "label": "ACCOUNT_NUMBER",
            "pattern": [{"TEXT": {"REGEX": r"^(?:\d{6,10}|\d{12})$"}}]
        },
        {
            "label": "ID_NUMBER",
            "pattern": [{"TEXT": {"REGEX": r"^\d{11}$"}}]
        }
    ]

    # The order of this list does not set priority. When matches overlap the
    # EntityRuler keeps the longest span, which is what lets the multi-token
    # ADDRESS / CREDIT_CARD / PHONE_NUMBER patterns win over the single-token
    # numeric ones.
    patterns = (
        address_pattern
        + coordinates_pattern
        + credit_card_pattern
        + phone_pattern
        + other_patterns
    )
    ruler.add_patterns(patterns)

    return nlp

def main(file_path="C:/Users/Mike/Downloads/FROGS.docx"):
    """Print every allowed entity detected in a DOCX file.

    Reads the document with ``read_docx``, runs it through the pipeline from
    ``create_nlp_pipeline`` and prints one line per entity whose label is in
    the allow-list below.

    Args:
        file_path: DOCX file to scan. Defaults to the path this cell was
            originally hard-wired to.
    """
    # 1. Read the text
    text = read_docx(file_path)

    # 2. Create the spaCy pipeline
    nlp = create_nlp_pipeline()

    # 3. Process the text
    doc = nlp(text)

    # 4. Define allowed labels (default + custom)
    allowed_labels = {
        "PERSON", "GPE", "ORG", "DATE", "MONEY",
        "CARDINAL", "NORP", "LOC", "EMAIL",
        "PHONE_NUMBER", "CREDIT_CARD", "ACCOUNT_NUMBER",
        "ID_NUMBER", "COORDINATES", "ADDRESS",
        # The names above are the OntoNotes labels of the English models.
        # CoNLL-style NER models label people "PER", so without this entry
        # person names are silently filtered out.
        # NOTE(review): hu_core_news_lg appears to be such a model (the
        # recorded run tags "USA" as LOC, not GPE) - confirm with
        # nlp.get_pipe("ner").labels.
        "PER",
    }

    # 5. Print detected entities
    print("=== Detected Entities ===")
    for ent in doc.ents:
        if ent.label_ in allowed_labels:
            print(f"Text: '{ent.text.strip()}', Label: {ent.label_}")

# Entry point: run the entity-detection demo when this cell/script is executed
# directly (the recorded output of that run follows the cell).
if __name__ == "__main__":
    main()




=== Detected Entities ===
Text: 'dr.jane@amphibians.edu', Label: EMAIL
Text: '123456789', Label: ACCOUNT_NUMBER
Text: '101 Frog Street, Bogtown', Label: ADDRESS
Text: 'USA', Label: LOC
Text: '9876543210', Label: ACCOUNT_NUMBER
Text: '(40.7128, -74.0060)', Label: COORDINATES
Text: 'michael.brown@frogs.org', Label: EMAIL


In [4]:
import spacy
from docx import Document

def create_nlp_pipeline(model_name="en_core_web_sm"):
    """Load a spaCy pipeline and register rule-based patterns for sensitive data.

    An ``entity_ruler`` component is added before the model's ``ner``
    component and filled with token patterns for PHONE_NUMBER,
    POST_ADDRESS_HU, SALARY_HU, DATE, ACCOUNT_NUMBER, CREDIT_CARD, EMAIL,
    ID_NUMBER, COORDINATES and ADDRESS.

    Args:
        model_name: Name of the installed spaCy pipeline to load. Defaults to
            ``en_core_web_sm``, the model this cell was written for.

    Returns:
        The loaded ``nlp`` object with the populated ``entity_ruler`` attached.
    """
    nlp = spacy.load(model_name)

    # Insert EntityRuler before the built-in NER
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # Example: minimal patterns. Add or remove as needed.
    patterns = [
        {
            # [country code] [sep] 3-4 digits [sep] 3 digits [sep] 3-4 digits
            "label": "PHONE_NUMBER",
            "pattern": [
                {"TEXT": {"REGEX": r"^\+?\d{1,3}$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3,4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{3,4}$"}}
            ]
        },

        {
            # "H-" + 4-digit postal code token followed by a one-word city name
            "label": "POST_ADDRESS_HU",
            "pattern": [
                {"TEXT": {"REGEX": r"^H-\d{4}$"}},
                {"TEXT": {"REGEX": r"^[A-Za-zÁÉÍÓÖŐÚÜŰáéíóöőúüű]+$"}}
            ]
        },

        {
            "label": "SALARY_HU",
            "pattern": [
                # e.g., "300,000" or "300000". The second alternative is what
                # accepts the ungrouped form; the comma-grouped alternative
                # alone rejects any plain number longer than three digits.
                {"TEXT": {"REGEX": r"^(?:\d{1,3}(?:,\d{3})*|\d+)$"}},
                {"TEXT": {"REGEX": r"^(Ft|HUF)$"}}
            ]
        },

        {
            # Single-token numeric dates such as 1/2/2024
            "label": "DATE",
            "pattern": [
                {"TEXT": {"REGEX": r"^\d{1,2}/\d{1,2}/\d{4}$"}}
            ]
        },

        {
            # 6-12 digits, except exactly 11, which is reserved for ID_NUMBER
            # below. With a plain 6-12 digit range both single-token patterns
            # matched every 11-digit token and competed for the same span.
            "label": "ACCOUNT_NUMBER",
            "pattern": [{"TEXT": {"REGEX": r"^(?:\d{6,10}|\d{12})$"}}]
        },

        {
            # Four groups of four digits with optional separator tokens
            "label": "CREDIT_CARD",
            "pattern": [
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}},
                {"TEXT": {"REGEX": r"^[-.\s]+$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^\d{4}$"}}
            ]
        },
        {
            "label": "EMAIL",
            "pattern": [{"TEXT": {"REGEX": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"}}]
        },
        {
            "label": "ID_NUMBER",
            "pattern": [{"TEXT": {"REGEX": r"^\d{11}$"}}]
        },
        {
            # "(" number "," number ")", e.g. (40.7128, -74.0060)
            "label": "COORDINATES",
            "pattern": [
                {"TEXT": {"REGEX": r"^\($"}},
                {"TEXT": {"REGEX": r"^-?\d+(\.\d+)?$"}},
                {"TEXT": {"REGEX": r"^,$"}},
                {"TEXT": {"REGEX": r"^-?\d+(\.\d+)?$"}},
                {"TEXT": {"REGEX": r"^\)$"}}
            ]
        },
        {
            # <number> <streetName> <streetSuffix> [optional comma] <city>
            "label": "ADDRESS",
            "pattern": [
                {"TEXT": {"REGEX": r"^\d+$"}},
                {"TEXT": {"REGEX": r"^[A-Za-z]+$"}},
                {"TEXT": {"REGEX": r"^(Street|St|Road|Rd|Ave|Avenue|Blvd|Boulevard|Lane|Ln)$"}},
                {"TEXT": {"REGEX": r"^,$"}, "OP": "?"},
                {"TEXT": {"REGEX": r"^[A-Za-z]+$"}}
            ]
        }
    ]

    ruler.add_patterns(patterns)
    return nlp

def anonymize_text(text, nlp, allowed_labels):
    """Replace each allowed entity in ``text`` with a ``[LABEL]`` placeholder.

    Args:
        text: The string to anonymize.
        nlp: Callable that returns an object exposing ``ents``; every entity
            must provide ``label_``, ``start_char`` and ``end_char``.
        allowed_labels: Container of labels that should be replaced. Entities
            with any other label are left as they are.

    Returns:
        ``text`` with every allowed entity span swapped for its bracketed label.
    """
    pieces = []
    cursor = 0  # index in ``text`` just past the previously replaced entity

    for entity in nlp(text).ents:
        if entity.label_ not in allowed_labels:
            continue
        # Keep the untouched text in front of the entity, then the placeholder.
        pieces.append(text[cursor:entity.start_char])
        pieces.append(f"[{entity.label_}]")
        cursor = entity.end_char

    # Whatever follows the last replaced entity is copied verbatim.
    pieces.append(text[cursor:])
    return "".join(pieces)

def _iter_block_paragraphs(container):
    """Yield the paragraphs of ``container`` and of every table nested in it."""
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                # Cells expose ``paragraphs`` and ``tables`` like the document
                # body does, so nested tables are handled by recursion.
                yield from _iter_block_paragraphs(cell)


def anonymize_docx(input_path, output_path, nlp, allowed_labels):
    """Anonymize the body text and table text of a DOCX file and save a copy.

    Every paragraph in the document body and inside (possibly nested) tables
    is passed through ``anonymize_text``. A paragraph is only rewritten when
    anonymization actually changed its text, because assigning to
    ``paragraph.text`` replaces the paragraph's runs and with them any
    character formatting or inline content. Headers and footers are not
    visited.

    Args:
        input_path: Path of the DOCX file to read.
        output_path: Path the anonymized document is saved to.
        nlp: spaCy pipeline forwarded to ``anonymize_text``.
        allowed_labels: Entity labels to replace, forwarded to
            ``anonymize_text``.
    """
    doc = Document(input_path)

    # Texts this function has already written back. A merged table cell is
    # reported once per grid position it covers, so its paragraphs can come
    # around again; skipping known outputs keeps placeholders such as
    # "[PERSON]" from being fed through the model a second time.
    already_written = set()

    for paragraph in _iter_block_paragraphs(doc):
        original_text = paragraph.text
        if not original_text.strip() or original_text in already_written:
            continue

        anonymized = anonymize_text(original_text, nlp, allowed_labels)
        if anonymized != original_text:
            paragraph.text = anonymized
            already_written.add(anonymized)

    # Save the updated docx
    doc.save(output_path)

def main(
    input_file="C:/Users/Mike/Downloads/FROGS.docx",
    output_file="C:/Users/Mike/Desktop/AN_DOCX/FROGS_AN.docx",
):
    """Anonymize a DOCX file and report where the result was written.

    Builds the pipeline with ``create_nlp_pipeline``, makes sure the output
    directory exists and hands the work to ``anonymize_docx``.

    Args:
        input_file: DOCX file to anonymize. Defaults to the path this cell was
            originally hard-wired to.
        output_file: Destination of the anonymized copy. Defaults to the path
            this cell was originally hard-wired to.
    """
    import os  # local import so this cell does not rely on another cell's imports

    nlp = create_nlp_pipeline()

    # Labels that should be replaced by a placeholder.
    allowed_labels = {
        "PERSON", "GPE", "ORG", "DATE", "MONEY", "CARDINAL",
        "NORP", "LOC", "EMAIL", "PHONE_NUMBER", "CREDIT_CARD",
        "ACCOUNT_NUMBER", "ID_NUMBER", "COORDINATES", "ADDRESS",
        # Custom labels emitted by the entity_ruler patterns in
        # create_nlp_pipeline. While they were missing here, spans matched as
        # a Hungarian postal address or salary were left in clear text.
        "POST_ADDRESS_HU", "SALARY_HU",
    }

    # Saving fails if the target folder is missing, so create it up front
    # rather than after the whole document has been processed.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Anonymize the DOCX
    anonymize_docx(input_file, output_file, nlp, allowed_labels)
    print(f"Anonymized docx saved as: {output_file}")

# Entry point: run the DOCX anonymization when this cell/script is executed
# directly (the recorded output of that run follows the cell).
if __name__ == "__main__":
    main()


Anonymized docx saved as: C:/Users/Mike/Desktop/AN_DOCX/FROGS_AN.docx
