In [9]:
import re
import spacy

# Load spaCy's English NER model
nlp = spacy.load("en_core_web_sm")

def redact_personal_info(text):
    """
    Redacts phone numbers, email addresses, and named entities like names and locations.
    
    Args:
    text (str): The input text containing sensitive information.
    
    Returns:
    str: The redacted text.
    """

    # Patterns for bank account, credit card, TINs, and other financial identifiers
    patterns = {
        "bank_account": r'\b\d{10,16}\b',  # Detect 10-16 digit numbers (commonly bank account numbers)
        "credit_card": r'\b(?:\d[ -]*?){13,16}\b',  # Detect credit card numbers with or without spaces/hyphens
        "tin": r'\b\d{3}-\d{2}-\d{4}\b',  # TINs or SSN format (XXX-XX-XXXX)
        "salary_info": r'\b(\$|\₹)?\d{1,3}(,\d{3})*(\.\d{2})?\b',  # Detect salary/amount with currency symbols
        "ssn": r'\b\d{3}-\d{2}-\d{4}\b',  # SSN format (XXX-XX-XXXX)
        "passport_number": r'\b[A-Z0-9]{6,9}\b',  # Common passport number format (6-9 alphanumeric characters)
        "driving_license": r'\b[A-Z0-9]{8,12}\b',  # Generic driving license format
        "birthdate": r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',  # Dates like MM/DD/YYYY, DD-MM-YYYY
        "national_id": r'\b\d{9,12}\b'  # National ID numbers (9-12 digits)
    }

    # Redact phone numbers (e.g., formats like 123-456-7890, (123) 456-7890, 123 456 7890, etc.)
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-.●]?\(?(?:\d{1,4})\)?[-.●]?\d{1,4}[-.●]?\d{1,4}[-.●]?\d{1,9}\b'
    text = re.sub(phone_pattern, '[REDACTED]', text)

    # Redact email addresses
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    text = re.sub(email_pattern, '[REDACTED]', text)

    # Redact dates (e.g., "12/30/2021" or "December 30, 2021")
    date_pattern = r'\b(?:\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}|\b\w+\s\d{1,2},?\s\d{4})\b'
    text = re.sub(date_pattern, '[REDACTED]', text)

    # Redact financial amounts (e.g., "$10,000", "£5,000", "10,000 EUR")
    financial_pattern = r'\b(\$|£|€|₹)?\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(USD|EUR|GBP|INR)?\b'
    text = re.sub(financial_pattern, '[REDACTED]', text)

    # Redact IP-related numbers or codes (e.g., "Patent No. 12345", "Trademark ID 9876")
    ip_pattern = r'\b(Patent\s(No\.)?\s?\d+|Trademark\s(ID)?\s?\d+)\b'
    text = re.sub(ip_pattern, '[REDACTED]', text)

    for label, pattern in patterns.items():
        text = re.sub(pattern, '[REDACTED]', text)

    # Redact names, locations, and other sensitive information using NER with spaCy
    doc = nlp(text)
    for ent in doc.ents:
        # Only redact PERSON, GPE (Geopolitical entities like countries, cities), and ORG (Organizations) and LAW(Laws) and LOC(Locations)
        if ent.label_ in ["PERSON", "GPE", "ORG","LAW","LOC","MONEY","CARDINAL","DATE"]:
            text = text.replace(ent.text, '[REDACTED]')

    return text

# Sample text from user input containing sensitive information
user_input_text = """
John Doe lives in New York City. His phone number is (123) 456-7890, and his email is john.doe@example.com.
He works at Acme Corporation and frequently travels to London to ₹50,000 and 05/01/2023.
The bank account number for John Doe is 1234567890123456 and his salary is ₹1,20,000 annually.
His credit card number is 1234-5678-9101-1121 and his SSN is 123-45-6789. 
The mortgage loan amount is $500,000 with an interest rate of 4.5%. The tax identification number (TIN) is 987-65-4321.
His credit score is 750, and the loan amount was $50,000.
John Doe was born on 12/15/1985. His SSN is 123-45-6789 and his passport number is X1234567.
He holds a driving license number A123456789 and his national ID number is 987654321.
"""

# Redact sensitive information
redacted_text = redact_personal_info(user_input_text)

print("Original Text:")
print(user_input_text)
print("\nRedacted Text:")
print(redacted_text)


Original Text:

John Doe lives in New York City. His phone number is (123) 456-7890, and his email is john.doe@example.com.
He works at Acme Corporation and frequently travels to London to ₹50,000 and 05/01/2023.
The bank account number for John Doe is 1234567890123456 and his salary is ₹1,20,000 annually.
His credit card number is 1234-5678-9101-1121 and his SSN is 123-45-6789. 
The mortgage loan amount is $500,000 with an interest rate of 4.5%. The tax identification number (TIN) is 987-65-4321.
His credit score is 750, and the loan amount was $50,000.
John Doe was born on 12/15/1985. His SSN is 123-45-6789 and his passport number is X1234567.
He holds a driving license number A123456789 and his national ID number is 987654321.


Redacted Text:

[REDACTED] Doe lives in [REDACTED]. His phone number is ([[[REDACTED]]]) [[[REDACTED]]], and his email is [[[REDACTED]]].
He works at [REDACTED] and frequently travels to [REDACTED] to [REDACTED][[[REDACTED]]]and [[[REDACTED]]]/[[[REDACTED]]]