## Extract Text from Different File Types

In [7]:
import re
import pandas as pd
import pdfplumber
import docx
import os

def extract_text(file_path):
    """Return the text of ``file_path`` using the reader for its extension.

    The extension is taken with ``os.path.splitext`` and compared in lower
    case, so ``.PDF`` and ``.pdf`` are treated alike. Supported extensions
    are ``.pdf``, ``.csv``, ``.txt`` and ``.docx``.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)

    # Each entry is a thunk so the reader is only looked up and called for
    # the extension that was actually requested.
    readers = {
        ".pdf": lambda: extract_pdf_text(file_path),
        ".csv": lambda: extract_csv_text(file_path),
        ".txt": lambda: extract_txt_text(file_path),
        ".docx": lambda: extract_docx_text(file_path),
    }

    reader = readers.get(suffix.lower())
    if reader is None:
        raise ValueError("Unsupported file format")
    return reader()

def extract_pdf_text(file_path):
    """Extract the text of every page of a PDF, with a newline after each page.

    Args:
        file_path: Path of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts in page order, each followed by a newline. A page
        for which ``extract_text()`` yields nothing contributes only the
        newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # Guard against extract_text() returning None (reported for pages with
        # no extractable text in some pdfplumber releases -- confirm against
        # the installed version); None + "\n" would raise TypeError.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)

def extract_csv_text(file_path):
    """Load a CSV with pandas and return it rendered as a plain-text table.

    The table comes from ``DataFrame.to_string(index=False)``, so the header
    row is included and the row index is omitted.
    """
    table = pd.read_csv(file_path)
    rendered = table.to_string(index=False)
    return rendered

def extract_txt_text(file_path):
    """Read ``file_path`` as UTF-8 text and return its entire contents."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def extract_docx_text(file_path):
    """Return the text of each paragraph in a .docx file, joined by newlines.

    Only the entries of ``Document.paragraphs`` are read; nothing else in the
    document is consulted by this function.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

# Example usage: "example.txt" has a ".txt" extension, so extract_text()
# routes it to extract_txt_text(). The path is relative, so the file must
# exist in the notebook's working directory.
text = extract_text("example.txt")
print(text)


----------------------------------------
           INVOICE
----------------------------------------

Invoice Number: INV-20240301  
Invoice Date: 2024-03-01  
Due Date: 2024-03-15  

Bill To:  
John Doe  
ABC Corporation  
123 Business St, Suite 456  
New York, NY, 10001  
Email: johndoe@abccorp.com  

Ship To:  
Jane Smith  
XYZ Enterprises  
789 Commerce Ave  
San Francisco, CA, 94105  

----------------------------------------
Description          Qty  Unit Price  Total  
----------------------------------------
Product A            2    $50.00      $100.00  
Product B            5    $30.00      $150.00  
Service Fee          1    $75.00      $75.00  
----------------------------------------
Subtotal:                          $325.00  
Tax (10%):                         $32.50  
Total:                             $357.50  

Payment Method: Bank Transfer  
Bank: XYZ Bank  
Account No: 123456789  
SWIFT Code: XYZB1234  

Notes:  
Thank you for your business! Please make the payment 

In [9]:
import re
import spacy
import os

# Load the spaCy "en_core_web_sm" pipeline once at module level for
# Named Entity Recognition (NER); detect_pii_spacy() calls `nlp` on its input.
nlp = spacy.load("en_core_web_sm")

# Read a UTF-8 text file and return its full contents.
# NOTE: same name and behavior as the extract_txt_text defined in the earlier
# text-extraction cell; this definition replaces that one when the cell runs.
def extract_txt_text(file_path):
    """Return everything in ``file_path`` decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        file_text = source.read()
    return file_text

# Function to detect sensitive information using Regex
def detect_sensitive_info(text):
    patterns = {
        "email": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "phone": r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b",
        "invoice_number": r"INV-\d{6}",
        "bank_account": r"\b\d{9,18}\b",
        "address": r"\d{1,5}\s\w+(\s\w+)*,\s\w+,\s\w+,\s\d{5}"
    }
    
    detected_info = {}
    
    for key, pattern in patterns.items():
        matches = re.findall(pattern, text)
        if matches:
            detected_info[key] = matches
    
    return detected_info

# Function to detect PII using SpaCy (NER model)
def detect_pii_spacy(text):
    """Group the entities found by the module-level ``nlp`` pipeline by label.

    Only entities labelled PERSON, EMAIL, GPE, ORG, DATE or MONEY are kept.

    Returns:
        dict: Maps each kept label to the list of entity texts in the order
        the pipeline reports them (duplicates are kept). Labels with no
        entity are absent.
    """
    # NOTE(review): "EMAIL" may never be produced by en_core_web_sm -- confirm
    # against the model's documented label set.
    wanted_labels = ["PERSON", "EMAIL", "GPE", "ORG", "DATE", "MONEY"]

    grouped = {}
    for entity in nlp(text).ents:
        label = entity.label_
        if label in wanted_labels:
            grouped.setdefault(label, []).append(entity.text)
    return grouped

# Function to anonymize detected sensitive information
def anonymize_text(text, sensitive_info):
    """Replace every detected value in ``text`` with a category placeholder.

    Args:
        text: The original string.
        sensitive_info: Mapping of category name to a list of literal
            substrings to redact.

    Returns:
        str: ``text`` with each occurrence of a listed value replaced by
        ``[<CATEGORY>_REDACTED]`` (category upper-cased). When a value is
        listed under several categories, the first category in iteration
        order is used.
    """
    placeholder_for = {}
    for category, values in sensitive_info.items():
        for value in values:
            # Skip empty strings: replacing "" would insert the placeholder
            # between every character of the text.
            if value and value not in placeholder_for:
                placeholder_for[value] = f"[{category.upper()}_REDACTED]"

    if not placeholder_for:
        return text

    # Longest values first, so "150.00" is redacted whole instead of "50.00"
    # matching inside it and leaking the leading "1".
    ordered_values = sorted(placeholder_for, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(value) for value in ordered_values))

    # One pass over the original text: placeholders already inserted are never
    # rescanned, so a later value cannot rewrite part of an earlier placeholder.
    return pattern.sub(lambda match: placeholder_for[match.group(0)], text)

# Input file; the path is relative, so it is resolved against the notebook's
# working directory.
file_path = "example.txt"

# Step 1: Extract text from the file (read as UTF-8 by extract_txt_text)
text = extract_txt_text(file_path)

# Step 2: Detect sensitive data with both detectors on the same raw text
regex_sensitive_info = detect_sensitive_info(text)
ner_sensitive_info = detect_pii_spacy(text)

# Combine regex and NER detections.
# This is a shallow dict merge: on a shared key the NER list would replace the
# regex list rather than extend it. The regex categories are lower-case and
# the NER labels upper-case, so the two key sets do not collide here.
combined_sensitive_info = {**regex_sensitive_info, **ner_sensitive_info}

# Step 3: Anonymize detected data (values are replaced by category placeholders)
anonymized_text = anonymize_text(text, combined_sensitive_info)

# Step 4: Save anonymized text to a new file.
# Mode "w" overwrites any existing file of the same name.
anonymized_file_path = "anonymized_example.txt"
with open(anonymized_file_path, "w", encoding="utf-8") as f:
    f.write(anonymized_text)

# Print detected sensitive data and anonymized text.
# NOTE: the first print writes the raw, un-redacted detected values into the
# cell output. In the second print, the separator print() inserts between its
# two arguments puts one space before the first line of the anonymized text.
print("Detected Sensitive Information:", combined_sensitive_info)
print("\nAnonymized Text:\n", anonymized_text)
print(f"\nAnonymized text saved to: {anonymized_file_path}")


Detected Sensitive Information: {'email': ['johndoe@abccorp.com'], 'invoice_number': ['INV-202403'], 'bank_account': ['123456789'], 'DATE': ['2024-03-01', '2024-03-15', '10001', '94105'], 'PERSON': ['Bill', 'Jane Smith'], 'GPE': ['New York', 'San Francisco'], 'ORG': ['NY', 'XYZ Enterprises  \n789', 'CA', 'Description          Qty  Unit Price', 'XYZ Bank  \nAccount'], 'MONEY': ['50.00', '100.00', '30.00', '150.00', '75.00', '75.00', '325.00', '32.50', '357.50']}

Anonymized Text:
 ----------------------------------------
           INVOICE
----------------------------------------

Invoice Number: [INVOICE_NUMBER_REDACTED]01  
Invoice Date: [DATE_REDACTED]  
Due Date: [DATE_REDACTED]  

[PERSON_REDACTED] To:  
John Doe  
ABC Corporation  
123 Business St, Suite 456  
[GPE_REDACTED], [ORG_REDACTED], [DATE_REDACTED]  
Email: [EMAIL_REDACTED]  

Ship To:  
[PERSON_REDACTED]  
[ORG_REDACTED] Commerce Ave  
[GPE_REDACTED], [ORG_REDACTED], [DATE_REDACTED]  

-----------------------------------