<a href="https://colab.research.google.com/github/SushmithaKasimsettyRamesh/LLM-Privacy-Shield-Privacy-Preserving-NLP-Pipeline-using-GPT-spaCy-Hugging-Face/blob/main/Day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# 🎯 Goal: Detect PII and replace with tokens, maintaining mapping for restoration

# ============================================================================
# SETUP & INSTALLATION
# ============================================================================
"""

In [None]:
!pip install spacy python-dotenv
!python -m spacy download en_core_web_sm

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import re
from typing import Dict, List, Tuple, Any
import json
from dataclasses import dataclass
from collections import defaultdict

In [None]:
# Load the en_core_web_sm spaCy pipeline once at module level; the spaCy-based
# detection in PIIAnonymizer reads this shared `nlp` object when it runs.
nlp = spacy.load("en_core_web_sm")

In [None]:
# ============================================================================
# CORE ANONYMIZER CLASS
# ============================================================================
@dataclass
class PIIMatch:
    """A single PII entity found in a text, together with its replacement token."""
    text: str    # matched substring exactly as it appears in the input
    label: str   # PII category, e.g. 'EMAIL' or 'NAME'
    start: int   # character offset where the match begins
    end: int     # character offset just past the end of the match
    token: str   # placeholder substituted for the match, e.g. "{{NAME_1}}"

class PIIAnonymizer:
    """
    Core anonymizer that detects PII and replaces it with placeholder tokens.

    Detection combines regular expressions (e-mail addresses, phone numbers,
    SSNs, credit-card numbers) with spaCy named-entity recognition.  A
    bidirectional mapping between real values and tokens is kept on the
    instance, so a value seen before is given the same token again and
    ``deanonymize`` can restore the original text.
    """

    def __init__(self, nlp_model: Any = None):
        """
        Create an anonymizer with empty mappings.

        Args:
            nlp_model: Optional callable that takes a string and returns an
                object whose ``ents`` expose ``text``, ``label_``,
                ``start_char`` and ``end_char`` (for example a loaded spaCy
                pipeline).  When omitted, the module-level ``nlp`` object is
                looked up at detection time.
        """
        # Token counters for consistent mapping
        self.token_counters = defaultdict(int)

        # Bidirectional mapping
        self.real_to_token = {}  # "John Smith" -> "{{NAME_1}}"
        self.token_to_real = {}  # "{{NAME_1}}" -> "John Smith"

        # Regex patterns for PII detection.
        # NOTE: the EMAIL top-level-domain class is [A-Za-z]; the previous
        # [A-Z|a-z] also accepted a literal '|' character.
        self.patterns = {
            'EMAIL': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'PHONE': r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            'SSN': r'\b\d{3}-?\d{2}-?\d{4}\b',
            'CREDIT_CARD': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
        }

        # spaCy entity labels we care about, mapped to our token labels
        self.spacy_labels = {
            'PERSON': 'NAME',
            'ORG': 'ORGANIZATION',
            'GPE': 'LOCATION',
            'LOC': 'LOCATION',
            'DATE': 'DATE',
            'MONEY': 'MONEY'
        }

        # Optional injected NER callable; None means "use the global nlp"
        self._nlp = nlp_model

    def _get_next_token(self, label: str) -> str:
        """Generate the next ``{{LABEL_N}}`` token for a given label type."""
        self.token_counters[label] += 1
        return f"{{{{{label}_{self.token_counters[label]}}}}}"

    def _token_for(self, pii_text: str, label: str) -> str:
        """Return the token for ``pii_text``, registering a new one if unseen."""
        token = self.real_to_token.get(pii_text)
        if token is None:
            token = self._get_next_token(label)
            self.real_to_token[pii_text] = token
            self.token_to_real[token] = pii_text
        return token

    def _find_regex_spans(self, text: str) -> List[Tuple[int, int, str, str]]:
        """Return ``(start, end, matched_text, label)`` for every regex hit.

        Spans are listed pattern by pattern in the order of ``self.patterns``.
        No token is registered here.
        """
        return [
            (match.start(), match.end(), match.group(), label)
            for label, pattern in self.patterns.items()
            for match in re.finditer(pattern, text)
        ]

    def _find_spacy_spans(self, text: str) -> List[Tuple[int, int, str, str]]:
        """Return ``(start, end, entity_text, label)`` for relevant entities.

        Only entities whose label is a key of ``self.spacy_labels`` are
        returned.  No token is registered here.
        """
        # Resolve the global lazily so the class can be built before `nlp` exists
        model = self._nlp if self._nlp is not None else nlp
        return [
            (ent.start_char, ent.end_char, ent.text, self.spacy_labels[ent.label_])
            for ent in model(text).ents
            if ent.label_ in self.spacy_labels
        ]

    def _to_matches(self, spans: List[Tuple[int, int, str, str]]) -> "List[PIIMatch]":
        """Turn raw spans into PIIMatch objects, registering tokens as needed."""
        return [
            PIIMatch(
                text=pii_text,
                label=label,
                start=start,
                end=end,
                token=self._token_for(pii_text, label)
            )
            for start, end, pii_text, label in spans
        ]

    def _detect_regex_pii(self, text: str) -> "List[PIIMatch]":
        """Detect PII using regex patterns (registers a token for every hit)."""
        return self._to_matches(self._find_regex_spans(text))

    def _detect_spacy_pii(self, text: str) -> "List[PIIMatch]":
        """Detect PII using spaCy NER (registers a token for every hit)."""
        return self._to_matches(self._find_spacy_spans(text))

    def anonymize(self, text: str) -> Tuple[str, Dict[str, str]]:
        """
        Main anonymization function

        Overlapping detections are resolved before any token is created:
        the longest span wins and ties go to the span detected first (regex
        hits come before spaCy entities).  Discarded detections therefore
        never consume a token number or appear in the mapping.

        Args:
            text: Input text to anonymize

        Returns:
            tuple: (anonymized_text, token_mapping) where token_mapping is a
            copy of every token -> real value pair known to this instance,
            including those created by earlier calls.
        """
        # Candidates in detection order: regex first, then spaCy
        candidates = self._find_regex_spans(text) + self._find_spacy_spans(text)

        # Visit longest spans first; the index breaks ties by detection order
        by_priority = sorted(
            range(len(candidates)),
            key=lambda i: (-(candidates[i][1] - candidates[i][0]), i)
        )
        kept = []
        for idx in by_priority:
            start, end = candidates[idx][0], candidates[idx][1]
            if all(end <= candidates[k][0] or start >= candidates[k][1]
                   for k in kept):
                kept.append(idx)

        # Register tokens in detection order so numbering stays stable for
        # inputs without overlaps
        token_by_index = {}
        for idx in sorted(kept):
            token_by_index[idx] = self._token_for(
                candidates[idx][2], candidates[idx][3]
            )

        # Rebuild the text left to right, swapping each kept span for its token
        pieces = []
        cursor = 0
        for idx in sorted(kept, key=lambda i: candidates[i][0]):
            start, end = candidates[idx][0], candidates[idx][1]
            pieces.append(text[cursor:start])
            pieces.append(token_by_index[idx])
            cursor = end
        pieces.append(text[cursor:])

        return "".join(pieces), dict(self.token_to_real)

    def deanonymize(self, text: str) -> str:
        """
        Restore original values from anonymized text

        All known tokens are substituted in a single pass, so a restored
        value is never scanned again for further tokens.

        Args:
            text: Anonymized text with tokens

        Returns:
            str: Text with original values restored
        """
        if not self.token_to_real:
            return text

        # Longest tokens first so the alternation never stops at a shorter one
        alternation = "|".join(
            re.escape(token)
            for token in sorted(self.token_to_real, key=len, reverse=True)
        )
        # A function replacement keeps backslashes in real values literal
        return re.sub(
            alternation,
            lambda found: self.token_to_real[found.group()],
            text
        )

    def get_mapping_summary(self) -> Dict[str, Any]:
        """Get summary of current mappings for debugging"""
        return {
            'token_counts': dict(self.token_counters),
            'total_mappings': len(self.token_to_real),
            'mappings': dict(self.token_to_real)
        }


In [None]:
# ============================================================================
# TESTING FRAMEWORK
# ============================================================================

def test_anonymizer():
    """Run the sample sentences through one shared anonymizer and print a report.

    For every sample the original, anonymized and restored text are printed,
    followed by whether the round trip reproduced the input and the token
    mapping returned by ``anonymize``.  A summary of all tokens created is
    printed at the end.

    Returns:
        The PIIAnonymizer instance used for the whole run.
    """
    shield = PIIAnonymizer()

    # Sample inputs, processed in order with the same anonymizer
    samples = (
        "Hi, I'm John Smith from Acme Corp. Email me at john.smith@acme.com or call 555-123-4567",
        "Sarah Johnson lives in New York and works at Google. Her SSN is 123-45-6789",
        "Contact Dr. Michael Brown at michael@hospital.org about the patient in Los Angeles",
    )

    print("🔒 LLM Privacy Shield - Day 1 Testing")
    print("=" * 50)

    case_no = 0
    for sample in samples:
        case_no += 1
        print(f"\n📝 TEST CASE {case_no}:")
        print(f"Original: {sample}")

        # Forward direction
        masked, token_map = shield.anonymize(sample)
        print(f"Anonymized: {masked}")

        # Reverse direction
        round_trip = shield.deanonymize(masked)
        print(f"Restored: {round_trip}")

        # The round trip should give back exactly the input
        print(f"✅ Restoration successful: {round_trip == sample}")

        print("\nToken Mapping:")
        for placeholder in token_map:
            print(f"  {placeholder} ← {token_map[placeholder]}")

        print("-" * 30)

    # Closing summary across all samples
    print("\n📊 SUMMARY:")
    report = shield.get_mapping_summary()
    print(f"Total tokens created: {report['total_mappings']}")
    print(f"Token types: {report['token_counts']}")

    return shield


In [None]:
# ============================================================================
# DEMO FUNCTION
# ============================================================================

def demo_privacy_shield():
    """Interactive loop: read a line, show it anonymized and then restored.

    Typing 'quit', 'exit' or 'q' (in any letter case) ends the loop; blank
    input is ignored.  One PIIAnonymizer is shared across all inputs.
    """
    print("🛡️  LLM Privacy Shield Demo")
    print("Enter text to anonymize (or 'quit' to exit):")

    shield = PIIAnonymizer()
    exit_words = ['quit', 'exit', 'q']

    while True:
        line = input("\n> ")
        if line.lower() in exit_words:
            return

        # Nothing to do for empty or whitespace-only input
        if not line.strip():
            continue

        masked, token_map = shield.anonymize(line)

        print(f"\n🔒 Anonymized: {masked}")
        print(f"📋 Tokens created: {len(token_map)}")

        if token_map:
            print("🗝️  Token mapping:")
            for placeholder in token_map:
                print(f"   {placeholder} ← {token_map[placeholder]}")

        # Show what would be sent to LLM
        print(f"\n📤 What LLM sees: '{masked}'")

        # Show restoration
        round_trip = shield.deanonymize(masked)
        print(f"🔓 After restoration: '{round_trip}'")

In [None]:

# ============================================================================
# RUN TESTS
# ============================================================================

if __name__ == "__main__":
    # Run automated tests.  The returned anonymizer is bound to its own name:
    # assigning it to `test_anonymizer` would replace the function with a
    # PIIAnonymizer instance, so running this cell a second time would fail
    # with "object is not callable".
    tested_anonymizer = test_anonymizer()

    print("\n" + "="*50)
    print("Ready for interactive demo!")
    print("Call demo_privacy_shield() to try it out")

    # Uncomment to run interactive demo
    # demo_privacy_shield()

🔒 LLM Privacy Shield - Day 1 Testing

📝 TEST CASE 1:
Original: Hi, I'm John Smith from Acme Corp. Email me at john.smith@acme.com or call 555-123-4567
Anonymized: Hi, I'm {{NAME_1}} from {{ORGANIZATION_1}} Email me at {{EMAIL_1}} or call {{PHONE_1}}
Restored: Hi, I'm John Smith from Acme Corp. Email me at john.smith@acme.com or call 555-123-4567
✅ Restoration successful: True

Token Mapping:
  {{EMAIL_1}} ← john.smith@acme.com
  {{PHONE_1}} ← 555-123-4567
  {{NAME_1}} ← John Smith
  {{ORGANIZATION_1}} ← Acme Corp.
------------------------------

📝 TEST CASE 2:
Original: Sarah Johnson lives in New York and works at Google. Her SSN is 123-45-6789
Anonymized: {{NAME_2}} lives in {{LOCATION_1}} and works at {{ORGANIZATION_2}}. Her {{ORGANIZATION_3}} is {{SSN_1}}
Restored: Sarah Johnson lives in New York and works at Google. Her SSN is 123-45-6789
✅ Restoration successful: True

Token Mapping:
  {{EMAIL_1}} ← john.smith@acme.com
  {{PHONE_1}} ← 555-123-4567
  {{NAME_1}} ← John Smith
  {{ORG