<a href="https://colab.research.google.com/github/SushmithaKasimsettyRamesh/LLM-Privacy-Shield-Privacy-Preserving-NLP-Pipeline-using-GPT-spaCy-Hugging-Face/blob/main/Day3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
DAY 3: Hugging Face Models + Smart Detection
Advanced PII detection using transformer models with comparison to spaCy
"""

import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import warnings
warnings.filterwarnings("ignore")

@dataclass
class PIIEntity:
    """One detected PII span plus where it sits in the text and who found it."""
    text: str          # surface text of the detected span
    label: str         # entity category, e.g. 'PERSON' or 'EMAIL'
    start: int         # character offset where the span begins
    end: int           # character offset where the span ends (exclusive slice bound)
    confidence: float  # detector score; the spaCy and regex detectors use fixed values
    source: str        # producing detector: 'spacy', 'huggingface', or 'regex'

class PIIDetector:
    """Detect PII with spaCy NER, a Hugging Face NER model and regex patterns.

    Every detector returns ``PIIEntity`` records whose ``start``/``end`` are
    character offsets into the analysed text.
    """

    # Hugging Face checkpoint used for transformer-based NER.
    HF_MODEL_NAME = "dslim/bert-base-NER"

    # spaCy entity labels that are reported as PII.
    SPACY_PII_LABELS = ('PERSON', 'ORG', 'GPE', 'LOC')

    # Maps Hugging Face entity groups onto the labels used by the spaCy path.
    HF_LABEL_MAPPING = {
        'PER': 'PERSON',
        'ORG': 'ORG',
        'LOC': 'GPE',
        'MISC': 'MISC',
    }

    # Default patterns for structured PII; each instance gets its own copy.
    DEFAULT_REGEX_PATTERNS = {
        # TLD class is [A-Za-z]; a '|' inside a character class is a literal pipe.
        'EMAIL': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # A lookbehind replaces the leading \b so a number that opens with
        # '(' or '+' is captured in full instead of losing that character.
        'PHONE': r'(?<!\w)(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b',
        'SSN': r'\b\d{3}-\d{2}-\d{4}\b',
        'CREDIT_CARD': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        'IP_ADDRESS': r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
    }

    def __init__(self, use_gpu: bool = False):
        """Load the spaCy and Hugging Face models and set up the regex patterns.

        A model that fails to load is reported on stdout and disabled; the
        matching ``detect_pii_*`` method then returns an empty list.

        Args:
            use_gpu: Run the Hugging Face pipeline on device 0 when True,
                otherwise on device -1.
        """
        print("🔧 Loading models...")

        # Define every attribute up front so a failed load never leaves the
        # instance without one of them.
        self.nlp = None
        self.hf_tokenizer = None
        self.hf_model = None
        self.hf_pipeline = None

        # Load spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded")
        except OSError:
            print("❌ spaCy model not found. Install with: python -m spacy download en_core_web_sm")

        # Load Hugging Face NER model
        try:
            self.hf_tokenizer = AutoTokenizer.from_pretrained(self.HF_MODEL_NAME)
            self.hf_model = AutoModelForTokenClassification.from_pretrained(self.HF_MODEL_NAME)
            self.hf_pipeline = pipeline(
                "ner",
                model=self.hf_model,
                tokenizer=self.hf_tokenizer,
                aggregation_strategy="simple",
                device=0 if use_gpu else -1
            )
            print("✅ Hugging Face BERT-NER model loaded")
        except Exception as e:
            print(f"❌ Error loading Hugging Face model: {e}")
            self.hf_pipeline = None

        # Regex patterns for additional PII
        self.regex_patterns = dict(self.DEFAULT_REGEX_PATTERNS)

    def detect_pii_spacy(self, text: str) -> List["PIIEntity"]:
        """Detect PERSON/ORG/GPE/LOC entities with spaCy NER.

        Returns an empty list when the spaCy model is not loaded.
        """
        if self.nlp is None:
            return []

        return [
            PIIEntity(
                text=ent.text,
                label=ent.label_,
                start=ent.start_char,
                end=ent.end_char,
                confidence=0.85,  # spaCy doesn't provide confidence scores
                source='spacy'
            )
            for ent in self.nlp(text).ents
            if ent.label_ in self.SPACY_PII_LABELS
        ]

    def detect_pii_hf(self, text: str) -> List["PIIEntity"]:
        """Detect entities with the Hugging Face NER pipeline.

        Returns an empty list when the pipeline is not loaded or when
        inference fails (the error is printed, not raised).
        """
        if self.hf_pipeline is None:
            return []

        try:
            entities = []

            for pred in self.hf_pipeline(text):
                start, end = pred.get('start'), pred.get('end')
                if start is None or end is None:
                    # Without character offsets the span cannot be located
                    # (or replaced) in the text, so it is skipped.
                    continue

                # Strip a leading B-/I- tag, then map to our standard labels
                hf_label = pred['entity_group'] if 'entity_group' in pred else pred['entity']
                if hf_label[:2] in ('B-', 'I-'):
                    hf_label = hf_label[2:]

                entities.append(PIIEntity(
                    # Slice the source text: the pipeline's 'word' is rebuilt
                    # from tokens and may not match the original characters.
                    text=text[start:end],
                    label=self.HF_LABEL_MAPPING.get(hf_label, hf_label),
                    start=start,
                    end=end,
                    confidence=float(pred['score']),
                    source='huggingface'
                ))

            return entities

        except Exception as e:
            print(f"Error in HF detection: {e}")
            return []

    def detect_pii_regex(self, text: str) -> List["PIIEntity"]:
        """Detect structured PII by running every pattern in ``self.regex_patterns``."""
        return [
            PIIEntity(
                text=match.group(),
                label=label,
                start=match.start(),
                end=match.end(),
                confidence=0.95,
                source='regex'
            )
            for label, pattern in self.regex_patterns.items()
            for match in re.finditer(pattern, text)
        ]


In [None]:
def detect_pii_combined(self, text: str, use_spacy: bool = True, use_hf: bool = True, use_regex: bool = True) -> List[PIIEntity]:
        """
        Run the enabled detectors, drop overlapping hits, and return the
        surviving entities ordered by start offset.
        """
        # NOTE(review): this def sits at column 0 while the methods after it are
        # indented like class members; it looks like a PIIDetector method pasted
        # into its own notebook cell -- confirm and re-attach it to the class.
        found = []

        # Gather hits from each detector that is switched on
        found += self.detect_pii_spacy(text) if use_spacy else []
        found += self.detect_pii_hf(text) if use_hf else []
        found += self.detect_pii_regex(text) if use_regex else []

        # Resolve overlaps, then order the survivors by position in the text
        merged = self._deduplicate_entities(found)
        merged.sort(key=lambda ent: ent.start)
        return merged

    def _deduplicate_entities(self, entities: List[PIIEntity]) -> List[PIIEntity]:
        """Remove overlapping entities, keeping the one with highest confidence.

        Candidates are considered from most to least confident (ties go to the
        longer span, then to the earlier one); a candidate is kept only when it
        overlaps nothing kept so far. The input list is left untouched and the
        result is ordered by start offset.
        """
        if not entities:
            return []

        # Rank by confidence first: ordering by start would let an earlier,
        # weaker span shadow a later, stronger one.
        ranked = sorted(entities, key=lambda x: (-x.confidence, x.start - x.end, x.start))

        kept = []
        for candidate in ranked:
            if not any(self._entities_overlap(candidate, accepted) for accepted in kept):
                kept.append(candidate)

        kept.sort(key=lambda x: x.start)
        return kept

    def _entities_overlap(self, entity1: PIIEntity, entity2: PIIEntity) -> bool:
        """Tell whether two spans share at least one character position.

        Spans that merely touch (one ends exactly where the other starts) are
        not considered overlapping.
        """
        first_starts_before_second_ends = entity1.start < entity2.end
        second_starts_before_first_ends = entity2.start < entity1.end
        return first_starts_before_second_ends and second_starts_before_first_ends

    def compare_models(self, text: str) -> Dict:
        """Run every detector once and report the results side by side.

        Returns a dict with the input under 'text', the entity lists under
        'spacy', 'huggingface', 'regex' and 'combined', and the matching
        counts under 'stats'.
        """
        spacy_entities = self.detect_pii_spacy(text)
        hf_entities = self.detect_pii_hf(text)
        regex_entities = self.detect_pii_regex(text)

        # Merge the lists computed above rather than calling
        # detect_pii_combined, which would run all three detectors again.
        combined_entities = self._deduplicate_entities(
            spacy_entities + hf_entities + regex_entities
        )
        combined_entities.sort(key=lambda x: x.start)

        return {
            'text': text,
            'spacy': spacy_entities,
            'huggingface': hf_entities,
            'regex': regex_entities,
            'combined': combined_entities,
            'stats': {
                'spacy_count': len(spacy_entities),
                'hf_count': len(hf_entities),
                'regex_count': len(regex_entities),
                'combined_count': len(combined_entities)
            }
        }

    def create_token_mapping(self, entities: List["PIIEntity"]) -> Dict[str, str]:
        """Map each distinct entity text to a placeholder like ``{{PERSON_1}}``.

        Tokens are numbered per label in order of first appearance. A text that
        occurs more than once keeps the token from its first occurrence, so
        repeated mentions share one token and no numbers are skipped.
        """
        token_map = {}
        entity_counts = {}

        for entity in entities:
            if entity.text in token_map:
                # Already tokenised: reuse it instead of minting a new number
                continue

            label = entity.label
            entity_counts[label] = entity_counts.get(label, 0) + 1
            token_map[entity.text] = "{{" + f"{label}_{entity_counts[label]}" + "}}"

        return token_map

    def anonymize_text(self, text: str, method: str = 'combined') -> Tuple[str, Dict[str, str]]:
        """
        Anonymize text by replacing PII with tokens.

        Args:
            text: The text to anonymize.
            method: 'spacy', 'huggingface' or 'regex' to use a single
                detector; any other value uses the combined detection.

        Returns: (anonymized_text, token_mapping)
        """
        if method == 'spacy':
            entities = self.detect_pii_spacy(text)
        elif method == 'huggingface':
            entities = self.detect_pii_hf(text)
        elif method == 'regex':
            entities = self.detect_pii_regex(text)
        else:  # combined
            entities = self.detect_pii_combined(text)

        # A single detector can report overlapping spans (e.g. two regex
        # patterns hitting the same digits); replacing overlapping slices
        # would garble the output, so overlaps are resolved first.
        entities = self._deduplicate_entities(entities)

        # Create token mapping
        token_map = self.create_token_mapping(entities)

        # Replace text (from end to start to maintain indices)
        anonymized = text
        for entity in sorted(entities, key=lambda x: x.start, reverse=True):
            token = token_map[entity.text]
            anonymized = anonymized[:entity.start] + token + anonymized[entity.end:]

        return anonymized, token_map


def test_tricky_inputs():
    """Run the detector over a fixed set of awkward sentences and print a report."""
    detector = PIIDetector()

    samples = [
        "Hi, I'm John Smith from john.smith@gmail.com, working at Google Inc.",
        "Call me at 555-123-4567 or email sarah.jones@company.org",
        "Dr. Jane Doe from New York University contacted Microsoft about the project",
        "My SSN is 123-45-6789 and my credit card is 4532-1234-5678-9012",
        "John called, then John texted from his iPhone. John Smith is persistent!",
        "I work at Apple Inc. in San Francisco, CA. My boss Tim Cook is great.",
        "Contact info: jane@doe.com, phone: (555) 123-4567, address: 123 Main St, Boston, MA"
    ]

    # (display name, key in the 'stats' dict) for the per-detector summary lines
    summary_rows = (
        ("spaCy", 'spacy_count'),
        ("HuggingFace", 'hf_count'),
        ("Regex", 'regex_count'),
        ("Combined", 'combined_count'),
    )
    divider = "=" * 50

    print("🧪 TESTING TRICKY INPUTS\n" + divider)

    for case_number, sample in enumerate(samples, 1):
        print(f"\n📝 Test Case {case_number}:")
        print(f"Input: {sample}")
        print("-" * 40)

        report = detector.compare_models(sample)
        stats = report['stats']

        for display_name, stat_key in summary_rows:
            print(f"{display_name} found: {stats[stat_key]} entities")

        print("\n🎯 Combined Detection Results:")
        for hit in report['combined']:
            print(f"  • {hit.text} → {hit.label} (conf: {hit.confidence:.2f}, source: {hit.source})")

        # Anonymized rendering of the same sentence, plus the token lookup
        redacted, mapping = detector.anonymize_text(sample)
        print(f"\n🔒 Anonymized: {redacted}")
        print(f"📋 Token Map: {mapping}")
        print("\n" + divider)


def interactive_demo():
    """Prompt for text in a loop and print detection and anonymization results.

    Typing 'quit' ends the loop and 'compare' runs ``test_tricky_inputs``.
    A closed stdin (EOF) or Ctrl-C at the prompt also ends the loop cleanly
    instead of surfacing a traceback.
    """
    detector = PIIDetector()

    print("🚀 LLM Privacy Shield - Interactive Demo")
    print("Type 'quit' to exit, 'compare' to see model comparison")
    print("-" * 50)

    while True:
        try:
            user_input = input("\n📝 Enter text to analyze: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming (piped/closed stdin or an interrupted
            # prompt): treat it like 'quit'.
            print()
            break

        command = user_input.lower()

        if command == 'quit':
            break

        if command == 'compare':
            test_tricky_inputs()
            continue

        if not user_input:
            continue

        # Show results
        results = detector.compare_models(user_input)
        stats = results['stats']

        print(f"\n📊 Detection Results:")
        print(f"spaCy: {stats['spacy_count']} entities")
        print(f"HuggingFace: {stats['hf_count']} entities")
        print(f"Regex: {stats['regex_count']} entities")
        print(f"Combined: {stats['combined_count']} entities")

        print(f"\n🔍 Detailed Results:")
        for entity in results['combined']:
            print(f"  • '{entity.text}' → {entity.label} (conf: {entity.confidence:.2f}, {entity.source})")

        # Show anonymized version
        anonymized, token_map = detector.anonymize_text(user_input)
        print(f"\n🔒 Anonymized: {anonymized}")

        # Show reverse mapping
        if token_map:
            print(f"📋 Token Mapping:")
            for original, token in token_map.items():
                print(f"  {token} → {original}")


if __name__ == "__main__":
    # Banner line followed by a 60-character rule
    print("🛡️  LLM Privacy Shield - Day 3: Hugging Face Integration", "=" * 60, sep="\n")

    # Scripted test cases run first; the interactive prompt follows
    test_tricky_inputs()
    interactive_demo()

In [None]:
"""
DAY 3: Hugging Face Models + Smart Detection
Advanced PII detection using transformer models with comparison to spaCy
"""

import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import warnings
warnings.filterwarnings("ignore")

@dataclass
class PIIEntity:
    """One detected PII span plus where it sits in the text and who found it."""
    text: str          # surface text of the detected span
    label: str         # entity category, e.g. 'PERSON' or 'EMAIL'
    start: int         # character offset where the span begins
    end: int           # character offset where the span ends (exclusive slice bound)
    confidence: float  # detector score; the spaCy and regex detectors use fixed values
    source: str        # producing detector: 'spacy', 'huggingface', or 'regex'

class PIIDetector:
    """Detect and anonymize PII with spaCy NER, a Hugging Face NER model and regexes.

    Every detector returns ``PIIEntity`` records whose ``start``/``end`` are
    character offsets into the analysed text, so hits from different detectors
    can be merged and replaced in place.
    """

    # Hugging Face checkpoint used for transformer-based NER.
    HF_MODEL_NAME = "dslim/bert-base-NER"

    # spaCy entity labels that are reported as PII.
    SPACY_PII_LABELS = ('PERSON', 'ORG', 'GPE', 'LOC')

    # Maps Hugging Face entity groups onto the labels used by the spaCy path.
    HF_LABEL_MAPPING = {
        'PER': 'PERSON',
        'ORG': 'ORG',
        'LOC': 'GPE',
        'MISC': 'MISC',
    }

    # Default patterns for structured PII; each instance gets its own copy.
    DEFAULT_REGEX_PATTERNS = {
        # TLD class is [A-Za-z]; a '|' inside a character class is a literal pipe.
        'EMAIL': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # A lookbehind replaces the leading \b so a number that opens with
        # '(' or '+' is captured in full instead of losing that character.
        'PHONE': r'(?<!\w)(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b',
        'SSN': r'\b\d{3}-\d{2}-\d{4}\b',
        'CREDIT_CARD': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        'IP_ADDRESS': r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
    }

    def __init__(self, use_gpu: bool = False):
        """Load the spaCy and Hugging Face models and set up the regex patterns.

        A model that fails to load is reported on stdout and disabled; the
        matching ``detect_pii_*`` method then returns an empty list.

        Args:
            use_gpu: Run the Hugging Face pipeline on device 0 when True,
                otherwise on device -1.
        """
        print("🔧 Loading models...")

        # Define every attribute up front so a failed load never leaves the
        # instance without one of them.
        self.nlp = None
        self.hf_tokenizer = None
        self.hf_model = None
        self.hf_pipeline = None

        # Load spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded")
        except OSError:
            print("❌ spaCy model not found. Install with: python -m spacy download en_core_web_sm")

        # Load Hugging Face NER model
        try:
            self.hf_tokenizer = AutoTokenizer.from_pretrained(self.HF_MODEL_NAME)
            self.hf_model = AutoModelForTokenClassification.from_pretrained(self.HF_MODEL_NAME)
            self.hf_pipeline = pipeline(
                "ner",
                model=self.hf_model,
                tokenizer=self.hf_tokenizer,
                aggregation_strategy="simple",
                device=0 if use_gpu else -1
            )
            print("✅ Hugging Face BERT-NER model loaded")
        except Exception as e:
            print(f"❌ Error loading Hugging Face model: {e}")
            self.hf_pipeline = None

        # Regex patterns for additional PII
        self.regex_patterns = dict(self.DEFAULT_REGEX_PATTERNS)

    def detect_pii_spacy(self, text: str) -> List["PIIEntity"]:
        """Detect PERSON/ORG/GPE/LOC entities with spaCy NER.

        Returns an empty list when the spaCy model is not loaded.
        """
        if self.nlp is None:
            return []

        return [
            PIIEntity(
                text=ent.text,
                label=ent.label_,
                start=ent.start_char,
                end=ent.end_char,
                confidence=0.85,  # spaCy doesn't provide confidence scores
                source='spacy'
            )
            for ent in self.nlp(text).ents
            if ent.label_ in self.SPACY_PII_LABELS
        ]

    def detect_pii_hf(self, text: str) -> List["PIIEntity"]:
        """Detect entities with the Hugging Face NER pipeline.

        Returns an empty list when the pipeline is not loaded or when
        inference fails (the error is printed, not raised).
        """
        if self.hf_pipeline is None:
            return []

        try:
            entities = []

            for pred in self.hf_pipeline(text):
                start, end = pred.get('start'), pred.get('end')
                if start is None or end is None:
                    # Without character offsets the span cannot be located
                    # (or replaced) in the text, so it is skipped.
                    continue

                # Strip a leading B-/I- tag, then map to our standard labels
                hf_label = pred['entity_group'] if 'entity_group' in pred else pred['entity']
                if hf_label[:2] in ('B-', 'I-'):
                    hf_label = hf_label[2:]

                entities.append(PIIEntity(
                    # Slice the source text: the pipeline's 'word' is rebuilt
                    # from tokens and may not match the original characters.
                    text=text[start:end],
                    label=self.HF_LABEL_MAPPING.get(hf_label, hf_label),
                    start=start,
                    end=end,
                    confidence=float(pred['score']),
                    source='huggingface'
                ))

            return entities

        except Exception as e:
            print(f"Error in HF detection: {e}")
            return []

    def detect_pii_regex(self, text: str) -> List["PIIEntity"]:
        """Detect structured PII by running every pattern in ``self.regex_patterns``."""
        return [
            PIIEntity(
                text=match.group(),
                label=label,
                start=match.start(),
                end=match.end(),
                confidence=0.95,
                source='regex'
            )
            for label, pattern in self.regex_patterns.items()
            for match in re.finditer(pattern, text)
        ]

    def detect_pii_combined(self, text: str, use_spacy: bool = True, use_hf: bool = True, use_regex: bool = True) -> List["PIIEntity"]:
        """
        Combine all enabled detection methods and deduplicate the results.

        Returns the surviving entities ordered by start offset.
        """
        all_entities = []

        # Collect entities from all sources
        if use_spacy:
            all_entities.extend(self.detect_pii_spacy(text))

        if use_hf:
            all_entities.extend(self.detect_pii_hf(text))

        if use_regex:
            all_entities.extend(self.detect_pii_regex(text))

        # Deduplicate overlapping entities (keep highest confidence)
        final_entities = self._deduplicate_entities(all_entities)

        # Sort by start position
        final_entities.sort(key=lambda x: x.start)

        return final_entities

    def _deduplicate_entities(self, entities: List["PIIEntity"]) -> List["PIIEntity"]:
        """Remove overlapping entities, keeping the one with highest confidence.

        Candidates are considered from most to least confident (ties go to the
        longer span, then to the earlier one); a candidate is kept only when it
        overlaps nothing kept so far. The input list is left untouched and the
        result is ordered by start offset.
        """
        if not entities:
            return []

        # Rank by confidence first: ordering by start would let an earlier,
        # weaker span shadow a later, stronger one.
        ranked = sorted(entities, key=lambda x: (-x.confidence, x.start - x.end, x.start))

        kept = []
        for candidate in ranked:
            if not any(self._entities_overlap(candidate, accepted) for accepted in kept):
                kept.append(candidate)

        kept.sort(key=lambda x: x.start)
        return kept

    def _entities_overlap(self, entity1: "PIIEntity", entity2: "PIIEntity") -> bool:
        """Check if two entities overlap (spans that only touch do not)."""
        return entity1.start < entity2.end and entity2.start < entity1.end

    def compare_models(self, text: str) -> Dict:
        """Run every detector once and report the results side by side.

        Returns a dict with the input under 'text', the entity lists under
        'spacy', 'huggingface', 'regex' and 'combined', and the matching
        counts under 'stats'.
        """
        spacy_entities = self.detect_pii_spacy(text)
        hf_entities = self.detect_pii_hf(text)
        regex_entities = self.detect_pii_regex(text)

        # Merge the lists computed above rather than calling
        # detect_pii_combined, which would run all three detectors again.
        combined_entities = self._deduplicate_entities(
            spacy_entities + hf_entities + regex_entities
        )
        combined_entities.sort(key=lambda x: x.start)

        return {
            'text': text,
            'spacy': spacy_entities,
            'huggingface': hf_entities,
            'regex': regex_entities,
            'combined': combined_entities,
            'stats': {
                'spacy_count': len(spacy_entities),
                'hf_count': len(hf_entities),
                'regex_count': len(regex_entities),
                'combined_count': len(combined_entities)
            }
        }

    def create_token_mapping(self, entities: List["PIIEntity"]) -> Dict[str, str]:
        """Map each distinct entity text to a placeholder like ``{{PERSON_1}}``.

        Tokens are numbered per label in order of first appearance. A text that
        occurs more than once keeps the token from its first occurrence, so
        repeated mentions share one token and no numbers are skipped.
        """
        token_map = {}
        entity_counts = {}

        for entity in entities:
            if entity.text in token_map:
                # Already tokenised: reuse it instead of minting a new number
                continue

            label = entity.label
            entity_counts[label] = entity_counts.get(label, 0) + 1
            token_map[entity.text] = "{{" + f"{label}_{entity_counts[label]}" + "}}"

        return token_map

    def anonymize_text(self, text: str, method: str = 'combined') -> Tuple[str, Dict[str, str]]:
        """
        Anonymize text by replacing PII with tokens.

        Args:
            text: The text to anonymize.
            method: 'spacy', 'huggingface' or 'regex' to use a single
                detector; any other value uses the combined detection.

        Returns: (anonymized_text, token_mapping)
        """
        if method == 'spacy':
            entities = self.detect_pii_spacy(text)
        elif method == 'huggingface':
            entities = self.detect_pii_hf(text)
        elif method == 'regex':
            entities = self.detect_pii_regex(text)
        else:  # combined
            entities = self.detect_pii_combined(text)

        # A single detector can report overlapping spans (e.g. two regex
        # patterns hitting the same digits); replacing overlapping slices
        # would garble the output, so overlaps are resolved first.
        entities = self._deduplicate_entities(entities)

        # Create token mapping
        token_map = self.create_token_mapping(entities)

        # Replace text (from end to start to maintain indices)
        anonymized = text
        for entity in sorted(entities, key=lambda x: x.start, reverse=True):
            token = token_map[entity.text]
            anonymized = anonymized[:entity.start] + token + anonymized[entity.end:]

        return anonymized, token_map


def test_tricky_inputs():
    """Run the detector over a fixed set of awkward sentences and print a report."""
    detector = PIIDetector()

    samples = [
        "Hi, I'm John Smith from john.smith@gmail.com, working at Google Inc.",
        "Call me at 555-123-4567 or email sarah.jones@company.org",
        "Dr. Jane Doe from New York University contacted Microsoft about the project",
        "My SSN is 123-45-6789 and my credit card is 4532-1234-5678-9012",
        "John called, then John texted from his iPhone. John Smith is persistent!",
        "I work at Apple Inc. in San Francisco, CA. My boss Tim Cook is great.",
        "Contact info: jane@doe.com, phone: (555) 123-4567, address: 123 Main St, Boston, MA"
    ]

    # (display name, key in the 'stats' dict) for the per-detector summary lines
    summary_rows = (
        ("spaCy", 'spacy_count'),
        ("HuggingFace", 'hf_count'),
        ("Regex", 'regex_count'),
        ("Combined", 'combined_count'),
    )
    divider = "=" * 50

    print("🧪 TESTING TRICKY INPUTS\n" + divider)

    for case_number, sample in enumerate(samples, 1):
        print(f"\n📝 Test Case {case_number}:")
        print(f"Input: {sample}")
        print("-" * 40)

        report = detector.compare_models(sample)
        stats = report['stats']

        for display_name, stat_key in summary_rows:
            print(f"{display_name} found: {stats[stat_key]} entities")

        print("\n🎯 Combined Detection Results:")
        for hit in report['combined']:
            print(f"  • {hit.text} → {hit.label} (conf: {hit.confidence:.2f}, source: {hit.source})")

        # Anonymized rendering of the same sentence, plus the token lookup
        redacted, mapping = detector.anonymize_text(sample)
        print(f"\n🔒 Anonymized: {redacted}")
        print(f"📋 Token Map: {mapping}")
        print("\n" + divider)


def interactive_demo():
    """Prompt for text in a loop and print detection and anonymization results.

    Typing 'quit' ends the loop and 'compare' runs ``test_tricky_inputs``.
    A closed stdin (EOF) or Ctrl-C at the prompt also ends the loop cleanly
    instead of surfacing a traceback.
    """
    detector = PIIDetector()

    print("🚀 LLM Privacy Shield - Interactive Demo")
    print("Type 'quit' to exit, 'compare' to see model comparison")
    print("-" * 50)

    while True:
        try:
            user_input = input("\n📝 Enter text to analyze: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming (piped/closed stdin or an interrupted
            # prompt): treat it like 'quit'.
            print()
            break

        command = user_input.lower()

        if command == 'quit':
            break

        if command == 'compare':
            test_tricky_inputs()
            continue

        if not user_input:
            continue

        # Show results
        results = detector.compare_models(user_input)
        stats = results['stats']

        print(f"\n📊 Detection Results:")
        print(f"spaCy: {stats['spacy_count']} entities")
        print(f"HuggingFace: {stats['hf_count']} entities")
        print(f"Regex: {stats['regex_count']} entities")
        print(f"Combined: {stats['combined_count']} entities")

        print(f"\n🔍 Detailed Results:")
        for entity in results['combined']:
            print(f"  • '{entity.text}' → {entity.label} (conf: {entity.confidence:.2f}, {entity.source})")

        # Show anonymized version
        anonymized, token_map = detector.anonymize_text(user_input)
        print(f"\n🔒 Anonymized: {anonymized}")

        # Show reverse mapping
        if token_map:
            print(f"📋 Token Mapping:")
            for original, token in token_map.items():
                print(f"  {token} → {original}")


if __name__ == "__main__":
    # Banner line followed by a 60-character rule
    print("🛡️  LLM Privacy Shield - Day 3: Hugging Face Integration", "=" * 60, sep="\n")

    # Scripted test cases run first; the interactive prompt follows
    test_tricky_inputs()
    interactive_demo()

🛡️  LLM Privacy Shield - Day 3: Hugging Face Integration
🔧 Loading models...
✅ spaCy model loaded


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


✅ Hugging Face BERT-NER model loaded
🧪 TESTING TRICKY INPUTS

📝 Test Case 1:
Input: Hi, I'm John Smith from john.smith@gmail.com, working at Google Inc.
----------------------------------------
spaCy found: 2 entities
HuggingFace found: 2 entities
Regex found: 1 entities
Combined found: 3 entities

🎯 Combined Detection Results:
  • John Smith → PERSON (conf: 1.00, source: huggingface)
  • john.smith@gmail.com → EMAIL (conf: 0.95, source: regex)
  • Google Inc → ORG (conf: 1.00, source: huggingface)

🔒 Anonymized: Hi, I'm {{PERSON_1}} from {{EMAIL_1}}, working at {{ORG_1}}.
📋 Token Map: {'John Smith': '{{PERSON_1}}', 'john.smith@gmail.com': '{{EMAIL_1}}', 'Google Inc': '{{ORG_1}}'}


📝 Test Case 2:
Input: Call me at 555-123-4567 or email sarah.jones@company.org
----------------------------------------
spaCy found: 0 entities
HuggingFace found: 0 entities
Regex found: 2 entities
Combined found: 2 entities

🎯 Combined Detection Results:
  • 555-123-4567 → PHONE (conf: 0.95, source: regex)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


✅ Hugging Face BERT-NER model loaded
🚀 LLM Privacy Shield - Interactive Demo
Type 'quit' to exit, 'compare' to see model comparison
--------------------------------------------------

📊 Detection Results:
spaCy: 0 entities
HuggingFace: 0 entities
Regex: 0 entities
Combined: 0 entities

🔍 Detailed Results:

🔒 Anonymized: hi im karthik

📊 Detection Results:
spaCy: 0 entities
HuggingFace: 0 entities
Regex: 0 entities
Combined: 0 entities

🔍 Detailed Results:

🔒 Anonymized: my name is eva i love wotking, my id is 556
