<a href="https://colab.research.google.com/github/Pranav-K-B/githubemc1/blob/main/Phone%20Number%20Detection%20and%20Deletion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Run this cell first - it will take 2-3 minutes
!pip install spacy pandas numpy scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
# Job Description Phone Number Sanitization System
# Complete implementation with hybrid detection pipeline

import re
import spacy
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Optional
from dataclasses import dataclass
import warnings
import time
from functools import lru_cache
import threading
from concurrent.futures import ThreadPoolExecutor
import pickle
import os

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

@dataclass
class DetectionResult:
    """Data class to store detection results"""
    original_text: str
    cleaned_text: str
    detected_numbers: List[str]
    detection_method: List[str]
    warning_message: str

class PhoneNumberSanitizer:
    """
    Hybrid phone number detection and sanitization system
    Implements a three-pass detection pipeline for maximum accuracy
    """

    def __init__(self):
        self.nlp = None
        self.crf_model = None
        self._load_models()
        self.warning_message = "⚠️ Please do not include your phone number in the job description."

        # Comprehensive regex patterns for different phone number formats
        self.phone_patterns = [
            # International formats with country codes
            r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{4,10}',
            r'\+\d{1,3}\s?\(\d{1,4}\)\s?\d{1,4}[-.\s]?\d{4,10}',

            # US/Canada formats
            r'\(\d{3}\)\s?\d{3}[-.\s]?\d{4}',
            r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
            r'1[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',

            # Indian formats
            r'\+91[-.\s]?\d{10}',
            r'91[-.\s]?\d{10}',
            r'\d{10}',

            # European formats
            r'\+\d{2}[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{3}',

            # General patterns for obfuscated numbers
            r'\b\d{3}[-.\s]*\d{3}[-.\s]*\d{4}\b',
            r'\b\d{4}[-.\s]*\d{3}[-.\s]*\d{3}\b',
            r'\b\d{2}[-.\s]*\d{4}[-.\s]*\d{4}\b',

            # Patterns with parentheses and mixed separators
            r'\(\d{2,4}\)[-.\s]?\d{3,4}[-.\s]?\d{3,4}',
            r'\d{2,4}[-.\s]?\(\d{2,4}\)[-.\s]?\d{3,4}[-.\s]?\d{3,4}',
        ]

        # Compile regex patterns for efficiency
        self.compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.phone_patterns]

        # Word-to-number mapping for NER
        self.word_to_num = {
            'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
            'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9'
        }

    def _load_models(self):
        """Load NLP models with caching"""
        try:
            # Load spaCy model (download if not available)
            if not spacy.util.is_package("en_core_web_sm"):
                print("Downloading spaCy English model...")
                os.system("python -m spacy download en_core_web_sm")

            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded successfully")

        except Exception as e:
            print(f"⚠️ Could not load spaCy model: {e}")
            self.nlp = None

    def _first_pass_regex(self, text: str) -> Tuple[str, List[str], List[str]]:
        """
        First pass: Regex-based detection for structured phone numbers
        Returns: (cleaned_text, detected_numbers, detection_methods)
        """
        detected_numbers = []
        detection_methods = []
        cleaned_text = text

        for pattern in self.compiled_patterns:
            matches = pattern.findall(text)
            for match in matches:
                # Filter out false positives (salary, years, etc.)
                if self._is_likely_phone_number(match):
                    detected_numbers.append(match)
                    detection_methods.append("regex")
                    # Replace with asterisks to maintain text structure
                    cleaned_text = cleaned_text.replace(match, "*" * len(match))

        return cleaned_text, detected_numbers, detection_methods

    def _is_likely_phone_number(self, candidate: str) -> bool:
        """
        Heuristic to reduce false positives
        Returns True if the candidate is likely a phone number
        """
        # Remove all non-digit characters for analysis
        digits_only = re.sub(r'\D', '', candidate)

        # Check length (phone numbers typically 7-15 digits)
        if len(digits_only) < 7 or len(digits_only) > 15:
            return False

        # Check for common false positives
        false_positive_patterns = [
            r'^\d{4}$',  # Just a year
            r'^\d{1,2}k$',  # Salary abbreviation like "50k"
            r'^\d{1,3}%$',  # Percentage
            r'^\$\d+',  # Money amount
        ]

        for fp_pattern in false_positive_patterns:
            if re.match(fp_pattern, candidate, re.IGNORECASE):
                return False

        # Additional context-based checks
        if len(digits_only) == 4 and digits_only.startswith(('19', '20')):
            return False  # Likely a year

        return True

    def _second_pass_ner(self, text: str) -> Tuple[str, List[str], List[str]]:
        """
        Second pass: NER-based detection for text-based numbers
        """
        if not self.nlp:
            return text, [], []

        detected_numbers = []
        detection_methods = []
        cleaned_text = text

        try:
            doc = self.nlp(text)

            # Look for sequences of number words
            tokens = [token.text.lower() for token in doc]
            i = 0
            while i < len(tokens):
                if tokens[i] in self.word_to_num:
                    # Found start of potential phone number in words
                    number_sequence = []
                    j = i
                    while j < len(tokens) and tokens[j] in self.word_to_num:
                        number_sequence.append(self.word_to_num[tokens[j]])
                        j += 1

                    if len(number_sequence) >= 7:  # Minimum phone number length
                        word_sequence = ' '.join(tokens[i:j])
                        digit_sequence = ''.join(number_sequence)

                        detected_numbers.append(word_sequence)
                        detection_methods.append("ner")
                        cleaned_text = cleaned_text.replace(word_sequence, "*" * len(word_sequence))

                    i = j
                else:
                    i += 1

            # Also check for CARDINAL entities that might be phone numbers
            for ent in doc.ents:
                if ent.label_ == "CARDINAL" and len(re.sub(r'\D', '', ent.text)) >= 7:
                    if self._is_likely_phone_number(ent.text):
                        detected_numbers.append(ent.text)
                        detection_methods.append("ner")
                        cleaned_text = cleaned_text.replace(ent.text, "*" * len(ent.text))

        except Exception as e:
            print(f"NER processing error: {e}")

        return cleaned_text, detected_numbers, detection_methods

    def _third_pass_contextual(self, text: str) -> Tuple[str, List[str], List[str]]:
        """
        Third pass: Contextual analysis for ambiguous cases
        Simplified implementation without full CRF for Colab compatibility
        """
        detected_numbers = []
        detection_methods = []
        cleaned_text = text

        # Look for contextual clues
        phone_context_patterns = [
            r'(?:call|contact|reach|phone|mobile|cell)\s*(?:me|us)?\s*(?:at|on)?\s*[:\-]?\s*([0-9\-\(\)\s\.]+)',
            r'(?:number|no\.?|#)\s*[:\-]?\s*([0-9\-\(\)\s\.]+)',
            r'([0-9\-\(\)\s\.]{10,})\s*(?:only|calls?|texts?)',
        ]

        for pattern in phone_context_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                candidate = match.group(1).strip()
                if self._is_likely_phone_number(candidate):
                    detected_numbers.append(candidate)
                    detection_methods.append("contextual")
                    cleaned_text = cleaned_text.replace(candidate, "*" * len(candidate))

        return cleaned_text, detected_numbers, detection_methods

    def sanitize_job_description(self, job_description: str) -> DetectionResult:
        """
        Main method to sanitize job description
        Implements the three-pass hybrid detection pipeline
        """
        if not job_description or not job_description.strip():
            return DetectionResult(
                original_text=job_description,
                cleaned_text=job_description,
                detected_numbers=[],
                detection_method=[],
                warning_message=""
            )

        start_time = time.time()

        # Initialize tracking variables
        all_detected_numbers = []
        all_detection_methods = []
        current_text = job_description

        # First Pass: Regex Detection
        current_text, regex_numbers, regex_methods = self._first_pass_regex(current_text)
        all_detected_numbers.extend(regex_numbers)
        all_detection_methods.extend(regex_methods)

        # Second Pass: NER Detection
        current_text, ner_numbers, ner_methods = self._second_pass_ner(current_text)
        all_detected_numbers.extend(ner_numbers)
        all_detection_methods.extend(ner_methods)

        # Third Pass: Contextual Detection
        current_text, context_numbers, context_methods = self._third_pass_contextual(current_text)
        all_detected_numbers.extend(context_numbers)
        all_detection_methods.extend(context_methods)

        # Clean up the text (remove excessive asterisks, normalize whitespace)
        cleaned_text = self._post_process_text(current_text)

        processing_time = time.time() - start_time

        # Generate warning message if numbers were detected
        warning = self.warning_message if all_detected_numbers else ""

        print(f"🔍 Processing completed in {processing_time:.3f} seconds")
        print(f"📊 Detected {len(all_detected_numbers)} phone number(s)")

        return DetectionResult(
            original_text=job_description,
            cleaned_text=cleaned_text,
            detected_numbers=all_detected_numbers,
            detection_method=all_detection_methods,
            warning_message=warning
        )

    def _post_process_text(self, text: str) -> str:
        """Clean up the processed text"""
        # Replace multiple asterisks with a placeholder
        text = re.sub(r'\*{5,}', '[PHONE NUMBER REMOVED]', text)
        text = re.sub(r'\*{3,4}', '[REMOVED]', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return text

    def batch_process(self, job_descriptions: List[str]) -> List[DetectionResult]:
        """Process multiple job descriptions efficiently"""
        results = []

        print(f"🔄 Processing {len(job_descriptions)} job descriptions...")

        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(self.sanitize_job_description, desc)
                      for desc in job_descriptions]

            for i, future in enumerate(futures):
                try:
                    result = future.result(timeout=30)
                    results.append(result)
                    print(f"✅ Processed job description {i+1}/{len(job_descriptions)}")
                except Exception as e:
                    print(f"❌ Error processing job description {i+1}: {e}")
                    # Add empty result for failed processing
                    results.append(DetectionResult(
                        original_text=job_descriptions[i],
                        cleaned_text=job_descriptions[i],
                        detected_numbers=[],
                        detection_method=[],
                        warning_message="Processing failed"
                    ))

        return results

    def get_statistics(self, results: List[DetectionResult]) -> Dict:
        """Generate statistics from processing results"""
        stats = {
            'total_processed': len(results),
            'total_detections': sum(len(r.detected_numbers) for r in results),
            'jobs_with_phone_numbers': sum(1 for r in results if r.detected_numbers),
            'detection_methods': {},
            'average_detections_per_job': 0
        }

        # Count detection methods
        all_methods = []
        for result in results:
            all_methods.extend(result.detection_method)

        for method in set(all_methods):
            stats['detection_methods'][method] = all_methods.count(method)

        if stats['total_processed'] > 0:
            stats['average_detections_per_job'] = stats['total_detections'] / stats['total_processed']

        return stats

# Demo and Testing Functions
def create_test_job_descriptions():
    """Create sample job descriptions with various phone number formats for testing"""
    return [
        # Test case 1: US format phone numbers
        """
        Software Engineer Position

        We are looking for a talented software engineer to join our team.
        Requirements include 3+ years of experience in Python development.
        Salary range: $80,000 - $120,000.

        Contact us at (555) 123-4567 or apply online.
        """,

        # Test case 2: International format
        """
        Marketing Manager - Remote

        Exciting opportunity for a marketing professional with 5+ years experience.
        Must have MBA or equivalent degree.

        Call +91 98765 43210 for immediate consideration.
        Please call only between 9 AM - 6 PM IST.
        """,

        # Test case 3: Obfuscated phone numbers
        """
        Customer Service Representative

        Looking for friendly customer service reps. No experience necessary.
        Training provided. Starting salary $15/hour.

        Text me at nine one eight - five five five - one two three four
        or reach out at 918.555.1234 for quick response.
        """,

        # Test case 4: Multiple formats
        """
        Sales Associate - Full Time

        Join our growing sales team! We need motivated individuals.
        Base salary + commission structure.

        Primary contact: +1 (555) 987-6543
        Secondary: 555-987-6543
        WhatsApp: +91-9876543210
        """,

        # Test case 5: No phone numbers (control)
        """
        Data Scientist Position

        We are seeking a data scientist with experience in machine learning
        and statistical analysis. Requirements include:
        - Master's degree in Statistics, CS, or related field
        - 3+ years experience with Python/R
        - Experience with SQL databases

        Competitive salary range: $90,000 - $140,000
        Excellent benefits package included.
        """,

        # Test case 6: False positives (should NOT be detected)
        """
        Project Manager - Tech Startup

        Exciting opportunity at a fast-growing startup founded in 2020.
        We've grown 300% in the last 2 years and secured $5M in funding.

        Looking for someone with:
        - 5+ years of project management experience
        - PMP certification preferred
        - Experience managing teams of 10+ people

        Salary: $85,000 - $115,000 annually
        """
    ]

def run_demonstration():
    """Run a comprehensive demonstration of the system"""
    print("=" * 80)
    print("🚀 JOB DESCRIPTION PHONE NUMBER SANITIZATION SYSTEM")
    print("=" * 80)

    # Initialize the sanitizer
    print("\n📦 Initializing Phone Number Sanitizer...")
    sanitizer = PhoneNumberSanitizer()

    # Create test data
    print("\n📝 Creating test job descriptions...")
    test_descriptions = create_test_job_descriptions()

    # Process each description individually for detailed output
    results = []
    for i, description in enumerate(test_descriptions, 1):
        print(f"\n" + "="*60)
        print(f"📋 TEST CASE {i}")
        print("="*60)

        result = sanitizer.sanitize_job_description(description)
        results.append(result)

        print(f"\n📄 ORIGINAL TEXT:")
        print("-" * 40)
        print(result.original_text.strip())

        print(f"\n✅ CLEANED TEXT:")
        print("-" * 40)
        print(result.cleaned_text.strip())

        if result.detected_numbers:
            print(f"\n🔍 DETECTED PHONE NUMBERS:")
            print("-" * 40)
            for j, (number, method) in enumerate(zip(result.detected_numbers, result.detection_method)):
                print(f"  {j+1}. '{number}' (detected by: {method})")

            print(f"\n⚠️ WARNING MESSAGE:")
            print("-" * 40)
            print(result.warning_message)
        else:
            print(f"\n✅ No phone numbers detected - Clean job description!")

    # Generate and display statistics
    print(f"\n" + "="*60)
    print("📊 PROCESSING STATISTICS")
    print("="*60)

    stats = sanitizer.get_statistics(results)
    print(f"Total job descriptions processed: {stats['total_processed']}")
    print(f"Total phone numbers detected: {stats['total_detections']}")
    print(f"Job descriptions containing phone numbers: {stats['jobs_with_phone_numbers']}")
    print(f"Average detections per job: {stats['average_detections_per_job']:.2f}")

    if stats['detection_methods']:
        print(f"\nDetection method breakdown:")
        for method, count in stats['detection_methods'].items():
            print(f"  - {method}: {count} detections")

    print(f"\n" + "="*80)
    print("✅ DEMONSTRATION COMPLETED SUCCESSFULLY!")
    print("="*80)

    return results, stats

# Main execution
if __name__ == "__main__":
    # Run the demonstration
    results, statistics = run_demonstration()

🚀 JOB DESCRIPTION PHONE NUMBER SANITIZATION SYSTEM

📦 Initializing Phone Number Sanitizer...
✅ spaCy model loaded successfully

📝 Creating test job descriptions...

📋 TEST CASE 1
🔍 Processing completed in 0.059 seconds
📊 Detected 2 phone number(s)

📄 ORIGINAL TEXT:
----------------------------------------
Software Engineer Position
        
        We are looking for a talented software engineer to join our team. 
        Requirements include 3+ years of experience in Python development.
        Salary range: $80,000 - $120,000.
        
        Contact us at (555) 123-4567 or apply online.

✅ CLEANED TEXT:
----------------------------------------
Software Engineer Position We are looking for a talented software engineer to join our team. Requirements include 3+ years of experience in Python development. Salary range: $80,000 - $120,000. Contact us at [PHONE NUMBER REMOVED] or apply online.

🔍 DETECTED PHONE NUMBERS:
----------------------------------------
  1. '(555) 123-4567' (detec

In [3]:
# Run this to see the system in action
results, statistics = run_demonstration()

🚀 JOB DESCRIPTION PHONE NUMBER SANITIZATION SYSTEM

📦 Initializing Phone Number Sanitizer...
✅ spaCy model loaded successfully

📝 Creating test job descriptions...

📋 TEST CASE 1
🔍 Processing completed in 0.022 seconds
📊 Detected 2 phone number(s)

📄 ORIGINAL TEXT:
----------------------------------------
Software Engineer Position
        
        We are looking for a talented software engineer to join our team. 
        Requirements include 3+ years of experience in Python development.
        Salary range: $80,000 - $120,000.
        
        Contact us at (555) 123-4567 or apply online.

✅ CLEANED TEXT:
----------------------------------------
Software Engineer Position We are looking for a talented software engineer to join our team. Requirements include 3+ years of experience in Python development. Salary range: $80,000 - $120,000. Contact us at [PHONE NUMBER REMOVED] or apply online.

🔍 DETECTED PHONE NUMBERS:
----------------------------------------
  1. '(555) 123-4567' (detec