<a href="https://colab.research.google.com/github/SushmithaKasimsettyRamesh/LLM-Privacy-Shield-Privacy-Preserving-NLP-Pipeline-using-GPT-spaCy-Hugging-Face/blob/main/pipeline_fullday2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# A tool that masks PII, sends to OpenAI, and remaps the response

import openai
import re
import json
from typing import Dict, Tuple, Any

# =============================================================================
# 🔧 SETUP: OpenAI API Configuration
# =============================================================================

# Set your OpenAI API key here
# openai.api_key = "your-api-key-here"  # Replace with your actual API key

# For demonstration, we'll use a mock response function
USE_MOCK_RESPONSE = True  # Set to False when you have a real API key

def setup_openai_api(api_key: str = None):
    """
    Configure the OpenAI client, or announce that mock mode is in effect.

    A truthy ``api_key`` is stored on the ``openai`` module and a
    confirmation is printed. With no key (``None`` or empty string) nothing
    is configured and a notice about mock responses is printed instead.
    """
    # Guard clause: without a key there is nothing to configure.
    if not api_key:
        print("⚠️  Using mock responses. Set your API key to use real OpenAI.")
        return

    openai.api_key = api_key
    print("✅ OpenAI API key configured")


In [None]:
# =============================================================================
# 🎭 MASKING ENGINE (From Day 1)
# =============================================================================

def detect_and_mask_pii(text: str) -> Tuple[str, Dict[str, str]]:
    """
    Detect PII in text and replace each occurrence with a placeholder token.

    Tokens have the form ``{{TYPE_N}}`` where ``N`` counts occurrences of
    that PII type from 1, in the order they appear in ``text``. Every
    occurrence gets its own token, even when the same value repeats.

    Args:
        text: The raw user input to scan.

    Returns:
        (masked_text, mapping_dict) where ``mapping_dict`` maps each token to
        the original substring it replaced.
    """
    # (type, regex, flags) in priority order: when two patterns match
    # overlapping text, the earlier entry wins. ADDRESS is listed before NAME
    # so that "123 Main Street" is kept whole instead of NAME claiming
    # "Main Street" first.
    patterns = [
        ('EMAIL', r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 0),
        ('PHONE', r'\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b', 0),
        ('SSN', r'\b\d{3}-\d{2}-\d{4}\b', 0),
        ('CREDIT_CARD', r'\b(?:\d{4}[-\s]?){3}\d{4}\b', 0),
        # Street suffixes are matched case-insensitively ("123 main st").
        ('ADDRESS',
         r'\b\d+\s[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd)\b',
         re.IGNORECASE),
        # Simple name pattern. It MUST stay case-sensitive: capitalisation is
        # the only signal it has, and with IGNORECASE it matches any two
        # adjacent words ("Help me", "to my", ...).
        ('NAME', r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', 0),
    ]

    # Collect non-overlapping spans against the ORIGINAL text, so later
    # patterns never see (or match inside) tokens and every replacement is
    # tied to an exact position rather than to "first textual occurrence".
    claimed = []  # (start, end, pii_type)
    for pii_type, pattern, flags in patterns:
        for match in re.finditer(pattern, text, flags):
            start, end = match.span()
            overlaps = any(
                start < taken_end and taken_start < end
                for taken_start, taken_end, _ in claimed
            )
            if not overlaps:
                claimed.append((start, end, pii_type))

    # Rebuild the text left to right, splicing a token over each claimed span.
    claimed.sort()
    mapping: Dict[str, str] = {}
    token_counters = {pii_type: 1 for pii_type, _, _ in patterns}
    pieces = []
    cursor = 0
    for start, end, pii_type in claimed:
        token = "{{" + f"{pii_type}_{token_counters[pii_type]}" + "}}"
        token_counters[pii_type] += 1
        mapping[token] = text[start:end]
        pieces.append(text[cursor:start])
        pieces.append(token)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), mapping

In [None]:
# =============================================================================
# 🤖 LLM INTERFACE
# =============================================================================

def call_llm(prompt: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Send masked prompt to OpenAI and get response

    When ``USE_MOCK_RESPONSE`` is true the prompt never leaves the process
    and a canned reply from ``generate_mock_response`` is returned instead.

    Args:
        prompt: The (already masked) user prompt.
        model: Chat model name passed through to the API.

    Returns:
        The assistant's reply with surrounding whitespace stripped. On any
        API failure the error is printed and an ``"Error: ..."`` string is
        returned rather than raising, so the pipeline can continue.
    """
    if USE_MOCK_RESPONSE:
        # Mock response for demonstration
        return generate_mock_response(prompt)

    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant. Respond naturally to the user's request."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 500,
        "temperature": 0.7,
    }

    try:
        # openai>=1.0 removed ChatCompletion; the module-level
        # chat.completions client still honours openai.api_key. Fall back to
        # the legacy call for pre-1.0 installs.
        if hasattr(openai, "OpenAI"):
            response = openai.chat.completions.create(**request)
        else:
            response = openai.ChatCompletion.create(**request)

        # content may be None (e.g. a reply with no text); treat as empty.
        content = response.choices[0].message.content
        return (content or "").strip()

    except Exception as e:
        # Deliberate best-effort boundary: report and return an error string.
        print(f"❌ Error calling OpenAI API: {e}")
        return f"Error: Could not get response from LLM - {str(e)}"

def generate_mock_response(prompt: str) -> str:
    """
    Build a canned reply that echoes tokens from the prompt, for testing.

    Two trigger phrases (matched case-insensitively) select a template that
    embeds the prompt's first NAME/EMAIL tokens; any other prompt gets a
    generic sentence mentioning up to two ``{{...}}`` tokens found in it.
    """
    lowered = prompt.lower()

    # Template 1: email drafting.
    if "help me write an email" in lowered:
        sender = extract_name_token(prompt)
        contact = extract_email_token(prompt)
        return "\n".join([
            "Here's a professional email draft:",
            "",
            "Subject: Project Update",
            "",
            "Dear [Boss's Name],",
            "",
            f"I hope this email finds you well. I'm {sender} writing to provide an update on our current project status.",
            "",
            "Best regards,",
            f"{sender}",
            "",
            f"Feel free to contact me at {contact} if you have any questions.",
        ])

    # Template 2: meeting scheduling.
    if "schedule a meeting" in lowered:
        attendee = extract_name_token(prompt)
        contact = extract_email_token(prompt)
        return "\n".join([
            "I'd be happy to help you schedule a meeting. Here's a suggested approach:",
            "",
            f"1. Contact {attendee} at {contact}",
            "2. Propose a few time slots that work for you",
            "3. Confirm the meeting details",
            "",
            "Would you like me to help you draft the scheduling email?",
        ])

    # Fallback: generic reply that may mention the first two tokens.
    mentioned = re.findall(r'\{\{[^}]+\}\}', prompt)[:2]
    parts = ["I understand you're asking about something related to "]
    if mentioned:
        parts.append(", ".join(mentioned) + ". ")
    parts.append("I'd be happy to help you with this request.")
    return "".join(parts)

def extract_name_token(text: str) -> str:
    """Return the first ``{{NAME_n}}`` token in text, or ``{{NAME_1}}`` if none."""
    found = re.search(r'\{\{NAME_\d+\}\}', text)
    if found is None:
        return "{{NAME_1}}"
    return found.group()

def extract_email_token(text: str) -> str:
    """Return the first ``{{EMAIL_n}}`` token in text, or ``{{EMAIL_1}}`` if none."""
    found = re.search(r'\{\{EMAIL_\d+\}\}', text)
    if found is None:
        return "{{EMAIL_1}}"
    return found.group()

In [None]:
# =============================================================================
# 🔄 REMAPPING ENGINE
# =============================================================================

def remap_output(response: str, mapping: Dict[str, str]) -> str:
    """
    Substitute original values back in place of tokens in the LLM response.

    Tokens are classified as used/unused by whether they appear in the
    incoming ``response``; both groups are reported on stdout.
    """
    # Partition the tokens up front, based on the untouched response text.
    present = [token for token in mapping if token in response]
    absent = [token for token in mapping if token not in response]

    restored = response
    for token in present:
        restored = restored.replace(token, mapping[token])

    # Log token usage
    if present:
        print(f"✅ Remapped tokens: {', '.join(present)}")
    if absent:
        print(f"ℹ️  Unused tokens: {', '.join(absent)}")

    return restored

In [None]:
# =============================================================================
# 🔗 COMPLETE PIPELINE
# =============================================================================

def privacy_shield_pipeline(user_input: str, model: str = "gpt-3.5-turbo") -> Dict[str, Any]:
    """
    Run the full flow: input → mask → LLM → remap → output.

    Progress for each stage is printed as it happens. The returned dict
    carries the original input, masked input, token mapping, raw LLM
    response, remapped final output, and the number of tokens detected.
    """
    print("🛡️  LLM Privacy Shield - Processing Request")
    print("=" * 50)

    # --- Stage 1: detect and mask PII ---------------------------------
    print("📝 Stage 1: Masking PII...")
    masked, token_map = detect_and_mask_pii(user_input)
    stage_one_report = (
        ("Original: ", user_input),
        ("Masked:   ", masked),
        ("Mapping:  ", json.dumps(token_map, indent=2)),
    )
    for label, value in stage_one_report:
        print(f"{label}{value}")
    print()

    # --- Stage 2: send the masked text to the LLM ---------------------
    print("🤖 Stage 2: Calling LLM...")
    raw_reply = call_llm(masked, model)
    print(f"LLM Response: {raw_reply}")
    print()

    # --- Stage 3: put the original values back ------------------------
    print("🔄 Stage 3: Remapping values...")
    restored = remap_output(raw_reply, token_map)
    print(f"Final Output: {restored}")
    print()

    # Assemble the complete result record.
    summary: Dict[str, Any] = {}
    summary["original_input"] = user_input
    summary["masked_input"] = masked
    summary["pii_mapping"] = token_map
    summary["llm_response"] = raw_reply
    summary["final_output"] = restored
    summary["tokens_detected"] = len(token_map)
    return summary


In [None]:
# =============================================================================
# 🧪 TEST SUITE
# =============================================================================

def run_test_cases():
    """
    Push a fixed set of sample prompts through the complete pipeline.

    Each scenario is announced with a banner, run through
    ``privacy_shield_pipeline``, and followed by a count of detected tokens.
    """
    print("🧪 Running Test Cases")
    print("=" * 50)

    # (scenario name, prompt text)
    scenarios = [
        (
            "Email Writing",
            "Hi, I'm John Smith from john.smith@company.com. Help me write a professional email to my boss about the project delay.",
        ),
        (
            "Meeting Scheduling",
            "I need to schedule a meeting with Sarah Johnson. Her email is sarah.j@email.com and her phone is 555-123-4567.",
        ),
        (
            "Personal Information",
            "My name is Alice Brown, I live at 123 Main Street, and my SSN is 123-45-6789. Can you help me with a resume?",
        ),
        (
            "No PII",
            "What's the weather like today? I need help with a general question.",
        ),
    ]

    banner_edge = '=' * 20
    number = 0
    for title, prompt_text in scenarios:
        number += 1
        print(f"\n{banner_edge} Test Case {number}: {title} {banner_edge}")
        outcome = privacy_shield_pipeline(prompt_text)

        print(f"✅ Test completed. Detected {outcome['tokens_detected']} PII tokens.")
        print("-" * 80)


In [None]:

# =============================================================================
# 🚀 MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    # Setup: pass your API key here to use real OpenAI, e.g.
    # setup_openai_api("your-api-key-here"). With no argument the
    # demonstration runs against mock responses.
    setup_openai_api()

    print("\n🎯 LLM Privacy Shield - Day 2 Complete Pipeline")
    print("=" * 60)

    # Automated scenarios first.
    run_test_cases()

    # Then an optional interactive round.
    print("\n" + "=" * 60)
    print("🎮 Interactive Demo")
    print("Try your own input! (or press Enter to skip)")

    user_input = input("\nEnter your text: ").strip()

    if not user_input:
        print("\n✅ Demo completed with test cases!")
    else:
        print("\n" + "=" * 60)
        result = privacy_shield_pipeline(user_input)

        print(f"""
🎉 PIPELINE COMPLETE!
==================
✅ Original safely processed
✅ {result['tokens_detected']} PII tokens detected and protected
✅ LLM response successfully remapped
✅ Zero data leakage to external services
""")

# =============================================================================
# 📊 PIPELINE STATISTICS
# =============================================================================

def get_pipeline_stats(results: Dict[str, Any]) -> str:
    """
    Summarise one pipeline run as a multi-line text report.

    Reads ``pii_mapping``, ``tokens_detected``, ``original_input`` and
    ``final_output`` from ``results``. The distinct-type count is taken from
    the part of each token before its first underscore.
    """
    type_prefixes = {token.partition('_')[0] for token in results['pii_mapping']}
    detected = results['tokens_detected']
    original_chars = len(results['original_input'])
    final_chars = len(results['final_output'])
    if detected > 0:
        protection = '✅ ACTIVE'
    else:
        protection = '⚠️ No PII detected'

    # Leading and trailing empty entries reproduce the blank first/last lines.
    report_lines = [
        "",
        "📊 Pipeline Statistics:",
        f"- PII Types Detected: {len(type_prefixes)}",
        f"- Total Tokens: {detected}",
        f"- Original Length: {original_chars} chars",
        f"- Final Length: {final_chars} chars",
        f"- Privacy Protection: {protection}",
        "",
    ]
    return "\n".join(report_lines)

# Readiness banner, emitted when this cell runs: a ruled title followed by
# the feature list and instructions for switching to the real API.
print(
    "\n" + "=" * 60,
    "🛡️  LLM Privacy Shield Day 2 - READY TO RUN!",
    "=" * 60,
    """
✅ Features implemented:
- PII detection and masking
- OpenAI API integration (with mock fallback)
- Smart remapping engine
- Complete pipeline automation
- Comprehensive testing suite
- Error handling and logging

🚀 To use with real OpenAI API:
1. Get your API key from OpenAI
2. Set USE_MOCK_RESPONSE = False
3. Add your API key to setup_openai_api()
4. Run the pipeline!
""",
    sep="\n",
)

⚠️  Using mock responses. Set your API key to use real OpenAI.

🎯 LLM Privacy Shield - Day 2 Complete Pipeline
🧪 Running Test Cases

🛡️  LLM Privacy Shield - Processing Request
📝 Stage 1: Masking PII...
Original: Hi, I'm John Smith from john.smith@company.com. Help me write a professional email to my boss about the project delay.
Masked:   Hi, I'm {{NAME_1}} from {{EMAIL_1}}. {{NAME_2}} write a {{NAME_3}} {{NAME_4}} {{NAME_5}} {{NAME_6}} delay.
Mapping:  {
  "{{EMAIL_1}}": "john.smith@company.com",
  "{{NAME_1}}": "John Smith",
  "{{NAME_2}}": "Help me",
  "{{NAME_3}}": "professional email",
  "{{NAME_4}}": "to my",
  "{{NAME_5}}": "boss about",
  "{{NAME_6}}": "the project"
}

🤖 Stage 2: Calling LLM...
LLM Response: I understand you're asking about something related to {{NAME_1}}, {{EMAIL_1}}. I'd be happy to help you with this request.

🔄 Stage 3: Remapping values...
✅ Remapped tokens: {{EMAIL_1}}, {{NAME_1}}
ℹ️  Unused tokens: {{NAME_2}}, {{NAME_3}}, {{NAME_4}}, {{NAME_5}}, {{NAME_6