In [1]:
# Install required packages for local reasoning
!pip install transformers torch accelerate --quiet

# Import libraries
import pandas as pd
import numpy as np
import torch
import warnings
import json
import os
from pathlib import Path
import time
from datetime import datetime

warnings.filterwarnings('ignore')

print("🚀 Fraud Detection Reasoning Environment Setup")
print("=" * 50)
print(f"✅ GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️  Using CPU - consider enabling GPU accelerator")

print("✅ Environment ready for local reasoning!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# 🧠 Local Fraud Detection Reasoning on Kaggle

This notebook provides **local AI-powered reasoning** for fraud detection using Kaggle's GPU resources instead of paid APIs.

## 🚀 SETUP INSTRUCTIONS FOR KAGGLE

### Step 1: Upload Your Trained Model
1. **Create a Kaggle Dataset** with your trained model files:
   - Upload your `distilbert_model/` folder (contains config.json, model.safetensors)
   - Upload your `distilbert_tokenizer/` folder (contains tokenizer files)
   - Name your dataset (e.g., "fraud-detection-models")

### Step 2: Add Dataset to This Notebook
1. **Add your dataset as input** to this notebook:
   - Click "Add data" → "Your datasets" → Select your model dataset
   - This makes your model files available at `/kaggle/input/your-dataset-name/`

### Step 3: Update Model Paths
1. **Update the paths in Cell 4** to match your dataset name:
   ```python
   MODEL_PATH = '/kaggle/input/YOUR-DATASET-NAME/distilbert_model'
   TOKENIZER_PATH = '/kaggle/input/YOUR-DATASET-NAME/distilbert_tokenizer'
   ```

### Step 4: Run the Notebook
1. **Enable GPU accelerator** for faster inference
2. **Run all cells** to load your model and start analyzing texts

## 📊 Pipeline Flow
1. **Load** your trained DistilBERT model from Kaggle dataset
2. **Classify** texts into fraud categories using your actual model  
3. **Generate reasoning** using local LM for non-legitimate classifications
4. **Download** results with explanations

## 🔧 Requirements
- Your trained `distilbert_model/` and `distilbert_tokenizer/` uploaded as a Kaggle dataset
- GPU accelerator enabled for faster inference
- No API keys needed - everything runs locally!

## ⚠️ Important Notes
- **Without proper model upload**: Notebook will run in demo mode with simulated results
- **With proper model upload**: You get real AI-powered fraud detection and reasoning
- The reasoning engine works with ANY classification result (real or demo)

In [2]:
# Load Local Language Model for Reasoning (Free Alternative to APIs)
#
# This cell binds `reasoning_pipe`, a Hugging Face text-generation pipeline. The
# engine-initialization cell later passes it to LocalFraudReasoningEngine.
# NOTE(review): AutoTokenizer and AutoModelForCausalLM are imported but not used in this cell.
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

print("🧠 Loading Local Language Model for Reasoning...")

# Use a smaller, efficient model that works well on Kaggle's free tier
# Options: 'microsoft/DialoGPT-medium', 'gpt2', 'distilgpt2'
reasoning_model_name = "microsoft/DialoGPT-medium"  # Good balance of quality and speed

try:
    # Initialize reasoning pipeline with sampling enabled and a 512-token length cap
    reasoning_pipe = pipeline(
        "text-generation",
        model=reasoning_model_name,
        device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
        do_sample=True,
        temperature=0.7,
        max_length=512,
        pad_token_id=50256  # Set pad token to avoid warnings
    )
    
    print(f"✅ Local reasoning model loaded: {reasoning_model_name}")
    print("💡 This model will generate explanations locally (no API costs!)")
    
    # Smoke test: one short generation. The result is not inspected; the call only
    # has to complete without raising.
    # NOTE(review): the recorded output shows a truncation warning at this point,
    # presumably because max_length is passed without truncation=True - confirm.
    test_prompt = "This text appears to be a scam because"
    test_response = reasoning_pipe(test_prompt, max_length=50, num_return_sequences=1)
    print("🧪 Model test successful!")
    
except Exception as e:
    # Reached when either the pipeline construction or the smoke test above raises.
    print(f"⚠️  Error loading model: {e}")
    print("Falling back to simpler model...")
    # Fallback to smaller model. It is built without the sampling/length/pad settings
    # used above, and a failure here is not caught.
    reasoning_pipe = pipeline("text-generation", model="distilgpt2", device=0 if torch.cuda.is_available() else -1)

2025-09-16 15:42:53.796829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758037373.976640      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758037374.031951      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🧠 Loading Local Language Model for Reasoning...


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


✅ Local reasoning model loaded: microsoft/DialoGPT-medium
💡 This model will generate explanations locally (no API costs!)
🧪 Model test successful!


In [9]:
# Load Your Trained DistilBERT Fraud Detection Model
#
# This cell binds the globals used by classify_text(): `fraud_model`, `fraud_tokenizer`,
# `CLASS_LABELS` and (only when loading succeeds) `device`. When the files are missing
# or loading fails, `fraud_model` and `fraud_tokenizer` are set to None, which
# classify_text() treats as "demo mode".
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

print("📦 Loading Your Trained DistilBERT Model...")

# KAGGLE PATHS - Update these to match your uploaded dataset name
MODEL_PATH = '/kaggle/input/distilbert/transformers/default/1/distilbert_model'  # Update this path to your dataset
TOKENIZER_PATH = '/kaggle/input/distilbert/transformers/default/1/distilbert_tokenizer'  # Update this path to your dataset



# Class labels (must match your training - alphabetical order).
# Index i in this list is used as the name of model output i.
CLASS_LABELS = [
    'job_scam',
    'legitimate', 
    'phishing',
    'popup_scam',
    'refund_scam',
    'reward_scam',
    'sms_spam',
    'ssn_scam',
    'tech_support_scam'
]

print(f"🔍 Checking paths:")
print(f"   Model: {MODEL_PATH}")
print(f"   Tokenizer: {TOKENIZER_PATH}")

# Check if paths exist
import os
model_exists = os.path.exists(MODEL_PATH)
tokenizer_exists = os.path.exists(TOKENIZER_PATH)
print(f"   Model exists: {model_exists}")
print(f"   Tokenizer exists: {tokenizer_exists}")

if not model_exists or not tokenizer_exists:
    print("\n❌ Model files not found!")
    print("📁 Make sure you've uploaded your model files to Kaggle:")
    print("   1. Go to Kaggle Datasets")
    print("   2. Create a new dataset")
    print("   3. Upload your 'distilbert_model/' and 'distilbert_tokenizer/' folders")
    print("   4. Update the paths above to match your dataset name")
    print("   5. Add your dataset as input to this notebook")
    fraud_model = None
    fraud_tokenizer = None
else:
    try:
        # Load your trained model and tokenizer
        print("🔄 Loading tokenizer...")
        fraud_tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_PATH)
        
        print("🔄 Loading model...")
        fraud_model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
        
        # The label list is hand-maintained, so warn when it cannot line up with the
        # model head: a mismatch means predictions get the wrong name (or an index error).
        if fraud_model.config.num_labels != len(CLASS_LABELS):
            print(f"⚠️  Model has {fraud_model.config.num_labels} output labels but CLASS_LABELS has {len(CLASS_LABELS)} - predictions may be mislabeled!")
        
        # Move to GPU for faster inference
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        fraud_model.to(device)
        fraud_model.eval()
        
        print(f"✅ DistilBERT model loaded successfully!")
        print(f"🎯 Device: {device}")
        # 'legitimate' is itself an entry of CLASS_LABELS, so it must not be counted as a fraud type.
        print(f"📋 Classes: {sum(label != 'legitimate' for label in CLASS_LABELS)} fraud types + legitimate")
        print(f"🏷️  Labels: {CLASS_LABELS}")
        
        # Quick test to verify model works: a benign delivery notice should come back 'legitimate'
        test_text = "Your package has been successfully delivered and left at your front door. If you do not locate the parcel, please check with members of your household or nearby areas where it may have been placed for security."
        test_encoding = fraud_tokenizer(
            test_text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        test_input_ids = test_encoding['input_ids'].to(device)
        test_attention_mask = test_encoding['attention_mask'].to(device)
        
        with torch.no_grad():
            test_outputs = fraud_model(input_ids=test_input_ids, attention_mask=test_attention_mask)
            # Softmax over the class dimension, then take the single row of the batch.
            test_probabilities = torch.softmax(test_outputs.logits, dim=1).cpu().numpy()[0]
            test_predicted_class = CLASS_LABELS[np.argmax(test_probabilities)]
        
        print(f"\n🧪 Model test - '{test_text}':")
        print(f"   Predicted: {test_predicted_class} ({test_probabilities[np.argmax(test_probabilities)]:.2%})")
        print("   ✅ Model is working!" if test_predicted_class == 'legitimate' else f"   ⚠️ Unexpected result: {test_predicted_class}")
        
    except Exception as e:
        # Any failure above (loading or the smoke test) disables the real model.
        print(f"❌ Error loading model: {e}")
        print("🔧 This will cause the classification to use demo mode.")
        fraud_model = None
        fraud_tokenizer = None

📦 Loading Your Trained DistilBERT Model...
🔍 Checking paths:
   Model: /kaggle/input/distilbert/transformers/default/1/distilbert_model
   Tokenizer: /kaggle/input/distilbert/transformers/default/1/distilbert_tokenizer
   Model exists: True
   Tokenizer exists: True
🔄 Loading tokenizer...
🔄 Loading model...
✅ DistilBERT model loaded successfully!
🎯 Device: cuda
📋 Classes: 9 fraud types + legitimate
🏷️  Labels: ['job_scam', 'legitimate', 'phishing', 'popup_scam', 'refund_scam', 'reward_scam', 'sms_spam', 'ssn_scam', 'tech_support_scam']

🧪 Model test - 'Your package has been successfully delivered and left at your front door. If you do not locate the parcel, please check with members of your household or nearby areas where it may have been placed for security.':
   Predicted: legitimate (72.74%)
   ✅ Model is working!


# 🔧 Local Reasoning Engine Configuration

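This section sets up the local reasoning engine that produces explanations for fraud classifications. The engine receives the language-model pipeline loaded earlier, but the explanations themselves are assembled from keyword-based indicator checks and fixed templates rather than from generated text.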

In [4]:
# Local Reasoning Engine Configuration
class LocalFraudReasoningEngine:
    """
    Local reasoning engine that generates explanations without API calls.

    Explanations are assembled from keyword checks and fixed templates. The
    pipeline handed to the constructor is stored on the instance but is not
    invoked by any method of this class.

    Attributes:
        reasoning_pipe: The object passed to the constructor, stored unchanged.
        min_confidence: Minimum confidence (0.5) for an explanation to be produced.
        scam_descriptions: Maps a scam label to a dict with a 'description'
            string and a list of 'indicators' phrases.
        stats: Running counters. This class only initialises them to zero;
            callers are expected to increment them.
    """

    # Keyword groups checked against every text regardless of the predicted label.
    # Matching is by substring of the lower-cased text (so 'now' also matches 'know').
    _URGENT_WORDS = ('urgent', 'immediate', 'now', 'quickly', 'hurry', 'expires')
    _MONEY_WORDS = ('$', 'money', 'prize', 'won', 'claim', 'free', 'gift')
    _ACTION_WORDS = ('click', 'call', 'text', 'visit', 'send', 'verify')

    # One-sentence consequence appended to the explanation for each known scam label.
    _DANGER_CONTEXT = {
        'phishing': 'This could lead to identity theft and financial loss.',
        'sms_spam': 'This could lead to unwanted charges and privacy violations.',
        'reward_scam': 'This could lead to financial scams and personal data theft.',
        'tech_support_scam': 'This could lead to remote access scams and financial fraud.',
        'job_scam': 'This could lead to advance fee fraud and identity theft.',
        'popup_scam': 'This could lead to malware installation and system compromise.',
        'refund_scam': 'This could lead to payment fraud and account takeover.',
        'ssn_scam': 'This could lead to identity theft and government impersonation fraud.'
    }

    def __init__(self, reasoning_pipeline):
        """Store the pipeline, the confidence threshold, the scam catalogue and zeroed counters.

        Args:
            reasoning_pipeline: Any object; kept on ``self.reasoning_pipe`` and not called here.
        """
        self.reasoning_pipe = reasoning_pipeline
        self.min_confidence = 0.5

        # Scam type descriptions for better reasoning
        self.scam_descriptions = {
            'phishing': {
                'description': 'Attempts to steal sensitive information like passwords, credit card numbers, or personal data',
                'indicators': ['urgent action required', 'verify account', 'click here', 'suspicious links', 'fake sender']
            },
            'popup_scam': {
                'description': 'Fake popup messages claiming virus infections or system issues',
                'indicators': ['virus detected', 'system error', 'immediate action', 'fake technical alerts']
            },
            'sms_spam': {
                'description': 'Unwanted promotional or fraudulent text messages',
                'indicators': ['unsolicited offers', 'prize claims', 'urgent responses', 'suspicious phone numbers']
            },
            'reward_scam': {
                'description': 'False promises of rewards, prizes, or free items',
                'indicators': ['congratulations', 'you have won', 'free gift', 'claim now', 'limited time']
            },
            'tech_support_scam': {
                'description': 'Fake technical support claiming to fix computer problems',
                'indicators': ['computer infected', 'microsoft support', 'remote access', 'technical issues']
            },
            'refund_scam': {
                'description': 'Fake refund notifications or requests for payment information',
                'indicators': ['refund available', 'payment failed', 'update payment', 'billing issue']
            },
            'ssn_scam': {
                'description': 'Attempts to steal Social Security Numbers or similar personal identifiers',
                'indicators': ['SSN verification', 'social security', 'identity verification', 'government agency']
            },
            'job_scam': {
                'description': 'Fake job offers or employment opportunities',
                'indicators': ['work from home', 'easy money', 'no experience required', 'guaranteed income']
            }
        }

        self.stats = {
            'total_processed': 0,
            'reasoning_generated': 0,
            'skipped_legitimate': 0,
            'skipped_low_confidence': 0
        }

    def should_generate_reasoning(self, predicted_label, confidence):
        """Determine if reasoning should be generated.

        Returns a truthy value only when the label is not 'legitimate' and
        the confidence is at least ``self.min_confidence``.
        """
        return predicted_label != 'legitimate' and confidence >= self.min_confidence

    def generate_local_reasoning(self, text, predicted_label, confidence, all_predictions):
        """Build a multi-line, rule-based explanation for a classification.

        Args:
            text: The analysed message. ``None`` is treated as an empty string.
            predicted_label: Label used to look up the description, indicators and risk note.
                Unknown labels fall back to 'Unknown scam type' with no indicators.
            confidence: Confidence of the prediction; formatted as a percentage.
            all_predictions: Accepted for interface compatibility; not used.

        Returns:
            str: The explanation, with sections separated by blank lines.
        """
        scam_info = self.scam_descriptions.get(predicted_label, {
            'description': 'Unknown scam type',
            'indicators': []
        })

        # Analyze text content directly (no language-model generation involved).
        text = '' if text is None else str(text)
        text_lower = text.lower()
        detected_indicators = []

        # A label-specific indicator phrase counts as detected when ANY of its words
        # occurs in the text. The phrase is lower-cased first: the text is lower-cased,
        # so an upper-case word such as 'SSN' could otherwise never match.
        for indicator in scam_info['indicators']:
            if any(word in text_lower for word in indicator.lower().split()):
                detected_indicators.append(indicator)

        # Add common fraud patterns
        if any(word in text_lower for word in self._URGENT_WORDS):
            detected_indicators.append('urgent language to pressure victims')

        if any(word in text_lower for word in self._MONEY_WORDS):
            detected_indicators.append('financial incentives or rewards')

        if any(word in text_lower for word in self._ACTION_WORDS):
            detected_indicators.append('requests for immediate action')

        # Any URL marker, '@' or digit is treated as contact information.
        if 'http' in text_lower or '@' in text_lower or any(char.isdigit() for char in text):
            detected_indicators.append('suspicious contact information')

        # Create comprehensive reasoning
        reasoning_parts = [
            f"This text was classified as {predicted_label} with {confidence:.1%} confidence.",
            f"\n{scam_info['description']}",
        ]

        if detected_indicators:
            reasoning_parts.append("\nKey fraud indicators detected:")
            # Limit to the first 4 detected indicators.
            reasoning_parts.extend(f"• {indicator}" for indicator in detected_indicators[:4])

        # Add context about why this is dangerous
        if predicted_label in self._DANGER_CONTEXT:
            reasoning_parts.append(f"\n⚠️ Risk: {self._DANGER_CONTEXT[predicted_label]}")

        # Add confidence context. Both branches start with a newline so the note is
        # separated from the previous section by a blank line.
        if confidence > 0.9:
            reasoning_parts.append(f"\nHigh confidence ({confidence:.1%}) indicates strong fraud patterns.")
        elif confidence > 0.7:
            reasoning_parts.append(f"\nModerate confidence ({confidence:.1%}) suggests probable fraud patterns.")

        return '\n'.join(reasoning_parts)

# Initialize the local reasoning engine.
# `reasoning_pipe` is the pipeline bound by the model-loading cell. The resulting
# `local_reasoning_engine` is the module-level instance that
# analyze_with_local_reasoning() reads and whose `stats` counters it updates.
local_reasoning_engine = LocalFraudReasoningEngine(reasoning_pipe)
print("✅ Local reasoning engine initialized with enhanced analysis!")

✅ Local reasoning engine initialized with enhanced analysis!


In [5]:
# Fraud Classification and Reasoning Functions
def classify_text(text, max_length=128):
    """Classify text using the loaded DistilBERT model.

    Reads the module-level ``fraud_model``, ``fraud_tokenizer``, ``device`` and
    ``CLASS_LABELS``. When the model or tokenizer is ``None`` the function runs in
    demo mode and derives a simulated label from simple keyword checks.

    Args:
        text: The message to classify.
        max_length: Token length the input is padded/truncated to.

    Returns:
        dict with keys:
            'text': the input text,
            'predicted_label': winning label, or 'error' if classification raised,
            'confidence': probability of the winning label (0.0 on error),
            'all_predictions': label -> probability mapping,
            'demo_mode': True for simulated results and for errors, False otherwise.
    """
    if fraud_model is None or fraud_tokenizer is None:
        print("🚨 WARNING: Model not loaded! Using demo mode.")
        print("💡 Please check model paths and ensure your dataset is properly uploaded to Kaggle.")
        print("🔄 Demo mode uses simple keyword heuristics - this is NOT real classification!")

        # Simulated label that varies with the text content. Checks are substring
        # matches on the lower-cased text, evaluated in this order.
        # NOTE(review): 'hi ' also matches inside words such as 'this '.
        text_lower = text.lower()
        if any(word in text_lower for word in ['thanks', 'meeting', 'delivered', 'shipped', 'hi ', 'hello']):
            demo_label = 'legitimate'
            demo_conf = 0.75
        elif any(word in text_lower for word in ['urgent', 'click', 'verify', 'suspended']):
            demo_label = 'phishing'
            demo_conf = 0.85
        elif any(word in text_lower for word in ['won', 'prize', 'congratulations']):
            demo_label = 'reward_scam'
            demo_conf = 0.80
        else:
            demo_label = 'phishing'  # Default fallback
            demo_conf = 0.70

        # Start from the winning label, then fill in the placeholder scores without
        # overwriting it. (A single dict literal with repeated keys would let the
        # later placeholder replace the winning score, e.g. for 'reward_scam'.)
        demo_predictions = {demo_label: demo_conf}
        for label, placeholder in (
            ('legitimate', 0.15),
            ('phishing', 0.10),
            ('reward_scam', 0.05),
            ('tech_support_scam', 0.05),
        ):
            demo_predictions.setdefault(label, placeholder)

        return {
            'text': text,
            'predicted_label': demo_label,
            'confidence': demo_conf,
            'all_predictions': demo_predictions,
            'demo_mode': True
        }

    # Real model classification
    try:
        # Tokenize input
        encoding = fraud_tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Move to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get predictions (softmax over the class dimension, first row of the batch)
        with torch.no_grad():
            outputs = fraud_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]
            predicted_class_id = np.argmax(probabilities)

        # Format results: output index i is named CLASS_LABELS[i]
        predicted_label = CLASS_LABELS[predicted_class_id]
        confidence = float(probabilities[predicted_class_id])

        all_predictions = {
            CLASS_LABELS[i]: float(probabilities[i])
            for i in range(len(CLASS_LABELS))
        }

        return {
            'text': text,
            'predicted_label': predicted_label,
            'confidence': confidence,
            'all_predictions': all_predictions,
            'demo_mode': False
        }

    except Exception as e:
        # Best effort: report the failure and return an 'error' result instead of raising.
        # 'demo_mode' stays True here because analyze_with_local_reasoning() uses that
        # flag to decide whether to print its classification-error warning.
        print(f"❌ Error during classification: {e}")
        return {
            'text': text,
            'predicted_label': 'error',
            'confidence': 0.0,
            'all_predictions': {'error': 1.0},
            'demo_mode': True
        }

def analyze_with_local_reasoning(text):
    """Run classification and, where warranted, attach a local explanation.

    Calls classify_text() and then asks the module-level ``local_reasoning_engine``
    whether an explanation should be produced for the resulting label/confidence.
    The engine's ``stats`` counters are updated as a side effect.

    Returns:
        dict: every key of the classification result plus 'reasoning'
        (str or None), 'reasoning_generated' (bool), 'skip_reason'
        (str or None) and 'timestamp' (ISO-8601 string of the current time).
    """
    outcome = classify_text(text)
    label = outcome['predicted_label']
    score = outcome['confidence']
    engine = local_reasoning_engine
    counters = engine.stats

    # Simulated and failed classifications are flagged to the user up front.
    if outcome.get('demo_mode', False):
        print(
            "🚨 CLASSIFICATION ERROR - Please check your model setup!"
            if label == 'error'
            else "🚨 DEMO MODE ACTIVE - Results are simulated, not real model predictions!"
        )

    if engine.should_generate_reasoning(label, score):
        # Explanation path: the engine decides based on label and confidence.
        reasoning = engine.generate_local_reasoning(
            text=outcome['text'],
            predicted_label=label,
            confidence=score,
            all_predictions=outcome['all_predictions'],
        )
        counters['reasoning_generated'] += 1
        skip_reason = None
        reasoning_generated = True
    else:
        # Skip path: record why, and which counter the skip belongs to.
        # Classification errors share the low-confidence counter.
        reasoning = None
        reasoning_generated = False
        if label == 'legitimate':
            skip_reason, counter_key = 'legitimate_classification', 'skipped_legitimate'
        elif label == 'error':
            skip_reason, counter_key = 'classification_error', 'skipped_low_confidence'
        else:
            skip_reason, counter_key = f"low_confidence_{score:.2f}", 'skipped_low_confidence'
        counters[counter_key] += 1

    counters['total_processed'] += 1

    report = dict(outcome)
    report.update({
        'reasoning': reasoning,
        'reasoning_generated': reasoning_generated,
        'skip_reason': skip_reason,
        'timestamp': datetime.now().isoformat(),
    })
    return report

def print_analysis_result(result):
    """Print a human-readable report for one analysis result.

    Expects the dict produced by analyze_with_local_reasoning(): it reads
    'text', 'predicted_label', 'confidence', 'all_predictions',
    'reasoning_generated', and either 'reasoning' or 'skip_reason'.
    The per-label probability table is shown only when 'demo_mode' is falsy.
    """
    rule = "=" * 80
    simulated = result.get('demo_mode', False)

    print("\n" + rule)
    print("🔍 LOCAL FRAUD DETECTION + REASONING ANALYSIS")
    print(rule)

    # Demo-mode banner goes first so simulated numbers are never mistaken for real ones.
    if simulated:
        print("🚨 DEMO MODE ACTIVE - NOT REAL MODEL PREDICTIONS!")
        print("📋 Upload your trained model to Kaggle and update paths to get real results")
        print(rule)

    print("\n📝 Original Text:")
    print(f"   {result['text']}")

    print("\n🎯 Classification:")
    print(f"   Label: {result['predicted_label']}")
    print(f"   Confidence: {result['confidence']:.2%}")

    if not simulated:
        print("\n📊 All Predictions:")
        # Highest probability first.
        ranked = sorted(result['all_predictions'].items(), key=lambda pair: pair[1], reverse=True)
        for name, probability in ranked:
            print(f"   {name}: {probability:.2%}")

    if not result['reasoning_generated']:
        print(f"\n⏭️  Reasoning Skipped: {result['skip_reason']}")
    else:
        print("\n🧠 Local AI Reasoning:")
        # Indent every line of the explanation by three spaces.
        print("\n".join("   " + line for line in result['reasoning'].split('\n')))

    print("\n" + rule)

# Cell-completion banner. These lines print unconditionally; they do not check
# whether the DistilBERT model was actually loaded.
print("✅ Classification and reasoning functions ready!")
print("🚀 Ready to analyze texts with local AI reasoning!")
print("💡 Note: Make sure to upload your trained model to Kaggle for real predictions!")

✅ Classification and reasoning functions ready!
🚀 Ready to analyze texts with local AI reasoning!
💡 Note: Make sure to upload your trained model to Kaggle for real predictions!


# 🧪 Sample Tests - Try Different Fraud Types

Let's test the local reasoning system with various types of fraudulent and legitimate messages.

In [6]:
# Sample Test Cases for Different Fraud Types
# Each entry has:
#   'category' - a human-readable label, printed as the test heading only
#   'text'     - the message passed to analyze_with_local_reasoning()
# The category is never compared against the model's prediction.
sample_texts = [
    {
        'category': 'Phishing Attack',
        'text': "URGENT: Your PayPal account has been suspended due to suspicious activity. Click here immediately to verify your information and restore access: http://paypal-verification-secure.fraudsite.com"
    },
    {
        'category': 'Tech Support Scam', 
        'text': "WARNING: Your computer is infected with 5 viruses! Your files will be deleted in 24 hours. Call Microsoft Support immediately at 1-800-555-SCAM. Don't restart your computer or you'll lose everything!"
    },
    {
        'category': 'Reward Scam',
        'text': "🎉 CONGRATULATIONS! 🎉 You've been selected as our LUCKY WINNER for a $1000 Amazon gift card! You're one of only 3 winners today! Claim your prize now by clicking this link and entering your credit card info for verification. Hurry, expires in 1 hour!"
    },
    {
        'category': 'Job Scam',
        'text': "Amazing work from home opportunity! Earn $5000/week working just 2 hours per day! No experience required! Just send $99 registration fee and start earning today! Guaranteed income or money back!"
    },
    {
        'category': 'SMS Spam',
        'text': "FREE iPhone 15 Pro! You have been randomly selected as a winner. Text CLAIM to 12345 or visit bit.ly/freeiphone15winner to get your prize. Message and data rates may apply. Text STOP to opt out."
    },
    {
        # Control case: an ordinary order confirmation.
        'category': 'Legitimate Message',
        'text': "Hi Sarah, thank you for your order #12345. Your package has been shipped and will arrive within 3-5 business days. You can track your shipment using the tracking number provided in your confirmation email. Have a great day!"
    },
    {
        'category': 'SSN Scam',
        'text': "IMPORTANT NOTICE: Your Social Security Number has been suspended due to suspicious illegal activity. Call the SSA office immediately at 1-800-555-FAKE to verify your identity and reactivate your SSN. Failure to respond will result in arrest."
    }
]

print("🧪 Testing Local AI Reasoning on Sample Fraud Types")

# Run every sample through the full pipeline; headings are numbered from 1.
for test_number, sample in enumerate(sample_texts, start=1):
    print(f"\n🎯 Test {test_number}: {sample['category']}")
    print("-" * 50)

    result = analyze_with_local_reasoning(sample['text'])
    print_analysis_result(result)

# Summary of the engine counters. They are cumulative for the engine instance,
# so they also include any analyses run before this cell.
print("\n📊 Summary:")
for caption, stat_key in (
    ("Total Processed", 'total_processed'),
    ("Reasoning Generated", 'reasoning_generated'),
    ("Legitimate (Skipped)", 'skipped_legitimate'),
):
    print(f"{caption}: {local_reasoning_engine.stats[stat_key]}")


🧪 Testing Local AI Reasoning on Sample Fraud Types

🎯 Test 1: Phishing Attack
--------------------------------------------------

🔍 LOCAL FRAUD DETECTION + REASONING ANALYSIS

📝 Original Text:
   URGENT: Your PayPal account has been suspended due to suspicious activity. Click here immediately to verify your information and restore access: http://paypal-verification-secure.fraudsite.com

🎯 Classification:
   Label: sms_spam
   Confidence: 72.69%

📊 All Predictions:
   sms_spam: 72.69%
   phishing: 26.92%
   legitimate: 0.20%
   job_scam: 0.06%
   refund_scam: 0.06%
   popup_scam: 0.04%
   ssn_scam: 0.01%
   tech_support_scam: 0.01%
   reward_scam: 0.01%

🧠 Local AI Reasoning:
   This text was classified as sms_spam with 72.7% confidence.
   
   Unwanted promotional or fraudulent text messages
   
   Key fraud indicators detected:
   • urgent responses
   • suspicious phone numbers
   • urgent language to pressure victims
   • requests for immediate action
   
   ⚠️ Risk: This could lead

# 📝 Interactive Text Analysis

Enter your own text below to analyze with the local fraud detection + reasoning system.

In [7]:
# Interactive Text Analysis
# Change the text below to analyze your own messages!

your_text = "Congratulations! You've won $1 million! Send your bank details to claim your prize!"

# Analyze your custom text.
# Leading/trailing whitespace is stripped before classification; the stripped
# string is also what the report prints as "Original Text".
print("🔍 Analyzing Your Custom Text...")
custom_result = analyze_with_local_reasoning(your_text.strip())
print_analysis_result(custom_result)

🔍 Analyzing Your Custom Text...

🔍 LOCAL FRAUD DETECTION + REASONING ANALYSIS

📝 Original Text:
   Congratulations! You've won $1 million! Send your bank details to claim your prize!

🎯 Classification:
   Label: phishing
   Confidence: 50.99%

📊 All Predictions:
   phishing: 50.99%
   reward_scam: 46.17%
   sms_spam: 2.20%
   legitimate: 0.51%
   job_scam: 0.06%
   refund_scam: 0.04%
   tech_support_scam: 0.01%
   ssn_scam: 0.00%
   popup_scam: 0.00%

🧠 Local AI Reasoning:
   This text was classified as phishing with 51.0% confidence.
   
   Attempts to steal sensitive information like passwords, credit card numbers, or personal data
   
   Key fraud indicators detected:
   • financial incentives or rewards
   • requests for immediate action
   • suspicious contact information
   
   ⚠️ Risk: This could lead to identity theft and financial loss.



# 📊 Batch Processing - Analyze Multiple Texts

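Analyze multiple texts at once with local reasoning and save the results to a CSV file. To process your own data, upload a CSV file, load it (for example with `pd.read_csv`), and pass its text column to `batch_analyze_texts`.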

In [10]:
# Batch Processing with Local Reasoning
def batch_analyze_texts(texts, save_results=True, delay=0.0):
    """Analyze multiple texts and generate local reasoning.

    Args:
        texts: Iterable of messages (list, Series, generator, ...).
        save_results: When True, write the results to a timestamped CSV file
            (``fraud_analysis_results_YYYYMMDD_HHMMSS.csv``) in the working directory.
        delay: Optional pause in seconds after each text. Defaults to 0 because
            each analysis call is synchronous, so no throttling is needed.

    Returns:
        tuple: ``(results, df_results)`` where ``results`` is the list of full
        result dicts from analyze_with_local_reasoning() and ``df_results`` is a
        preview DataFrame with 'text' shortened to 100 and 'reasoning' to 200
        characters (plus '...'). The DataFrame always has the same six columns,
        even for empty input. The saved CSV holds the untruncated values.
    """
    texts = list(texts)  # allow generators and other unsized iterables
    total = len(texts)
    results = []

    print(f"🔄 Processing {total} texts...")

    for i, text in enumerate(texts):
        if i % 5 == 0:  # Only print every 5th item to reduce clutter
            print(f"Progress: {i+1}/{total}")

        results.append(analyze_with_local_reasoning(text))

        if delay > 0:
            time.sleep(delay)

    def _shorten(value, limit):
        """Return value cut to `limit` characters plus '...' when it is longer; else unchanged."""
        if value and len(value) > limit:
            return value[:limit] + '...'
        return value

    columns = ['text', 'predicted_label', 'confidence', 'reasoning_generated', 'reasoning', 'timestamp']
    full_rows = [{name: r[name] for name in columns} for r in results]

    # Create summary DataFrame. Explicit columns keep the schema stable for empty input.
    df_results = pd.DataFrame(
        [
            dict(row, text=_shorten(row['text'], 100), reasoning=_shorten(row['reasoning'], 200))
            for row in full_rows
        ],
        columns=columns,
    )

    if save_results:
        # Save the untruncated results so the exported file keeps the complete
        # messages and explanations.
        output_file = f'fraud_analysis_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        pd.DataFrame(full_rows, columns=columns).to_csv(output_file, index=False)
        print(f"💾 Results saved to: {output_file}")

    return results, df_results

# Example batch processing
batch_texts = [
    "Your account will be closed unless you verify immediately!",
    "Hi John, thanks for the great meeting today. Let's follow up next week.",
    "You've won a free vacation! Call now to claim your prize!",
    "Your package has been successfully delivered and left at your front door. If you do not locate the parcel, please check with members of your household or nearby areas where it may have been placed for security.",
    "URGENT: Your social security number has been compromised!"
]

print("📊 Batch Analysis with Local Reasoning")
batch_results, batch_df = batch_analyze_texts(batch_texts)

print("\n📈 Batch Analysis Summary:")
# classify_text() labels failed classifications 'error'; those rows are neither
# fraud nor legitimate, so they are excluded from the fraud count.
fraud_count = (~batch_df['predicted_label'].isin(['legitimate', 'error'])).sum()
reasoning_count = batch_df['reasoning_generated'].sum()
print(f"Fraud detected: {fraud_count}/{len(batch_df)}")
print(f"Reasoning generated: {reasoning_count}/{len(batch_df)}")

# Display sample results
display(batch_df.head())

📊 Batch Analysis with Local Reasoning
🔄 Processing 5 texts...
Progress: 1/5
💾 Results saved to: fraud_analysis_results_20250916_155231.csv

📈 Batch Analysis Summary:
Fraud detected: 3/5
Reasoning generated: 3/5


Unnamed: 0,text,predicted_label,confidence,reasoning_generated,reasoning,timestamp
0,Your account will be closed unless you verify ...,phishing,0.999425,True,This text was classified as phishing with 99.9...,2025-09-16T15:52:30.163342
1,"Hi John, thanks for the great meeting today. L...",legitimate,0.999687,False,,2025-09-16T15:52:30.372727
2,You've won a free vacation! Call now to claim ...,sms_spam,0.99962,True,This text was classified as sms_spam with 100....,2025-09-16T15:52:30.582209
3,Your package has been successfully delivered a...,legitimate,0.727427,False,,2025-09-16T15:52:30.792033
4,URGENT: Your social security number has been c...,phishing,0.990928,True,This text was classified as phishing with 99.1...,2025-09-16T15:52:31.001553
