# NLP Fraud Detection Baseline Model

This notebook implements a comprehensive baseline model for fraud and scam detection using Natural Language Processing techniques. We'll build and compare multiple approaches from traditional machine learning to modern transformer-based models.

## 🎯 Objectives
1. Build traditional ML baselines (TF-IDF + Logistic Regression, SVM)
2. Implement BERT-based classification
3. Evaluate and compare model performance
4. Provide a foundation for more advanced fraud detection systems

## 📊 Dataset
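The notebook loads the first 1,000 rows of `final_fraud_detection_dataset.csv` when that file is available and otherwise falls back to a small built-in set of sample messages. The same pipeline can be adapted to real datasets such as the SMS Spam Collection or phishing-email corpora.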

---

## 1. Import Required Libraries

Let's start by importing all the necessary libraries for our fraud detection system.

In [None]:
# Core data processing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Text processing and NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Set style for better plots.
# 'seaborn-v0_8' is only available in newer matplotlib releases; older ones
# ship the same style sheet under the name 'seaborn'.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

# Download required NLTK data.
# Every resource is checked on its own, so one cached package cannot hide a
# missing one (the previous all-or-nothing check never looked for 'omw-1.4').
# 'punkt_tab' is the tokenizer data used by word_tokenize in recent NLTK
# releases; 'punkt' is kept for older ones.
NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for nltk_package, nltk_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(nltk_path)
    except LookupError:
        nltk.download(nltk_package)

import sklearn

print("✅ All libraries imported successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
print(f"🤖 Scikit-learn version: {sklearn.__version__}")

## 2. Load and Explore Dataset

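We first try to load `final_fraud_detection_dataset.csv` (columns `text` and `binary_label`); if it cannot be read, we fall back to a small hand-written set of fraud and legitimate messages. In a real project, you would point the loader at your own dataset here.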

In [None]:
def create_fraud_dataset():
    """
    Load fraud/scam data from the CSV dataset, falling back to sample data.

    Reads the first 1000 rows of ``final_fraud_detection_dataset.csv`` from the
    working directory. The file must provide a ``text`` column and a
    ``binary_label`` column (1 = fraud, 0 = normal). Rows whose text is missing
    or whose label is neither 0 nor 1 are dropped and reported, because the
    rest of the notebook calls string methods on every message and maps every
    label to 0/1.

    Returns:
        pandas.DataFrame with columns ``message`` (str) and ``label``
        ('fraud' / 'normal') and a fresh 0..n-1 index. If the file is missing,
        cannot be parsed, lacks the expected columns or has no usable rows,
        the result of ``_create_sample_dataset()`` is returned instead (that
        frame additionally carries a ``message_id`` column).
    """
    try:
        # Load the dataset
        raw = pd.read_csv('final_fraud_detection_dataset.csv', nrows=1000)  # Subset for notebook
        print(f"Loaded dataset with {len(raw)} samples (subset for notebook)")

        # Select the relevant columns, rename them and map binary_label to
        # string labels; values other than 0/1 become NaN here.
        df = pd.DataFrame({
            'message': raw['text'],
            'label': raw['binary_label'].map({1: 'fraud', 0: 'normal'}),
        })

        # Drop rows that would break text cleaning or the label mapping later.
        usable = df['message'].notna() & df['label'].notna()
        dropped = int((~usable).sum())
        if dropped:
            print(f"Dropped {dropped} rows with missing text or unknown label")

        # A contiguous index is required by the later column-wise concat of
        # engineered features.
        df = df[usable].reset_index(drop=True)
        if df.empty:
            raise ValueError("dataset contains no usable rows")
        df['message'] = df['message'].astype(str)

        print(f"Dataset prepared with {len(df)} samples")
        print(f"Label distribution:\n{df['label'].value_counts()}")

        return df

    except FileNotFoundError:
        print("Error: final_fraud_detection_dataset.csv not found.")
        print("Falling back to sample data...")
        return _create_sample_dataset()
    except Exception as e:
        # Deliberate best-effort boundary: any other loading problem (bad
        # schema, parse error, empty file) also falls back to the sample data.
        print(f"Error loading dataset: {e}")
        print("Falling back to sample data...")
        return _create_sample_dataset()

def _create_sample_dataset():
    """
    Build the built-in fallback dataset of fraud and legitimate messages.

    The messages are grouped by category below; the grouping only documents
    the data and is flattened in declaration order before use.

    Returns:
        pandas.DataFrame with columns ``message_id`` ("MSG_000", ... assigned
        before shuffling, fraud messages first), ``message`` and ``label``
        ('fraud' / 'normal'), shuffled with a fixed seed and re-indexed.
    """
    # Fraud/Scam messages, keyed by scam type
    fraud_by_category = {
        'phishing_account_security': [
            "URGENT! Your account will be suspended. Click here to verify: suspicious-link.com",
            "ALERT: Suspicious activity detected on your account. Verify your identity now or face suspension",
            "Your credit card has been charged $500. If this wasn't you, click here immediately",
            "Bank security alert! Update your information immediately to prevent account closure",
            "PayPal Security: Your account is limited. Click to restore access now",
            "Amazon: Your account has been compromised. Verify details to secure your account",
        ],
        'prize_lottery': [
            "Congratulations! You've won $10,000 in our daily draw! Send your bank details to claim",
            "WINNER! You've been selected for a $50,000 cash prize! Click to claim now",
            "You have inherited $2 million from a distant relative. Send processing fee to claim",
            "Lottery Commission: You've won €1,000,000! Pay the processing fee to receive your prize",
        ],
        'investment_get_rich_quick': [
            "Limited time offer! Make $5000 weekly working from home! No experience needed",
            "Guaranteed 500% returns in 30 days! This investment opportunity won't last",
            "Earn $1000 daily with our proven system! Join thousands of successful members",
            "URGENT Investment Alert: Double your money in 24 hours with crypto trading",
        ],
        'romance_advance_fee': [
            "Hello dear, I'm a soldier in Afghanistan and need help transferring my funds",
            "My love, I'm stuck in another country and need money for travel expenses",
            "Beautiful, I have fallen in love with you. I need your help with some funds",
        ],
        'tech_support': [
            "Microsoft Alert: Your computer is infected with 5 viruses. Call now for support",
            "Apple Security: Your iPhone has been hacked. Download our security software now",
            "Tech Support: Your computer will be disabled unless you call this number immediately",
        ],
        'job_employment': [
            "Congratulations! You've been selected for a high-paying remote job. Send $200 for training materials",
            "Work from home opportunity! Earn $300/day processing payments for our company",
        ],
        'shopping_payment': [
            "Your package is delayed. Pay additional $50 shipping fees to receive your order",
            "eBay Alert: Complete your payment verification or your purchase will be cancelled",
            "Your order cannot be delivered. Pay customs fee of $75 to release your package",
        ],
        'debt_collection': [
            "FINAL NOTICE: Pay your outstanding debt of $1,500 immediately or face legal action",
            "Legal Department: You owe $2,000 in unpaid taxes. Pay now to avoid arrest",
            "Collection Agency: Settle your debt of $800 today or we'll seize your assets",
        ],
    }

    # Legitimate messages, keyed by context
    normal_by_category = {
        'personal': [
            "Hey, are we still meeting for lunch tomorrow at the usual place?",
            "Thanks for your help with the project presentation yesterday",
            "Happy birthday! Hope you have a wonderful day with family and friends",
            "How was your weekend? Did you enjoy the concert you mentioned?",
            "Let me know if you need any assistance with the quarterly report",
            "Great job on the presentation! The client seemed very impressed",
            "Can we reschedule our meeting to next week? Something urgent came up",
        ],
        'business': [
            "The meeting has been rescheduled to 3 PM in conference room B",
            "Please review the attached document and provide your feedback by Friday",
            "Reminder: Your monthly team meeting is scheduled for this Thursday",
            "Conference call with the client is set for next Tuesday at 2 PM",
            "The quarterly results look good. Let's discuss them in our next meeting",
            "Please submit your expense reports by the end of this week",
        ],
        'service_notifications': [
            "Your appointment with Dr. Smith is confirmed for Friday at 10 AM",
            "Reminder: Your library books are due next Tuesday",
            "Your prescription is ready for pickup at the pharmacy",
            "Thank you for your recent purchase. Your order will arrive in 3-5 business days",
            "Your flight to New York has been delayed by 30 minutes due to weather",
        ],
        'social_casual': [
            "The weather is great today, perfect for a walk in the park",
            "The new restaurant downtown has excellent reviews. Want to try it?",
            "Movie night this Friday? I heard the new Marvel film is really good",
            "Thanks for the book recommendation, I really enjoyed reading it",
            "The team building event is planned for next month at the beach resort",
        ],
        'educational_informational': [
            "Don't forget to register for the upcoming workshop on data science",
            "The university library will be closed next Monday for maintenance",
            "New course registration opens tomorrow. Make sure to enroll early",
            "The seminar on artificial intelligence was very informative",
        ],
        'news_updates': [
            "The software update has been successfully installed on all systems",
            "New safety protocols will be implemented starting next month",
            "The annual company picnic is scheduled for the last Saturday of June",
            "Please update your emergency contact information in the HR system",
        ],
    }

    # Flatten in declaration order: all fraud messages first, then all normal
    fraud_messages = [text for group in fraud_by_category.values() for text in group]
    normal_messages = [text for group in normal_by_category.values() for text in group]

    messages = fraud_messages + normal_messages
    labels = ['fraud'] * len(fraud_messages) + ['normal'] * len(normal_messages)

    # Message IDs reflect the pre-shuffle position, for tracking
    message_ids = [f"MSG_{position:03d}" for position, _ in enumerate(messages)]

    frame = pd.DataFrame({
        'message_id': message_ids,
        'message': messages,
        'label': labels
    })

    # Shuffle reproducibly and discard the old row order
    shuffled = frame.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

# Build the working DataFrame (CSV subset, or the built-in samples as fallback)
df = create_fraud_dataset()

print("📊 Dataset created successfully!")
print(f"Total samples: {len(df)}")
print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# High-level summary of the dataset
overview_rule = "="*50
print("\n" + overview_rule)
print("DATASET OVERVIEW")
print(overview_rule)

print(f"\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts)
normal_to_fraud = label_counts['normal'] / label_counts['fraud']
print(f"\nClass balance: {normal_to_fraud:.2f}:1 (normal:fraud)")


def _preview_messages(tag, heading):
    """Print `heading`, then the first three messages labelled `tag` (cut to 80 chars)."""
    print(heading)
    print("-" * 30)
    picked = df.loc[df['label'] == tag, 'message'].head(3)
    for position, text in enumerate(picked, start=1):
        print(f"{position}. {text[:80]}...")
    return picked


# A few example messages from each class
fraud_samples = _preview_messages('fraud', f"\n📝 Sample fraud messages:")
normal_samples = _preview_messages('normal', f"\n📝 Sample normal messages:")

# Leading rows of the (shuffled) dataset
print(f"\n📋 First 5 rows of the dataset:")
print(df.head())

In [None]:
# Create visualizations for data exploration
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Label distribution
# value_counts() orders labels by frequency, so colours are looked up per
# label instead of by position; otherwise the majority class always gets the
# red "fraud" colour, whichever class it is.
label_colors = {'fraud': '#ff7f7f', 'normal': '#7fbf7f'}
label_counts = df['label'].value_counts()
axes[0, 0].pie(
    label_counts.values,
    labels=label_counts.index,
    autopct='%1.1f%%',
    colors=[label_colors.get(label_name, '#cccccc') for label_name in label_counts.index],
)
axes[0, 0].set_title('Distribution of Labels', fontsize=14, fontweight='bold')

# 2. Message length distribution
df['message_length'] = df['message'].str.len()
fraud_lengths = df[df['label'] == 'fraud']['message_length']
normal_lengths = df[df['label'] == 'normal']['message_length']

axes[0, 1].hist(fraud_lengths, alpha=0.7, label='Fraud', bins=20, color='red')
axes[0, 1].hist(normal_lengths, alpha=0.7, label='Normal', bins=20, color='green')
axes[0, 1].set_xlabel('Message Length (characters)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Message Length Distribution', fontsize=14, fontweight='bold')
axes[0, 1].legend()

# 3. Word count distribution
df['word_count'] = df['message'].str.split().str.len()
fraud_words = df[df['label'] == 'fraud']['word_count']
normal_words = df[df['label'] == 'normal']['word_count']

# Tick labels are set separately: boxplot's `labels` keyword is deprecated in
# recent matplotlib, while set_xticklabels works on every version.
axes[1, 0].boxplot([fraud_words, normal_words])
axes[1, 0].set_xticklabels(['Fraud', 'Normal'])
axes[1, 0].set_ylabel('Word Count')
axes[1, 0].set_title('Word Count Distribution by Label', fontsize=14, fontweight='bold')

# 4. Average statistics
stats_data = df.groupby('label').agg({
    'message_length': 'mean',
    'word_count': 'mean'
}).round(2)

x = range(len(stats_data.columns))
width = 0.35
fraud_stats = stats_data.loc['fraud'].values
normal_stats = stats_data.loc['normal'].values

axes[1, 1].bar([i - width/2 for i in x], fraud_stats, width, label='Fraud', color='red', alpha=0.7)
axes[1, 1].bar([i + width/2 for i in x], normal_stats, width, label='Normal', color='green', alpha=0.7)
axes[1, 1].set_xlabel('Metrics')
axes[1, 1].set_ylabel('Average Values')
axes[1, 1].set_title('Average Statistics by Label', fontsize=14, fontweight='bold')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(['Char Length', 'Word Count'])
axes[1, 1].legend()

# Add values on bars (placed one unit above each bar top)
for i, (fraud_val, normal_val) in enumerate(zip(fraud_stats, normal_stats)):
    axes[1, 1].text(i - width/2, fraud_val + 1, str(fraud_val), ha='center', va='bottom')
    axes[1, 1].text(i + width/2, normal_val + 1, str(normal_val), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n📊 MESSAGE STATISTICS")
print("="*50)
print(f"Average message length (characters):")
print(f"  Fraud: {df[df['label'] == 'fraud']['message_length'].mean():.1f}")
print(f"  Normal: {df[df['label'] == 'normal']['message_length'].mean():.1f}")

print(f"\nAverage word count:")
print(f"  Fraud: {df[df['label'] == 'fraud']['word_count'].mean():.1f}")
print(f"  Normal: {df[df['label'] == 'normal']['word_count'].mean():.1f}")

print(f"\nMessage length range:")
print(f"  Minimum: {df['message_length'].min()} characters")
print(f"  Maximum: {df['message_length'].max()} characters")
print(f"  Median: {df['message_length'].median():.1f} characters")

## 3. Data Preprocessing and Text Cleaning

Now we'll clean and preprocess the text data to prepare it for machine learning models. This includes removing noise, normalizing text, and creating features that our models can understand.

In [None]:
class TextPreprocessor:
    """
    Comprehensive text preprocessing pipeline for fraud detection.

    ``clean_text`` normalises raw text, ``tokenize_and_lemmatize`` reduces it
    to lemmatized content words, ``preprocess`` chains the two, and
    ``extract_features`` derives simple numeric features from the raw text.
    Missing input (None / NaN) is treated as an empty message instead of
    raising, so the methods can be applied to a whole DataFrame column.
    """

    # Patterns are compiled once at class level; they are applied to every
    # message in the dataset.
    _URL_HTTP_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _URL_WWW_RE = re.compile(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _PHONE_RE = re.compile(r'[\+]?[1-9]?[0-9]{7,15}')
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')
    _SENTENCE_END_RE = re.compile(r'[.!?]+')

    # Keywords counted by extract_features (matched as substrings, see there)
    _FRAUD_INDICATORS = ('urgent', 'click', 'verify', 'winner', 'prize', 'money', 'free', 'offer')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Add domain-specific words to stop words if needed
        # self.stop_words.update(['would', 'could', 'should'])

    @staticmethod
    def _as_text(text):
        """Return `text` as a str; None and float NaN become the empty string."""
        if isinstance(text, str):
            return text
        if text is None or (isinstance(text, float) and np.isnan(text)):
            return ''
        return str(text)

    def clean_text(self, text):
        """
        Clean and normalize text.

        Lowercases the input, removes URLs, e-mail addresses, long digit runs
        (phone numbers) and every remaining non-letter character, then
        collapses whitespace.

        Args:
            text: Raw message; None/NaN are treated as an empty message.

        Returns:
            The cleaned string (letters and single spaces only).
        """
        # Convert to lowercase
        text = self._as_text(text).lower()

        # Remove URLs
        text = self._URL_HTTP_RE.sub('', text)
        text = self._URL_WWW_RE.sub('', text)

        # Remove email addresses
        text = self._EMAIL_RE.sub('', text)

        # Remove phone numbers
        text = self._PHONE_RE.sub('', text)

        # Remove special characters and digits
        text = self._NON_ALPHA_RE.sub('', text)

        # Remove extra whitespace
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize_and_lemmatize(self, text):
        """
        Tokenize text and apply lemmatization.

        Stop words and tokens of one or two characters are discarded before
        the remaining tokens are lemmatized.

        Args:
            text: Text to tokenize (normally the output of ``clean_text``).

        Returns:
            The lemmatized tokens joined by single spaces.
        """
        # Tokenize
        tokens = word_tokenize(self._as_text(text))

        # Remove stopwords and short words, then lemmatize what is left
        return ' '.join(
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        )

    def preprocess(self, text):
        """
        Complete preprocessing pipeline: ``clean_text`` followed by
        ``tokenize_and_lemmatize``.

        Args:
            text: Raw message.

        Returns:
            The cleaned, stop-word-free, lemmatized text.
        """
        return self.tokenize_and_lemmatize(self.clean_text(text))

    def extract_features(self, text):
        """
        Extract additional features from text.

        Args:
            text: Raw (uncleaned) message; None/NaN are treated as empty.

        Returns:
            dict with the keys ``char_count``, ``word_count``,
            ``sentence_count`` (runs of ``.``, ``!`` or ``?``),
            ``avg_word_length``, ``upper_case_count``, ``upper_case_ratio``,
            ``exclamation_count``, ``question_count``, ``dollar_count`` and
            ``fraud_words``. All values are 0 for an empty message.
        """
        text = self._as_text(text)
        words = text.split()
        features = {}

        # Basic text features
        features['char_count'] = len(text)
        features['word_count'] = len(words)
        features['sentence_count'] = len(self._SENTENCE_END_RE.findall(text))
        features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0

        # Uppercase features
        features['upper_case_count'] = sum(1 for c in text if c.isupper())
        features['upper_case_ratio'] = features['upper_case_count'] / len(text) if text else 0

        # Punctuation features
        features['exclamation_count'] = text.count('!')
        features['question_count'] = text.count('?')
        features['dollar_count'] = text.count('$')

        # Fraud-specific features: number of distinct indicator keywords that
        # occur anywhere in the lowercased text (substring match, so 'offer'
        # also counts inside 'offered').
        lowered = text.lower()
        features['fraud_words'] = sum(1 for word in self._FRAUD_INDICATORS if word in lowered)

        return features

# Initialize preprocessor
preprocessor = TextPreprocessor()

# Apply preprocessing to the dataset
print("🔄 Preprocessing text data...")
df['cleaned_message'] = df['message'].apply(preprocessor.preprocess)

# Extract additional features
print("🔍 Extracting additional features...")
feature_data = df['message'].apply(preprocessor.extract_features)
# Reuse df's index so the column-wise concat below pairs each feature row with
# its own message, whatever index df carries.
feature_df = pd.DataFrame(list(feature_data), index=df.index)

# Combine with main dataframe.
# df already holds a 'word_count' column from the exploration cell (and every
# feature column if this cell is re-run); drop the overlap first so the result
# has no duplicated column labels, which would make df[[...]] return extra
# columns later on.
df = pd.concat(
    [df.drop(columns=feature_df.columns, errors='ignore'), feature_df],
    axis=1,
)

print("✅ Preprocessing complete!")

# Show preprocessing examples
print("\n📝 PREPROCESSING EXAMPLES")
print("="*60)

# Show examples from different positions; positions beyond the end of a small
# dataset are skipped instead of raising IndexError.
sample_indices = [idx for idx in (0, 15, 30) if idx < len(df)]
for i, idx in enumerate(sample_indices):
    print(f"\nExample {i+1} - Label: {df.iloc[idx]['label'].upper()}")
    print(f"Original: {df.iloc[idx]['message'][:80]}...")
    print(f"Cleaned:  {df.iloc[idx]['cleaned_message'][:80]}...")
    print("-" * 60)

# Show feature statistics
print("\n📊 EXTRACTED FEATURES STATISTICS")
print("="*50)
feature_cols = ['char_count', 'word_count', 'upper_case_ratio', 'fraud_words']
print(df.groupby('label')[feature_cols].mean().round(2))

## 4. Feature Engineering with TF-IDF

We'll convert the cleaned text into numerical features that machine learning algorithms can understand using TF-IDF (Term Frequency-Inverse Document Frequency) vectorization.

In [None]:
# Configure TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,          # Limit vocabulary to top 5000 words
    ngram_range=(1, 2),         # Use unigrams and bigrams
    stop_words='english',       # Remove English stop words
    min_df=2,                   # Ignore terms that appear in less than 2 documents
    max_df=0.95,               # Ignore terms that appear in more than 95% of documents
    lowercase=True,             # Convert to lowercase
    sublinear_tf=True          # Apply sublinear tf scaling
)

print("🔄 Creating TF-IDF features...")

# Fit and transform the cleaned text.
# NOTE(review): the vectorizer (and the scaler further down) are fitted on the
# full dataset before it is split, so vocabulary and scaling statistics from
# the future test rows leak into training. Fit them on the training split only
# (e.g. inside a Pipeline) when unbiased estimates matter.
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_message'])

print(f"✅ TF-IDF vectorization complete!")
print(f"📊 Feature matrix shape: {X_tfidf.shape}")
print(f"📚 Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert to DataFrame for easier handling
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

print(f"\n🔍 Sample TF-IDF features:")
print(f"First 10 features: {list(feature_names[:10])}")

# Analyze most important features for each class
print("\n📊 TOP TF-IDF FEATURES BY CLASS")
print("="*50)

# Calculate mean TF-IDF scores for each class
fraud_mask = df['label'] == 'fraud'
normal_mask = df['label'] == 'normal'

# The masks are applied positionally (to_numpy) because X_tfidf_df has its own
# 0..n-1 index, which need not match df's index labels.
fraud_tfidf_mean = X_tfidf_df[fraud_mask.to_numpy()].mean()
normal_tfidf_mean = X_tfidf_df[normal_mask.to_numpy()].mean()

# Get top features for fraud class
top_fraud_features = fraud_tfidf_mean.nlargest(10)
print("🚨 Top 10 Fraud Features:")
for feature, score in top_fraud_features.items():
    print(f"  {feature}: {score:.4f}")

# Get top features for normal class
top_normal_features = normal_tfidf_mean.nlargest(10)
print("\n✅ Top 10 Normal Features:")
for feature, score in top_normal_features.items():
    print(f"  {feature}: {score:.4f}")

# Visualize top features
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Top fraud features
top_fraud_features.plot(kind='barh', ax=ax1, color='red', alpha=0.7)
ax1.set_title('Top 10 Features in Fraud Messages', fontsize=14, fontweight='bold')
ax1.set_xlabel('Average TF-IDF Score')

# Top normal features
top_normal_features.plot(kind='barh', ax=ax2, color='green', alpha=0.7)
ax2.set_title('Top 10 Features in Normal Messages', fontsize=14, fontweight='bold')
ax2.set_xlabel('Average TF-IDF Score')

plt.tight_layout()
plt.show()

# Create word clouds for visual representation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Fraud word cloud
fraud_text = ' '.join(df[df['label'] == 'fraud']['cleaned_message'])
fraud_wordcloud = WordCloud(width=400, height=300, background_color='white').generate(fraud_text)
ax1.imshow(fraud_wordcloud, interpolation='bilinear')
ax1.axis('off')
ax1.set_title('Fraud Messages Word Cloud', fontsize=14, fontweight='bold')

# Normal word cloud
normal_text = ' '.join(df[df['label'] == 'normal']['cleaned_message'])
normal_wordcloud = WordCloud(width=400, height=300, background_color='white').generate(normal_text)
ax2.imshow(normal_wordcloud, interpolation='bilinear')
ax2.axis('off')
ax2.set_title('Normal Messages Word Cloud', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Combine TF-IDF features with additional engineered features
additional_features = ['char_count', 'word_count', 'upper_case_ratio', 'fraud_words', 
                      'exclamation_count', 'dollar_count']

# Normalize additional features.
# Duplicated column labels are collapsed first: 'word_count' can be present
# twice in df (exploration cell + extracted features), and selecting it by
# name would then return both copies and silently add an extra column.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
additional_frame = df.loc[:, ~df.columns.duplicated()][additional_features]
X_additional = scaler.fit_transform(additional_frame)

# Combine features
X_combined = np.hstack([X_tfidf.toarray(), X_additional])
print(f"\n🔗 Combined feature matrix shape: {X_combined.shape}")
print(f"   TF-IDF features: {X_tfidf.shape[1]}")
print(f"   Additional features: {X_additional.shape[1]}")
print(f"   Total features: {X_combined.shape[1]}")

## 5. Train-Test Split

Now we'll split our data into training and testing sets, ensuring proper stratification to maintain class balance.

In [None]:
# Prepare target variable (0 = normal, 1 = fraud)
y = df['label'].map({'normal': 0, 'fraud': 1})
# Any label other than 'normal'/'fraud' maps to NaN and would only surface as
# an obscure error inside scikit-learn, so fail early with a clear message.
if y.isna().any():
    raise ValueError("df['label'] contains values other than 'normal' and 'fraud'")

print("🔄 Splitting dataset...")

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, 
    test_size=0.3,          # 70% train, 30% test
    random_state=42,        # For reproducibility
    stratify=y             # Maintain class balance
)

print("✅ Dataset split complete!")

# Print split information
print(f"\n📊 DATASET SPLIT SUMMARY")
print("="*40)
print(f"Total samples: {len(df)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(df)*100:.1f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/len(df)*100:.1f}%)")

print(f"\n🏷️ LABEL DISTRIBUTION")
print("="*30)

# Training set distribution (fraud is encoded as 1, so the sum is its count)
train_fraud = int(y_train.sum())
train_normal = len(y_train) - train_fraud
print(f"Training set:")
print(f"  Normal: {train_normal} ({train_normal/len(y_train)*100:.1f}%)")
print(f"  Fraud:  {train_fraud} ({train_fraud/len(y_train)*100:.1f}%)")

# Testing set distribution
test_fraud = int(y_test.sum())
test_normal = len(y_test) - test_fraud
print(f"\nTesting set:")
print(f"  Normal: {test_normal} ({test_normal/len(y_test)*100:.1f}%)")
print(f"  Fraud:  {test_fraud} ({test_fraud/len(y_test)*100:.1f}%)")

# Visualize the split
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Axes.pie() has no `alpha` keyword (passing one raises TypeError);
# transparency has to be forwarded to the wedges through `wedgeprops`.
pie_wedge_style = {'alpha': 0.7}

# Training set distribution
train_labels = ['Normal', 'Fraud']
train_sizes = [train_normal, train_fraud]
ax1.pie(train_sizes, labels=train_labels, autopct='%1.1f%%', colors=['green', 'red'],
        wedgeprops=pie_wedge_style)
ax1.set_title('Training Set Distribution', fontsize=14, fontweight='bold')

# Testing set distribution
test_labels = ['Normal', 'Fraud']
test_sizes = [test_normal, test_fraud]
ax2.pie(test_sizes, labels=test_labels, autopct='%1.1f%%', colors=['green', 'red'],
        wedgeprops=pie_wedge_style)
ax2.set_title('Testing Set Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n🔢 FEATURE MATRIX SHAPES")
print("="*30)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape:  {y_test.shape}")

# Basic sanity checks on the split (these do not detect data leakage)
print(f"\n✅ DATA QUALITY CHECKS")
print("="*25)
print(f"No missing values in X_train: {not np.isnan(X_train).any()}")
print(f"No missing values in X_test:  {not np.isnan(X_test).any()}")
print(f"Feature dimensions match: {X_train.shape[1] == X_test.shape[1]}")
print(f"Class balance maintained: {abs(train_fraud/len(y_train) - test_fraud/len(y_test)) < 0.05}")

## 6. Build Baseline Models

We'll implement and compare multiple baseline models to establish a strong foundation for fraud detection.

In [None]:
# Initialize baseline models
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB refuses negative feature values, and the standard-scaled
# engineered columns in X_combined are negative for below-average rows. The
# Naive Bayes entry therefore rescales every feature to [0, 1] first;
# clip=True keeps unseen test values inside that range as well.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),
    'Naive Bayes': make_pipeline(MinMaxScaler(clip=True), MultinomialNB()),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Store results: model name -> fitted model, test predictions and metrics
results = {}

print("🤖 Training baseline models...")
print("="*50)

# Train and evaluate each model
for name, model in models.items():
    print(f"\n🔄 Training {name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions (column 1 of predict_proba is the fraud class)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    
    # Store results
    results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_score': auc
    }
    
    print(f"✅ {name} complete!")
    print(f"   Accuracy: {accuracy:.3f}")
    print(f"   F1-Score: {f1:.3f}")
    print(f"   AUC:      {auc:.3f}")

print(f"\n🎯 MODEL PERFORMANCE SUMMARY")
print("="*60)
print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'AUC':<10}")
print("-" * 70)

for name, metrics in results.items():
    print(f"{name:<20} {metrics['accuracy']:<10.3f} {metrics['precision']:<10.3f} "
          f"{metrics['recall']:<10.3f} {metrics['f1_score']:<10.3f} {metrics['auc_score']:<10.3f}")

# Find best model based on F1-score (important for fraud detection)
best_model_name = max(results.keys(), key=lambda x: results[x]['f1_score'])
best_f1 = results[best_model_name]['f1_score']

print(f"\n🏆 Best Model: {best_model_name} (F1-Score: {best_f1:.3f})")

# Visualize model performance
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Performance metrics comparison
metrics_df = pd.DataFrame({
    name: [results[name]['accuracy'], results[name]['precision'], 
           results[name]['recall'], results[name]['f1_score'], results[name]['auc_score']]
    for name in results.keys()
}, index=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC'])

metrics_df.plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('Score')
axes[0, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
axes[0, 0].tick_params(axis='x', rotation=45)

# 2. ROC Curves
for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['y_pred_proba'])
    axes[0, 1].plot(fpr, tpr, label=f"{name} (AUC = {result['auc_score']:.3f})")

axes[0, 1].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Feature importance for Random Forest
if 'Random Forest' in results:
    rf_model = results['Random Forest']['model']
    feature_importance = rf_model.feature_importances_
    
    # Columns of X_combined are the TF-IDF terms followed by the engineered
    # features; use their real names when the counts line up, otherwise fall
    # back to positional placeholders.
    combined_feature_names = list(feature_names) + list(additional_features)
    if len(combined_feature_names) != len(feature_importance):
        combined_feature_names = [f"Feature_{i}" for i in range(len(feature_importance))]
    
    # Get top 15 features (argsort is ascending, so the last 15 are the largest)
    top_indices = np.argsort(feature_importance)[-15:]
    top_features = [combined_feature_names[i] for i in top_indices]
    top_importance = feature_importance[top_indices]
    
    axes[1, 0].barh(range(len(top_features)), top_importance)
    axes[1, 0].set_yticks(range(len(top_features)))
    axes[1, 0].set_yticklabels(top_features)
    axes[1, 0].set_xlabel('Importance')
    axes[1, 0].set_title('Random Forest - Top 15 Features', fontsize=14, fontweight='bold')

# 4. Model complexity vs performance (complexity scores are hand-assigned)
model_complexity = {'Logistic Regression': 1, 'Naive Bayes': 1, 'SVM': 3, 'Random Forest': 4}
f1_scores = [results[name]['f1_score'] for name in model_complexity.keys()]
complexity_scores = list(model_complexity.values())

axes[1, 1].scatter(complexity_scores, f1_scores, s=100, alpha=0.7)
for i, name in enumerate(model_complexity.keys()):
    axes[1, 1].annotate(name, (complexity_scores[i], f1_scores[i]), 
                       xytext=(5, 5), textcoords='offset points')

axes[1, 1].set_xlabel('Model Complexity (1=Simple, 4=Complex)')
axes[1, 1].set_ylabel('F1-Score')
axes[1, 1].set_title('Model Complexity vs Performance', fontsize=14, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Model Training and Evaluation

Let's perform cross-validation and detailed analysis of our models to ensure robust performance estimates.

In [None]:
# Perform cross-validation for more robust evaluation
from sklearn.model_selection import cross_validate

print("🔄 Performing cross-validation...")
print("="*50)

cv_results = {}
cv_folds = 5

# Result key -> scikit-learn scorer name. All metrics are computed from the
# same fitted folds in one cross_validate call, instead of refitting every
# model once per metric.
cv_scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'auc': 'roc_auc',
}
cv_display_names = [
    ('Accuracy: ', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:   ', 'recall'),
    ('F1-Score: ', 'f1'),
    ('AUC:      ', 'auc'),
]

for name, model in models.items():
    print(f"\n📊 Cross-validating {name}...")
    
    # NOTE(review): X_combined was vectorized and scaled on the full dataset,
    # so these fold scores are optimistic; see the feature-engineering cell.
    fold_scores = cross_validate(model, X_combined, y, cv=cv_folds, scoring=cv_scoring)
    
    # Per-fold score arrays, keyed by metric
    cv_results[name] = {metric: fold_scores[f'test_{metric}'] for metric in cv_scoring}
    
    # Mean with a +/- 2 standard deviation band across folds
    for display_name, metric in cv_display_names:
        scores = cv_results[name][metric]
        print(f"   {display_name} {scores.mean():.3f} (±{scores.std()*2:.3f})")

# Visualize cross-validation results
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']

for idx, metric in enumerate(metrics):
    row = idx // 3
    col = idx % 3
    
    # Prepare data for box plot
    data_to_plot = [cv_results[name][metric] for name in models.keys()]
    model_names = list(models.keys())
    
    # Create box plot; tick labels are set separately because boxplot's
    # `labels` keyword is deprecated in recent matplotlib.
    box_plot = axes[row, col].boxplot(data_to_plot, patch_artist=True)
    axes[row, col].set_xticklabels(model_names)
    
    # Color the boxes
    colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow']
    for patch, color in zip(box_plot['boxes'], colors):
        patch.set_facecolor(color)
    
    axes[row, col].set_title(f'{metric.upper()} - Cross Validation', fontsize=12, fontweight='bold')
    axes[row, col].set_ylabel(metric.title())
    axes[row, col].grid(True, alpha=0.3)
    axes[row, col].tick_params(axis='x', rotation=45)

# Remove the empty subplot (five metrics on a 2x3 grid)
axes[1, 2].remove()

plt.tight_layout()
plt.show()

# Summary table of the fold scores (descriptive statistics only; no
# significance test is performed)
print(f"\n📈 CROSS-VALIDATION SUMMARY")
print("="*70)
print(f"{'Model':<20} {'Metric':<12} {'Mean':<8} {'Std':<8} {'Min':<8} {'Max':<8}")
print("-" * 70)

for name in models.keys():
    for metric in ['f1', 'precision', 'recall', 'accuracy', 'auc']:
        scores = cv_results[name][metric]
        print(f"{name:<20} {metric:<12} {scores.mean():<8.3f} {scores.std():<8.3f} "
              f"{scores.min():<8.3f} {scores.max():<8.3f}")
    print("-" * 70)

# Hyperparameter tuning for best models
print(f"\n🔧 HYPERPARAMETER TUNING")
print("="*40)

# Tune Logistic Regression
print("🔄 Tuning Logistic Regression...")
lr_params = {
    'C': [0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']      # liblinear supports both l1 and l2
}

lr_grid = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    lr_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

lr_grid.fit(X_train, y_train)
print(f"✅ Best LR params: {lr_grid.best_params_}")
print(f"✅ Best LR score: {lr_grid.best_score_:.3f}")

# Tune SVM
print(f"\n🔄 Tuning SVM...")
svm_params = {
    'C': [0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_grid = GridSearchCV(
    SVC(random_state=42, probability=True),
    svm_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

svm_grid.fit(X_train, y_train)
print(f"✅ Best SVM params: {svm_grid.best_params_}")
print(f"✅ Best SVM score: {svm_grid.best_score_:.3f}")

# Store tuned models (best_estimator_ is already refit on all of X_train)
tuned_models = {
    'Tuned Logistic Regression': lr_grid.best_estimator_,
    'Tuned SVM': svm_grid.best_estimator_
}

# Evaluate tuned models on the held-out test set
print(f"\n🎯 TUNED MODEL PERFORMANCE")
print("="*50)

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    
    print(f"\n{name}:")
    print(f"  Accuracy:  {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall:    {recall:.3f}")
    print(f"  F1-Score:  {f1:.3f}")
    print(f"  AUC:       {auc:.3f}")
    
    # Update results with tuned models
    results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_score': auc
    }

## 8. Performance Metrics and Confusion Matrix

Let's dive deep into the performance analysis with detailed confusion matrices and error analysis.

In [None]:
# Detailed confusion matrix analysis
def plot_confusion_matrices(results, y_test):
    """Plot one annotated confusion-matrix heatmap per model.

    The subplot grid grows with the number of models (three per row), so
    every entry in ``results`` is drawn instead of being cut off after six.

    Args:
        results: Mapping of model name to a dict that holds at least
            ``'y_pred'`` (predicted labels for ``y_test``) and
            ``'accuracy'`` (shown in the subplot title).
        y_test: True labels, encoded as 0 (Normal) / 1 (Fraud).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if not results:
        # Nothing to draw; skip creating an empty figure.
        return

    n_cols = 3
    n_rows = (len(results) + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.flatten()
    class_names = ['Normal', 'Fraud']

    for ax, (name, result) in zip(axes, results.items()):
        # Pin the label order so the matrix is always 2x2 and matches the
        # tick labels, even if a class is absent from y_test / y_pred.
        cm = confusion_matrix(y_test, result['y_pred'], labels=[0, 1])

        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax)

        ax.set_title(f'{name}\nAccuracy: {result["accuracy"]:.3f}',
                     fontsize=12, fontweight='bold')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')

        # Add each cell's share of all samples just below its count
        total = cm.sum()
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                percent = cm[i, j] / total * 100
                ax.text(j + 0.5, i + 0.7, f'({percent:.1f}%)',
                        ha='center', va='center', fontsize=10, color='red')

    # Remove the unused subplots in the last row
    for ax in axes[len(results):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# Draw the confusion-matrix grid for every model collected in `results`
plot_confusion_matrices(results, y_test)

# Per-class precision / recall / F1 tables, one per model
print("📊 DETAILED CLASSIFICATION REPORTS")
print("=" * 60)

report_options = {'target_names': ['Normal', 'Fraud'], 'digits': 3}
for name, result in results.items():
    print(f"\n🤖 {name}")
    print("-" * 40)
    report_text = classification_report(y_test, result['y_pred'], **report_options)
    print(report_text)

# Error analysis
print(f"\n🔍 ERROR ANALYSIS")
print("="*50)

# Get the best performing model (highest test F1) for detailed analysis
best_model_name = max(results, key=lambda model_name: results[model_name]['f1_score'])
best_result = results[best_model_name]

print(f"Analyzing errors for: {best_model_name}")
print(f"Best F1-Score: {best_result['f1_score']:.3f}")

# Get test data for error analysis (not used below; kept so the notebook
# global remains available)
X_test_indices = y_test.index if hasattr(y_test, 'index') else range(len(y_test))

# Index labels of the test rows inside `df`, aligned by position with y_test.
# The labels are taken straight from y_test rather than through
# df.iloc[y_test.index]: iloc is positional, so feeding it index labels only
# selects the right rows when df has a default 0..n-1 index.
if hasattr(y_test, 'iloc'):
    test_row_labels = y_test.index
else:
    # NOTE(review): fallback kept from the original code - it assumes the
    # test samples are the last len(y_test) rows of df; confirm against the
    # way the split was made.
    test_row_labels = df.index[-len(y_test):]

# Find misclassified samples (positions within the test split)
y_true_values = np.asarray(y_test)
y_pred_best = best_result['y_pred']
misclassified_mask = y_test != y_pred_best
misclassified_indices = [i for i, is_wrong in enumerate(misclassified_mask) if is_wrong]

print(f"\nTotal misclassified samples: {len(misclassified_indices)}")

# Analyze false positives and false negatives as
# (test position, true label, predicted label) tuples
false_positives = [(i, y_true_values[i], y_pred_best[i])
                   for i in misclassified_indices
                   if y_true_values[i] == 0 and y_pred_best[i] == 1]

false_negatives = [(i, y_true_values[i], y_pred_best[i])
                   for i in misclassified_indices
                   if y_true_values[i] == 1 and y_pred_best[i] == 0]

print(f"False Positives (Normal predicted as Fraud): {len(false_positives)}")
print(f"False Negatives (Fraud predicted as Normal): {len(false_negatives)}")


def _print_misclassified(header, examples, verdict_line, row_labels, limit=3):
    """Print up to ``limit`` misclassified messages under ``header``.

    Args:
        header: Title line for the section.
        examples: ``(test position, true label, predicted label)`` tuples.
        verdict_line: Line printed after each message (true vs predicted).
        row_labels: Index labels of the test rows in ``df``, aligned by
            position with the test split.
        limit: Maximum number of examples to print.
    """
    if not examples:
        return

    print(header)
    print("-" * 30)
    for rank, (position, _true_label, _pred_label) in enumerate(examples[:limit], start=1):
        row_label = row_labels[position]
        message = df.loc[row_label, 'message'] if row_label in df.index else "Message not found"
        print(f"{rank}. {message[:100]}...")
        print(verdict_line)
        print()


# Show examples of misclassified samples
_print_misclassified(f"\n❌ FALSE POSITIVE EXAMPLES:", false_positives,
                     "   True: Normal, Predicted: Fraud", test_row_labels)
_print_misclassified(f"\n❌ FALSE NEGATIVE EXAMPLES:", false_negatives,
                     "   True: Fraud, Predicted: Normal", test_row_labels)

# Cost analysis for fraud detection
print(f"\n💰 COST ANALYSIS")
print("="*30)

# Assuming costs: FN (missed fraud) = $1000, FP (false alarm) = $10
cost_fn = 1000  # Cost of missing a fraud
cost_fp = 10    # Cost of false alarm

for name, result in results.items():
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below
    # cannot fail when a class is missing from both y_test and y_pred.
    cm = confusion_matrix(y_test, result['y_pred'], labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    fn_cost = fn * cost_fn
    fp_cost = fp * cost_fp
    total_cost = fn_cost + fp_cost
    print(f"{name}:")
    print(f"  False Negatives: {fn} x ${cost_fn} = ${fn_cost}")
    print(f"  False Positives: {fp} x ${cost_fp} = ${fp_cost}")
    print(f"  Total Cost: ${total_cost}")
    print(f"  Cost per sample: ${total_cost / len(y_test):.2f}")
    print()

# Performance at different thresholds
print(f"\n📈 THRESHOLD ANALYSIS")
print("="*30)

# Analyze best model at different thresholds
best_model = best_result['model']
y_proba = best_result['y_pred_proba']

thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
print(f"Model: {best_model_name}")
print(f"{'Threshold':<10} {'Precision':<10} {'Recall':<10} {'F1':<10} {'Cost':<10}")
print("-" * 50)

# Metrics are computed once per threshold and reused for the plot below.
threshold_precisions = []
threshold_recalls = []
threshold_f1s = []

for threshold in thresholds:
    y_pred_thresh = (y_proba >= threshold).astype(int)

    # zero_division=0 makes the "nothing predicted as fraud" case explicit
    # (score 0.0) instead of relying on the default warn-and-return-0.
    precision_thresh = precision_score(y_test, y_pred_thresh, zero_division=0)
    recall_thresh = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1_thresh = f1_score(y_test, y_pred_thresh, zero_division=0)

    # Calculate cost; labels=[0, 1] keeps the matrix 2x2 even when a
    # threshold pushes every prediction into a single class.
    cm_thresh = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    tn, fp, fn, tp = cm_thresh.ravel()
    cost_thresh = fn * cost_fn + fp * cost_fp

    threshold_precisions.append(precision_thresh)
    threshold_recalls.append(recall_thresh)
    threshold_f1s.append(f1_thresh)

    print(f"{threshold:<10.1f} {precision_thresh:<10.3f} {recall_thresh:<10.3f} "
          f"{f1_thresh:<10.3f} ${cost_thresh:<9.0f}")

# Plot precision-recall curve
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

from sklearn.metrics import precision_recall_curve

precision_curve, recall_curve, thresholds_pr = precision_recall_curve(y_test, y_proba)

ax1.plot(recall_curve, precision_curve, color='blue', linewidth=2)
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title(f'Precision-Recall Curve\n{best_model_name}', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Plot threshold vs metrics (values collected in the loop above)
ax2.plot(thresholds, threshold_precisions, label='Precision', marker='o')
ax2.plot(thresholds, threshold_recalls, label='Recall', marker='s')
ax2.plot(thresholds, threshold_f1s, label='F1-Score', marker='^')

ax2.set_xlabel('Threshold')
ax2.set_ylabel('Score')
ax2.set_title('Metrics vs Threshold', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Model Prediction on New Text Samples

Now let's test our trained models on new, unseen text samples to see how they perform in practice.

In [None]:
class FraudDetectionPredictor:
    """
    Bundles a trained classifier with the preprocessing objects needed to
    score a single raw message.

    The collaborators are used only through the calls made below:
    ``preprocessor.preprocess`` / ``preprocessor.extract_features``,
    ``vectorizer.transform``, ``scaler.transform`` and
    ``model.predict`` / ``model.predict_proba``.
    """

    def __init__(self, model, vectorizer, preprocessor, scaler, feature_cols):
        self.model = model
        self.vectorizer = vectorizer
        self.preprocessor = preprocessor
        self.scaler = scaler
        # Keys, in column order, read from the extract_features() dict
        self.feature_cols = feature_cols

    def predict_message(self, message):
        """
        Classify one message as 'Fraud' or 'Normal'.

        Returns a dict with the original ``message``, the ``prediction``
        label, ``confidence`` (the largest class probability) and the
        ``fraud_probability`` / ``normal_probability`` taken from columns
        1 and 0 of ``predict_proba``.
        """
        # Text branch: clean the message, then vectorize it
        text_matrix = self.vectorizer.transform(
            [self.preprocessor.preprocess(message)]
        )

        # Engineered-feature branch: computed from the raw message, ordered
        # by feature_cols, then scaled
        extracted = self.preprocessor.extract_features(message)
        feature_row = [extracted[col] for col in self.feature_cols]
        scaled_row = self.scaler.transform(np.array([feature_row]))

        # One row: text columns first, scaled engineered features after
        model_input = np.concatenate([text_matrix.toarray(), scaled_row], axis=1)

        predicted_label = self.model.predict(model_input)[0]
        class_probs = self.model.predict_proba(model_input)[0]

        verdict = 'Normal'
        if predicted_label == 1:
            verdict = 'Fraud'

        return {
            'message': message,
            'prediction': verdict,
            'confidence': max(class_probs),
            'fraud_probability': class_probs[1],
            'normal_probability': class_probs[0],
        }

# Wrap the best-scoring model together with the fitted feature pipeline
best_model = results[best_model_name]['model']
predictor = FraudDetectionPredictor(
    best_model,
    tfidf_vectorizer,
    preprocessor,
    scaler,
    additional_features,
)

# Hand-written probe messages, grouped by how obvious the label should be
test_messages = [
    # Clear fraud examples
    "URGENT: Your account is suspended! Click here immediately to restore access or lose your money forever!",
    "Congratulations! You've won $1,000,000 in our lottery! Send $500 processing fee to claim your prize now!",
    "FINAL NOTICE: Pay $2000 immediately or we will take legal action against you today!",
    "Your bank account has been compromised. Verify your details at suspicious-bank-link.com right now!",

    # Clear normal examples
    "Hey, thanks for helping me with the project yesterday. The presentation went really well!",
    "Reminder: Your doctor's appointment is scheduled for tomorrow at 2 PM",
    "The team meeting has been moved to Friday at 10 AM in conference room B",
    "Happy birthday! Hope you have a wonderful celebration with your family",

    # Ambiguous/edge cases
    "Limited time offer: 50% off all items. Sale ends soon!",
    "Your order has been delayed due to shipping issues. Additional fees may apply",
    "Security alert: We detected unusual activity on your account. Please review",
    "Investment opportunity: High returns guaranteed with our new fund",
]

print("🔮 TESTING ON NEW MESSAGES")
print("=" * 80)

# Score each probe message, keep the result dict, and print a short report
predictions = []
for i, message in enumerate(test_messages, 1):
    result = predictor.predict_message(message)
    predictions.append(result)

    # Marker for the predicted class
    if result['prediction'] == 'Fraud':
        emoji = "🚨"
    else:
        emoji = "✅"

    # Bucket the confidence: above 0.8 HIGH, above 0.6 MEDIUM, otherwise LOW
    if result['confidence'] > 0.8:
        confidence_color = "HIGH"
    elif result['confidence'] > 0.6:
        confidence_color = "MEDIUM"
    else:
        confidence_color = "LOW"

    print(
        f"\n{emoji} Test Message {i}:",
        f"Message: {message}",
        f"Prediction: {result['prediction']} (Confidence: {confidence_color} - {result['confidence']:.3f})",
        f"Fraud Probability: {result['fraud_probability']:.3f}",
        "-" * 80,
        sep="\n",
    )

# Create visualization of predictions
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# 1. Prediction distribution
pred_counts = pd.Series([p['prediction'] for p in predictions]).value_counts()
colors = ['green' if label == 'Normal' else 'red' for label in pred_counts.index]
# Axes.pie has no `alpha` keyword (passing one raises TypeError); wedge
# transparency has to be supplied through `wedgeprops`.
ax1.pie(pred_counts.values, labels=pred_counts.index, autopct='%1.1f%%',
        colors=colors, wedgeprops={'alpha': 0.7})
ax1.set_title('Prediction Distribution on Test Messages', fontsize=14, fontweight='bold')

# 2. Confidence distribution, split by predicted class
confidences = [p['confidence'] for p in predictions]
fraud_confidences = [p['confidence'] for p in predictions if p['prediction'] == 'Fraud']
normal_confidences = [p['confidence'] for p in predictions if p['prediction'] == 'Normal']

ax2.hist(fraud_confidences, alpha=0.7, label='Fraud Predictions', color='red', bins=5)
ax2.hist(normal_confidences, alpha=0.7, label='Normal Predictions', color='green', bins=5)
ax2.set_xlabel('Confidence Score')
ax2.set_ylabel('Number of Predictions')
ax2.set_title('Confidence Distribution by Prediction', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Interactive prediction function
def interactive_prediction():
    """
    Interactive function for testing custom messages.

    Reads messages from standard input in a loop and prints the prediction
    of the module-level ``predictor`` for each one. The loop ends when the
    user types 'quit', 'exit' or 'q', when input is exhausted (EOF), or
    when the prompt is interrupted. Blank input is rejected with a hint.

    Returns:
        None.
    """
    print("\n🎯 INTERACTIVE FRAUD DETECTION")
    print("="*50)
    print("Enter a message to test (or 'quit' to exit):")

    while True:
        try:
            user_message = input("\nMessage: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session the
            # same way as typing 'quit', instead of surfacing a traceback.
            print("\n👋 Thanks for testing!")
            break

        if user_message.lower() in ['quit', 'exit', 'q']:
            print("👋 Thanks for testing!")
            break

        if not user_message:
            print("⚠️ Please enter a message.")
            continue

        try:
            result = predictor.predict_message(user_message)

            emoji = "🚨" if result['prediction'] == 'Fraud' else "✅"
            print(f"\n{emoji} Prediction: {result['prediction']}")
            print(f"   Confidence: {result['confidence']:.3f}")
            print(f"   Fraud Probability: {result['fraud_probability']:.3f}")

            if result['prediction'] == 'Fraud':
                print("   ⚠️ This message appears to be fraudulent!")
            else:
                print("   ✅ This message appears to be legitimate.")

        except Exception as e:
            # Deliberately broad: one bad message should be reported and
            # must not end the interactive session.
            print(f"❌ Error processing message: {e}")

# Uncomment the line below to run interactive prediction
# interactive_prediction()

# Save the best model for future use
print(f"\n💾 MODEL PERSISTENCE")
print("="*30)

import joblib

# Everything needed to rebuild a predictor, plus descriptive metadata
# (model name and the test metrics recorded in `results`).
best_metrics = results[best_model_name]
model_pipeline = {
    'model': best_model,
    'vectorizer': tfidf_vectorizer,
    'preprocessor': preprocessor,
    'scaler': scaler,
    'feature_columns': additional_features,
    'model_name': best_model_name,
    'performance_metrics': {
        metric_name: best_metrics[metric_name]
        for metric_name in ('accuracy', 'precision', 'recall', 'f1_score', 'auc_score')
    }
}

# Save to file
# joblib.dump(model_pipeline, 'fraud_detection_pipeline.pkl')
print("Model pipeline ready for saving with joblib.dump()")

# The loading hint passes the constructor arguments explicitly: the saved
# dict also carries 'feature_columns', 'model_name' and 'performance_metrics',
# which are not FraudDetectionPredictor.__init__ parameters, so unpacking it
# with ** would raise a TypeError.
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
print(f"\nTo load the model later:")
print("model_pipeline = joblib.load('fraud_detection_pipeline.pkl')")
print("predictor = FraudDetectionPredictor(")
print("    model=model_pipeline['model'],")
print("    vectorizer=model_pipeline['vectorizer'],")
print("    preprocessor=model_pipeline['preprocessor'],")
print("    scaler=model_pipeline['scaler'],")
print("    feature_cols=model_pipeline['feature_columns'],")
print(")")

# Closing recap: winning model, headline metric, feature counts, next steps
print("\n📋 IMPLEMENTATION SUMMARY")
print("=" * 50)
print(f"🏆 Best Model: {best_model_name}")
best_f1_score = results[best_model_name]['f1_score']
print(f"📊 Performance: F1-Score = {best_f1_score:.3f}")
print("🔧 Key Features:")
tfidf_feature_count = X_tfidf.shape[1]
print(f"   - TF-IDF vectorization with {tfidf_feature_count} features")
engineered_feature_count = len(additional_features)
print(f"   - Additional engineered features: {engineered_feature_count}")
# The remaining lines are static text, emitted one per line
print(
    "   - Cross-validation for robust evaluation",
    "   - Hyperparameter tuning",
    "   - Cost-sensitive analysis",
    "\n✨ Next Steps:",
    "   1. Deploy as web service (Flask/FastAPI)",
    "   2. Implement real-time monitoring",
    "   3. Add more sophisticated features",
    "   4. Experiment with BERT/transformer models",
    "   5. Collect and retrain on real-world data",
    sep="\n",
)