# In [None]:
# Spam Email Detection using Logistic Regression
# Mini Project Implementation

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data (run once)
# Each resource is checked on its own so only what is actually missing gets
# downloaded. Recent NLTK releases (3.9+) load 'punkt_tab' inside
# word_tokenize, while older releases load 'punkt', so both are ensured.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

class SpamDetector:
    """
    Spam/ham email classifier built from TF-IDF features and logistic regression.

    Labels are expected to be binary integers: 1 for spam and 0 for ham.
    Typical usage: load_and_prepare_data() -> train() -> evaluate_model(),
    then predict_single_email() for new messages.
    """

    # Label values and display names, in the order used for reports/plots.
    _LABELS = (0, 1)
    _LABEL_NAMES = ('Ham', 'Spam')

    # Compiled once instead of on every preprocess_text() call.
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        self.model = LogisticRegression(random_state=42, max_iter=1000)
        self.stop_words = set(stopwords.words('english'))

        # Hold-out data and predictions; populated by train(), None until then.
        self.X_test = None
        self.y_test = None
        self.y_pred = None
        self.y_pred_proba = None

    def _require_trained(self):
        """
        Raise RuntimeError unless train() has stored test-set results.
        """
        if getattr(self, 'y_test', None) is None or getattr(self, 'y_pred', None) is None:
            raise RuntimeError(
                "Model has not been trained yet; call train() before evaluating."
            )

    def preprocess_text(self, text):
        """
        Normalize raw email text into a space-separated string of tokens.

        Steps: lowercase, strip URLs, email addresses, digits and ASCII
        punctuation, collapse whitespace, tokenize, then drop stopwords and
        tokens of two characters or fewer.

        Args:
            text: The raw text. Non-string values are converted with str();
                missing values (None, NaN, NA) yield an empty string.

        Returns:
            The cleaned text as a single string (possibly empty).
        """
        if not isinstance(text, str):
            # A missing value would otherwise be stringified ("none", "nan")
            # and processed as if it were email text.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''
            text = str(text)

        text = text.lower()
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        text = self._DIGITS_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)

        # Collapse runs of whitespace left behind by the removals above.
        text = ' '.join(text.split())

        tokens = word_tokenize(text)
        kept = [
            token for token in tokens
            if len(token) > 2 and token not in self.stop_words
        ]
        return ' '.join(kept)

    def load_and_prepare_data(self, file_path=None, use_sample=True):
        """
        Load the dataset as a DataFrame.

        Args:
            file_path: Path to a CSV file. Only read when use_sample is False.
            use_sample: When True (the default), or when file_path is None,
                a built-in 30-row demonstration dataset with 'text' and
                'label' columns is returned instead of reading a file.

        Returns:
            A pandas DataFrame. train() expects 'text' and 'label' columns.
        """
        if use_sample or file_path is None:
            if file_path is not None:
                # Make the precedence visible instead of silently dropping the path.
                print(f"Note: file_path={file_path!r} ignored because use_sample=True")

            # Create a sample dataset for demonstration
            sample_data = {
                'text': [
                    "Congratulations! You've won $1000! Click here to claim your prize now!",
                    "Hi, let's meet for coffee tomorrow at 3pm",
                    "URGENT: Your account will be suspended. Verify now!",
                    "Thanks for the meeting today. Here are the notes we discussed",
                    "FREE VIAGRA! No prescription needed! Order now!",
                    "Can you send me the report by end of day?",
                    "WINNER! You are selected for a cash prize of $5000!",
                    "Happy birthday! Hope you have a wonderful day",
                    "CHEAP LOANS! Apply now for instant approval!",
                    "Reminder: Team meeting scheduled for tomorrow at 10am",
                    "Make money fast! Work from home opportunity!",
                    "Please review the attached document and provide feedback",
                    "HOT SINGLES in your area! Chat now!",
                    "The project deadline has been extended to next week",
                    "PHARMACY ONLINE - Best prices guaranteed!",
                    "Thanks for your help with the presentation",
                    "ACT NOW! Limited time offer expires soon!",
                    "Could you please send me your contact details?",
                    "CASINO BONUS! Free spins available now!",
                    "Meeting reschedule: New time is 2pm Thursday",
                    "Weight loss miracle! Lose 30 pounds in 30 days!",
                    "Please find the monthly report attached",
                    "CLICK HERE for amazing deals and discounts!",
                    "Let me know if you need any assistance",
                    "Nigerian prince needs your help transferring money",
                    "The weather forecast shows rain for tomorrow",
                    "PRIZE ALERT! You've been selected as a winner!",
                    "Can we schedule a call to discuss the project?",
                    "ADULT CONTENT! 18+ only! Click here now!",
                    "Thank you for your order. Delivery expected in 3 days"
                ],
                'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
            }
            df = pd.DataFrame(sample_data)
            print("Using sample dataset for demonstration")
        else:
            # Load actual dataset
            df = pd.read_csv(file_path)
            print(f"Loaded dataset with {len(df)} rows")

        return df

    def train(self, df):
        """
        Train the spam detection model on a stratified 70/30 split.

        The input DataFrame is not modified. The held-out texts, labels,
        predictions and predicted probabilities are stored on the instance
        for evaluate_model() and plot_confusion_matrix().

        Args:
            df: DataFrame with a 'text' column and a binary 'label' column
                (1 = spam, 0 = ham).

        Returns:
            Tuple (X_train_tfidf, X_test_tfidf, y_train, y_test, y_pred).
        """
        print("\n=== TRAINING SPAM DETECTION MODEL ===")

        # Data overview
        spam_count = df['label'].sum()
        print(f"\nDataset shape: {df.shape}")
        print(f"Spam emails: {spam_count}")
        print(f"Ham emails: {len(df) - spam_count}")

        # Preprocess into a separate Series so the caller's frame stays untouched.
        print("\nPreprocessing text data...")
        X = df['text'].apply(self.preprocess_text)
        y = df['label']

        # Train-test split (70-30), keeping the class ratio in both parts
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        print(f"Training set size: {len(X_train)}")
        print(f"Test set size: {len(X_test)}")

        # Fit the vocabulary on the training part only, then reuse it for the test part.
        print("\nApplying TF-IDF vectorization...")
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        print(f"Feature dimensions: {X_train_tfidf.shape[1]}")

        # Train logistic regression model
        print("\nTraining Logistic Regression model...")
        self.model.fit(X_train_tfidf, y_train)

        # Make predictions
        y_pred = self.model.predict(X_test_tfidf)
        y_pred_proba = self.model.predict_proba(X_test_tfidf)

        # Store results for evaluation
        self.X_test = X_test
        self.y_test = y_test
        self.y_pred = y_pred
        self.y_pred_proba = y_pred_proba

        print("Model training completed!")

        return X_train_tfidf, X_test_tfidf, y_train, y_test, y_pred

    def evaluate_model(self):
        """
        Print and return test-set metrics for the trained model.

        Returns:
            Tuple (accuracy, precision, recall, f1), with spam (label 1) as
            the positive class.

        Raises:
            RuntimeError: If train() has not been called yet.
        """
        self._require_trained()

        print("\n=== MODEL EVALUATION ===")

        # zero_division=0 yields 0.0 (instead of a warning) when a metric is undefined.
        accuracy = accuracy_score(self.y_test, self.y_pred)
        precision = precision_score(self.y_test, self.y_pred, zero_division=0)
        recall = recall_score(self.y_test, self.y_pred, zero_division=0)
        f1 = f1_score(self.y_test, self.y_pred, zero_division=0)

        print("\nPerformance Metrics:")
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1 Score:  {f1:.4f}")

        # Explicit labels keep the report aligned with target_names even if
        # one class happens to be absent from the test split.
        print("\nDetailed Classification Report:")
        print(classification_report(self.y_test, self.y_pred,
                                    labels=list(self._LABELS),
                                    target_names=list(self._LABEL_NAMES),
                                    zero_division=0))

        return accuracy, precision, recall, f1

    def plot_confusion_matrix(self):
        """
        Plot the test-set confusion matrix as a heatmap.

        Returns:
            The 2x2 confusion matrix (rows = actual, columns = predicted,
            ordered Ham then Spam).

        Raises:
            RuntimeError: If train() has not been called yet.
        """
        self._require_trained()

        # Explicit labels guarantee a 2x2 matrix that matches the tick labels.
        cm = confusion_matrix(self.y_test, self.y_pred, labels=list(self._LABELS))

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=list(self._LABEL_NAMES),
                    yticklabels=list(self._LABEL_NAMES))
        plt.title('Confusion Matrix - Spam Detection')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        return cm

    def get_important_features(self, top_n=20):
        """
        Print the strongest spam and ham indicator terms.

        Args:
            top_n: Total number of terms to print; half are taken from the
                positive (spam) coefficients and half from the negative (ham).

        Returns:
            DataFrame with columns 'feature', 'coefficient' and
            'abs_coefficient', sorted by descending absolute coefficient.
        """
        print(f"\n=== TOP {top_n} FEATURES FOR SPAM DETECTION ===")

        # Get feature names and coefficients
        feature_names = self.vectorizer.get_feature_names_out()
        coefficients = self.model.coef_[0]

        # Create feature importance dataframe
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'coefficient': coefficients,
            'abs_coefficient': np.abs(coefficients)
        }).sort_values('abs_coefficient', ascending=False)

        per_class = top_n // 2
        sections = (
            ("\nTop spam indicators (positive coefficients):",
             feature_importance['coefficient'] > 0),
            ("\nTop ham indicators (negative coefficients):",
             feature_importance['coefficient'] < 0),
        )
        for heading, mask in sections:
            print(heading)
            top_rows = feature_importance[mask].head(per_class)
            for row in top_rows.itertuples(index=False):
                print(f"  {row.feature}: {row.coefficient:.4f}")

        return feature_importance

    def predict_single_email(self, email_text):
        """
        Classify a single email with the trained model.

        Args:
            email_text: Raw email text.

        Returns:
            Dict with keys 'prediction' ('SPAM' or 'HAM'),
            'spam_probability', 'ham_probability' and 'confidence'
            (the larger of the two probabilities), as plain floats.
        """
        # Apply the same preprocessing that was used for training.
        processed_email = self.preprocess_text(email_text)

        # Vectorize
        email_tfidf = self.vectorizer.transform([processed_email])

        # Predict
        prediction = self.model.predict(email_tfidf)[0]
        probability = self.model.predict_proba(email_tfidf)[0]

        # Plain floats keep the result easy to serialize or log.
        return {
            'prediction': 'SPAM' if prediction == 1 else 'HAM',
            'spam_probability': float(probability[1]),
            'ham_probability': float(probability[0]),
            'confidence': float(max(probability))
        }

# Main execution
def main():
    """
    Run the full demo: train on the sample data, evaluate, plot, list the
    most influential terms, classify a few hand-written emails and print
    closing notes.

    Returns:
        Tuple (detector, accuracy, precision, recall, f1).
    """
    print("SPAM EMAIL DETECTION USING LOGISTIC REGRESSION")
    print("=" * 50)

    # Build the detector and fit it on the built-in sample dataset.
    detector = SpamDetector()
    dataset = detector.load_and_prepare_data(use_sample=True)
    detector.train(dataset)

    # Metrics, confusion-matrix plot and feature ranking for the fitted model.
    accuracy, precision, recall, f1 = detector.evaluate_model()
    detector.plot_confusion_matrix()
    detector.get_important_features(top_n=20)

    # Try the model on a handful of hand-written messages.
    print("\n=== TESTING WITH CUSTOM EMAILS ===")

    sample_messages = (
        "Congratulations! You've won $10,000! Click here immediately!",
        "Hi John, can we schedule a meeting for tomorrow?",
        "URGENT: Your account will be closed! Verify now!",
        "Thanks for the great presentation yesterday.",
    )

    for number, message in enumerate(sample_messages, start=1):
        outcome = detector.predict_single_email(message)
        print(f"\nTest Email {number}: {message[:50]}...")
        print(f"Prediction: {outcome['prediction']}")
        print(f"Confidence: {outcome['confidence']:.4f}")
        print(f"Spam Probability: {outcome['spam_probability']:.4f}")

    # Static closing notes, printed one line at a time.
    closing_notes = (
        "\n=== MODEL INSIGHTS AND IMPROVEMENTS ===",
        "\nKey Findings:",
        "1. The model successfully identifies spam patterns",
        "2. Common spam words include: free, winner, urgent, click",
        "3. Ham emails typically contain normal conversational language",
        "\nPossible Improvements:",
        "1. Use larger, more diverse datasets",
        "2. Try ensemble methods (Random Forest, Gradient Boosting)",
        "3. Implement feature engineering (email metadata, sender patterns)",
        "4. Use advanced NLP techniques (word embeddings, BERT)",
        "5. Regular model retraining with new spam patterns",
    )
    for note in closing_notes:
        print(note)

    return detector, accuracy, precision, recall, f1

# Script entry point: run the demo, then print a short closing summary.
if __name__ == "__main__":
    detector, accuracy, precision, recall, f1 = main()

    # One print call, one line per argument.
    print(
        "\n=== FINAL RESULTS ===",
        "Model Accuracy: {:.4f}".format(accuracy),
        "F1 Score: {:.4f}".format(f1),
        "Project completed successfully!",
        sep="\n",
    )