In [7]:
import pandas as pd
import numpy as np
import re
import string
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

class DatasetAnalyzer:
    """Load a CSV dataset, print a summary of it and guess its text/label columns."""

    @staticmethod
    def analyze_dataset(file_path):
        """Load the CSV at ``file_path`` and print an overview of its contents.

        The file is read with each of several encodings in turn until one
        decodes. The shape, column list, memory usage, per-column
        dtype/null/unique counts and the first three rows are printed,
        followed by the auto-detected text and label columns and the label
        distribution when both columns were found.

        Args:
            file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.

        Returns:
            A ``(df, text_col, label_col)`` tuple. ``text_col`` and
            ``label_col`` are ``None`` when they could not be detected. On any
            failure the result is ``(None, None, None)`` so callers can always
            unpack three values.
        """
        try:
            df = None

            for encoding in ('utf-8', 'latin-1', 'cp1252'):
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                except UnicodeDecodeError:
                    # Wrong encoding guess: try the next one. Every other
                    # error (missing file, malformed CSV, ...) is reported by
                    # the outer handler instead of being silently swallowed.
                    continue
                print(f"✓ Dataset loaded successfully with {encoding} encoding")
                break

            if df is None:
                print("❌ Could not load dataset. Please check file format.")
                return None, None, None

            print("\n" + "="*60)
            print("DATASET ANALYSIS")
            print("="*60)

            print(f"📊 Dataset Shape: {df.shape}")
            print(f"📁 Columns: {list(df.columns)}")
            print(f"💾 Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

            print("\n📋 Column Information:")
            print("-" * 40)
            for col in df.columns:
                dtype = df[col].dtype
                null_count = df[col].isnull().sum()
                unique_count = df[col].nunique()
                # str(col) keeps the format spec valid for non-string headers.
                print(f"{str(col):15} | {str(dtype):10} | Nulls: {null_count:4} | Unique: {unique_count:6}")

            print("\n🔍 First 3 rows:")
            print(df.head(3).to_string())

            # Detect text and label columns
            text_col, label_col = DatasetAnalyzer.detect_columns(df)

            if text_col is not None and label_col is not None:
                print("\n✅ Auto-detected:")
                print(f"   Text column: '{text_col}'")
                print(f"   Label column: '{label_col}'")

                # Analyze labels
                label_counts = df[label_col].value_counts()
                print("\n📈 Label Distribution:")
                for label, count in label_counts.items():
                    percentage = (count / len(df)) * 100
                    print(f"   {label}: {count} ({percentage:.1f}%)")

            return df, text_col, label_col

        except Exception as e:
            print(f"❌ Error analyzing dataset: {e}")
            return None, None, None

    @staticmethod
    def detect_columns(df):
        """Guess which columns of ``df`` hold the message text and the label.

        Column names are first matched (case-insensitively, by substring)
        against lists of common names. If no text column is found by name,
        the string-like column with the longest average length above 20
        characters is used. If no label column is found by name, the first
        column other than the text column with at most 10 unique values is
        used. The text column is never returned as the label column.

        Args:
            df: The DataFrame to inspect.

        Returns:
            A ``(text_col, label_col)`` tuple; either element is ``None``
            when no suitable column exists.
        """
        # Common text column names
        text_candidates = ['text', 'message', 'email', 'content', 'body', 'mail', 'subject']
        # Common label column names
        label_candidates = ['label', 'class', 'category', 'target', 'spam', 'type']

        def find_by_name(keywords, skip=None):
            """Return the first column whose name contains one of ``keywords``."""
            for col in df.columns:
                if skip is not None and col == skip:
                    continue
                # Headers may be ints or other non-string objects.
                name = str(col).lower()
                if any(keyword in name for keyword in keywords):
                    return col
            return None

        text_col = find_by_name(text_candidates)

        # If not found, use the column with longest average text
        if text_col is None:
            text_lengths = {}
            for col in df.columns:
                series = df[col]
                # Accept both legacy object columns and dedicated string dtypes.
                if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                    avg_length = series.astype(str).str.len().mean()
                    if avg_length > 20:  # Assuming text should be reasonably long
                        text_lengths[col] = avg_length

            if text_lengths:
                text_col = max(text_lengths, key=text_lengths.get)

        # A name such as 'spam_text' matches both keyword lists; the text
        # column must not double as the label column.
        label_col = find_by_name(label_candidates, skip=text_col)

        # If not found, use column with few unique values (likely categorical)
        if label_col is None:
            for col in df.columns:
                if col != text_col and df[col].nunique() <= 10:
                    label_col = col
                    break

        return text_col, label_col

class SpamDetectorML:
    """Train, compare and apply classical ML spam classifiers on email text.

    After ``train_and_evaluate`` succeeds, ``best_model`` and
    ``best_vectorizer`` hold the fitted winning pair, ``best_combo`` its
    display name, and ``predict_email`` can classify new messages.
    """

    # Label spellings treated as the positive (spam) class.
    _SPAM_INDICATORS = ('spam', '1', 'positive', 'yes', 'true')
    # Keywords counted by the hand-crafted ``spam_words`` feature.
    _SPAM_WORDS = ('free', 'winner', 'urgent', 'limited', 'offer', 'money', 'prize')

    def __init__(self):
        """Create an untrained detector and load English stop words if available."""
        self.models = {}
        self.vectorizers = {}
        self.stemmer = PorterStemmer()
        try:
            self.stop_words = set(stopwords.words('english')) if 'english' in stopwords.fileids() else set()
        except LookupError:
            # The NLTK stopwords corpus is not installed; fall back to no
            # stop-word filtering instead of failing construction.
            self.stop_words = set()
        self.best_model = None
        self.best_vectorizer = None
        self.best_combo = None
        self.label_mapping = None

    @classmethod
    def _find_spam_label(cls, unique_labels):
        """Return the label that denotes spam, or ``None`` if it is ambiguous.

        An exact (case-insensitive) match against the indicator list wins
        over a substring match, so ``'spam'`` is preferred over
        ``'not spam'``. A match is only accepted when exactly one label
        qualifies.
        """
        names = {label: str(label).strip().lower() for label in unique_labels}

        exact = [label for label, name in names.items() if name in cls._SPAM_INDICATORS]
        if len(exact) == 1:
            return exact[0]

        partial = [
            label for label, name in names.items()
            if any(indicator in name for indicator in cls._SPAM_INDICATORS)
        ]
        if len(partial) == 1:
            return partial[0]

        return None

    @staticmethod
    def _build_models():
        """Return a fresh, unfitted instance of every candidate classifier."""
        return {
            'Naive Bayes': MultinomialNB(),
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

    def prepare_data(self, df, text_col, label_col):
        """Extract texts and binary labels (spam=1, ham=0) from ``df``.

        Missing texts become ``''`` and missing labels become ``'unknown'``
        (which counts as a label of its own). The dataset must contain
        exactly two distinct labels. The resulting mapping is stored in
        ``self.label_mapping``.

        Args:
            df: Source DataFrame.
            text_col: Name of the column holding the message text.
            label_col: Name of the column holding the class label.

        Returns:
            ``(texts, binary_labels)`` as two lists of equal length, or
            ``(None, None)`` when the label column does not hold exactly two
            distinct values.
        """
        print("\n" + "="*60)
        print("DATA PREPARATION")
        print("="*60)

        # Extract texts and labels
        texts = df[text_col].fillna('').astype(str).tolist()
        labels = df[label_col].fillna('unknown').tolist()

        # Sorted so that the fallback choice below does not depend on the
        # arbitrary iteration order of a set.
        unique_labels = sorted(set(labels), key=str)
        print(f"📊 Found labels: {unique_labels}")

        if len(unique_labels) != 2:
            print(f"❌ Expected 2 labels, found {len(unique_labels)}. Please check your data.")
            return None, None

        # Map labels to binary (spam=1, ham=0)
        spam_label = self._find_spam_label(unique_labels)
        if spam_label is None:
            # Unclear which label is spam: use the first one and say so.
            spam_label = unique_labels[0]
            ham_label = unique_labels[1]
            print(f"⚠️  Assuming '{spam_label}' = SPAM, '{ham_label}' = HAM")
        else:
            ham_label = next(label for label in unique_labels if label != spam_label)

        self.label_mapping = {spam_label: 1, ham_label: 0}

        # Convert labels to binary
        binary_labels = [self.label_mapping.get(label, 0) for label in labels]

        print(f"✅ Label mapping: {self.label_mapping}")
        print(f"📈 Final distribution: Spam={sum(binary_labels)}, Ham={len(binary_labels)-sum(binary_labels)}")

        return texts, binary_labels

    def preprocess_text(self, text):
        """Normalise one message into a space-separated token string.

        Lower-cases the text, strips URLs, e-mail addresses and HTML tags,
        collapses repeated ``!``/``?``, removes standalone numbers, and drops
        tokens of two characters or fewer. When stop words are loaded, stop
        words are removed and the remaining tokens are stemmed.

        Args:
            text: The raw message; ``NaN``/``None``/``''`` yield ``''``.

        Returns:
            The cleaned text.
        """
        if pd.isna(text) or text == '':
            return ""

        text = str(text).lower()

        # Remove URLs, emails, HTML
        text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
        text = re.sub(r'\S+@\S+', ' ', text)
        text = re.sub(r'<[^>]+>', ' ', text)

        # Remove extra punctuation
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        # Remove standalone numbers. The lookahead never rejects a match (the
        # next character is always a digit), so for "$100" only the "$"
        # symbol survives.
        text = re.sub(r'\b(?!\$)\d+\b', ' ', text)

        # Clean whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Simple tokenization and stemming
        words = [
            word for word in text.split()
            if len(word) > 2 and word not in string.punctuation
        ]
        if self.stop_words:
            tokens = [self.stemmer.stem(word) for word in words
                      if word not in self.stop_words]
        else:
            tokens = words

        return ' '.join(tokens)

    def extract_features(self, texts):
        """Compute hand-crafted numeric features for each raw text.

        Args:
            texts: Iterable of messages (each is converted with ``str``).

        Returns:
            A DataFrame with one row per text and the columns ``length``,
            ``word_count``, ``exclamation_count``, ``question_count``,
            ``dollar_count``, ``caps_ratio``, ``digit_ratio`` and
            ``spam_words``.
        """
        features = []

        for text in texts:
            text_str = str(text)
            lowered = text_str.lower()
            # max(..., 1) avoids dividing by zero for empty messages.
            char_total = max(len(text_str), 1)

            features.append({
                'length': len(text_str),
                'word_count': len(text_str.split()),
                'exclamation_count': text_str.count('!'),
                'question_count': text_str.count('?'),
                'dollar_count': text_str.count('$'),
                'caps_ratio': sum(1 for c in text_str if c.isupper()) / char_total,
                'digit_ratio': sum(1 for c in text_str if c.isdigit()) / char_total,
                # Number of distinct spam keywords present (substring match).
                'spam_words': sum(1 for word in self._SPAM_WORDS if word in lowered),
            })

        return pd.DataFrame(features)

    def train_and_evaluate(self, texts, labels):
        """Train every vectorizer/model combination and keep the most accurate.

        The data is split 80/20 (stratified, ``random_state=42``). Each
        combination is fitted on the training part and scored on the held-out
        part (accuracy and ROC AUC) plus 5-fold cross-validation on the
        training part. The pair with the highest held-out accuracy is stored
        in ``best_model`` / ``best_vectorizer`` / ``best_combo``; all three
        are reset to ``None`` first.

        Args:
            texts: Raw message strings.
            labels: Binary labels (1 = spam, 0 = ham), same length as ``texts``.

        Returns:
            A DataFrame of per-combination results sorted by accuracy
            (descending); it is empty when no combination could be trained.
        """
        print("\n" + "="*60)
        print("MODEL TRAINING & EVALUATION")
        print("="*60)

        self.best_model = None
        self.best_vectorizer = None
        self.best_combo = None

        # Preprocess texts
        print("🔄 Preprocessing texts...")
        processed_texts = [self.preprocess_text(text) for text in texts]

        # Extract features
        print("🔄 Extracting features...")
        numerical_features = self.extract_features(texts)

        # Split data
        X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
            processed_texts, numerical_features, labels, test_size=0.2, random_state=42, stratify=labels
        )

        # Vectorizers to try
        vectorizers = {
            'TF-IDF': TfidfVectorizer(max_features=3000, ngram_range=(1, 2), stop_words='english'),
            'Count': CountVectorizer(max_features=3000, ngram_range=(1, 2), stop_words='english')
        }

        results = []
        best_score = 0

        for vec_name, vectorizer in vectorizers.items():
            print(f"\n🔄 Testing {vec_name} vectorizer...")

            # Vectorize text
            X_train_vec = vectorizer.fit_transform(X_train_text)
            X_test_vec = vectorizer.transform(X_test_text)

            # Combine with numerical features
            X_train_combined = np.hstack([X_train_vec.toarray(), X_train_num.values])
            X_test_combined = np.hstack([X_test_vec.toarray(), X_test_num.values])

            # Fresh estimators for every vectorizer: reusing one instance
            # would refit a model already stored as ``best_model`` on the
            # next vectorizer's features, leaving it out of sync with
            # ``best_vectorizer``.
            for model_name, model in self._build_models().items():
                try:
                    # Train model
                    model.fit(X_train_combined, y_train)

                    # Evaluate
                    y_pred = model.predict(X_test_combined)
                    y_proba = model.predict_proba(X_test_combined)[:, 1]

                    accuracy = accuracy_score(y_test, y_pred)
                    auc = roc_auc_score(y_test, y_proba)

                    # Cross validation
                    cv_scores = cross_val_score(model, X_train_combined, y_train, cv=5)

                    results.append({
                        'Vectorizer': vec_name,
                        'Model': model_name,
                        'Accuracy': accuracy,
                        'AUC': auc,
                        'CV_Mean': cv_scores.mean(),
                        'CV_Std': cv_scores.std()
                    })

                    # Save if best (the first success always qualifies).
                    if self.best_model is None or accuracy > best_score:
                        best_score = accuracy
                        self.best_model = model
                        self.best_vectorizer = vectorizer
                        self.best_combo = f"{vec_name} + {model_name}"

                    print(f"   {model_name:15} | Acc: {accuracy:.3f} | AUC: {auc:.3f}")

                except Exception as e:
                    print(f"   {model_name:15} | Error: {str(e)[:30]}...")

        result_columns = ['Vectorizer', 'Model', 'Accuracy', 'AUC', 'CV_Mean', 'CV_Std']
        if not results:
            # Nothing trained: report it instead of crashing in the summary.
            print("\n❌ No model could be trained. Please check your data.")
            return pd.DataFrame(columns=result_columns)

        # Results summary
        results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

        print(f"\n🏆 BEST MODEL: {self.best_combo}")
        print(f"📊 Best Accuracy: {best_score:.4f}")

        print("\n📋 All Results:")
        print(results_df.to_string(index=False, float_format='%.4f'))

        # Detailed evaluation of best model
        X_test_best = self.best_vectorizer.transform(X_test_text)
        X_test_best_combined = np.hstack([X_test_best.toarray(), X_test_num.values])
        y_pred_best = self.best_model.predict(X_test_best_combined)

        print("\n📊 DETAILED CLASSIFICATION REPORT:")
        # Explicit labels keep the two target names valid even if one class
        # is absent from the predictions and the test split.
        print(classification_report(y_test, y_pred_best,
                                    labels=[0, 1],
                                    target_names=['Ham', 'Spam']))

        return results_df

    def predict_email(self, email_text):
        """Classify a single message with the best trained model.

        Args:
            email_text: Raw message text.

        Returns:
            A dict with keys ``prediction`` (``'SPAM'`` or ``'HAM'``),
            ``confidence``, ``spam_probability`` and ``ham_probability``; or
            the string ``"❌ Model not trained yet!"`` when no model has been
            trained.
        """
        if self.best_model is None:
            return "❌ Model not trained yet!"

        # Preprocess
        processed = self.preprocess_text(email_text)
        numerical = self.extract_features([email_text])

        # Vectorize and combine (same layout as used during training)
        text_vec = self.best_vectorizer.transform([processed])
        combined = np.hstack([text_vec.toarray(), numerical.values])

        # Predict
        prediction = self.best_model.predict(combined)[0]
        probabilities = self.best_model.predict_proba(combined)[0]

        return {
            'prediction': 'SPAM' if prediction == 1 else 'HAM',
            'confidence': max(probabilities),
            'spam_probability': probabilities[1],
            'ham_probability': probabilities[0]
        }

    def interactive_testing(self):
        """Read messages from stdin and print a prediction for each.

        The loop ends on ``quit``/``exit``/``q``, on end of input, or when no
        trained model is available.
        """
        print("\n🧪 INTERACTIVE TESTING")
        print("="*60)
        print("Type emails to test (or 'quit' to exit)")

        while True:
            try:
                email = input("\n📧 Enter email text: ")
            except EOFError:
                # stdin closed (e.g. piped input ran out): stop cleanly.
                break

            stripped = email.strip()
            if stripped.lower() in ('quit', 'exit', 'q'):
                break
            if not stripped:
                continue

            result = self.predict_email(email)
            if isinstance(result, str):
                # predict_email signals "not trained" with a plain message.
                print(result)
                break

            print(f"🔍 Prediction: {result['prediction']}")
            print(f"📊 Confidence: {result['confidence']:.3f}")
            print(f"📈 Spam Probability: {result['spam_probability']:.3f}")

def main():
    """Run the interactive spam-detection workflow end to end.

    Steps: download the NLTK data, ask for a dataset path, analyse the
    dataset, ask for any column names that could not be auto-detected,
    prepare the data, train and compare the models, run the interactive
    tester and optionally pickle the best model. Returns early (with a
    message) whenever a step cannot proceed.
    """
    # Download NLTK data (best effort: the detector may still run without it)
    try:
        nltk.download('stopwords', quiet=True)
        nltk.download('punkt', quiet=True)
    except Exception as exc:
        print(f"⚠️  Could not download NLTK data: {exc}")

    print("🚀 SPAM DETECTION SYSTEM")
    print("="*60)

    # Get dataset path from user
    file_path = input("📁 Enter your dataset file path: ").strip()

    if not file_path:
        print("❌ No file path provided!")
        return

    # Analyze dataset. Guard against a bare None as well as a tuple whose
    # first element is None before unpacking.
    analysis = DatasetAnalyzer.analyze_dataset(file_path)
    if analysis is None:
        return
    df, text_col, label_col = analysis

    if df is None:
        return

    # Manual column selection if auto-detection failed
    if text_col is None or label_col is None:
        print("\n❓ Manual column selection needed:")
        print(f"Available columns: {list(df.columns)}")

        if text_col is None:
            text_col = input("Enter text column name: ").strip()
        if label_col is None:
            label_col = input("Enter label column name: ").strip()

    # Reject unknown column names here rather than failing later on lookup.
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        print(f"❌ Column(s) not found in dataset: {missing}")
        return

    # Initialize detector
    detector = SpamDetectorML()

    # Prepare data
    texts, labels = detector.prepare_data(df, text_col, label_col)

    if texts is None:
        return

    # Train models
    detector.train_and_evaluate(texts, labels)

    if detector.best_model is None:
        # Nothing to test or save when no model could be trained.
        print("❌ Training did not produce a usable model.")
        return

    # Interactive testing
    detector.interactive_testing()

    # Save model
    save_choice = input("\n💾 Save model? (y/n): ").strip().lower()
    if save_choice == 'y':
        model_name = input("Enter model filename (default: spam_model.pkl): ").strip()
        if not model_name:
            model_name = "spam_model.pkl"

        # Everything needed to reproduce predictions with the saved model.
        model_data = {
            'model': detector.best_model,
            'vectorizer': detector.best_vectorizer,
            'stemmer': detector.stemmer,
            'stop_words': detector.stop_words,
            'label_mapping': detector.label_mapping
        }

        try:
            with open(model_name, 'wb') as f:
                pickle.dump(model_data, f)
        except OSError as exc:
            print(f"❌ Could not save model to {model_name}: {exc}")
        else:
            print(f"✅ Model saved as {model_name}")

# Run the interactive workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()

# QUICK START GUIDE:
# 1. Save this code as 'spam_detector.py'
# 2. Install required packages: pip install numpy pandas scikit-learn nltk
# 3. Run: python spam_detector.py
# 4. Enter your dataset file path when prompted
#
# DATASET FORMAT:
# - CSV file with text and label columns
# - Labels should be 'spam'/'ham' or similar binary values
# - Text column should contain email content
#
# EXAMPLE USAGE:
# >>> python spam_detector.py
# >>> Enter dataset path: my_spam_data.csv
# >>> [System analyzes and trains automatically]
# >>> Enter email to test: "Free money! Click here now!"

🚀 SPAM DETECTION SYSTEM
📁 Enter your dataset file path: /content/emails.csv
✓ Dataset loaded successfully with utf-8 encoding

DATASET ANALYSIS
📊 Dataset Shape: (5172, 3002)
💾 Memory Usage: 118.75 MB

📋 Column Information:
----------------------------------------
Email No.       | object     | Nulls:    0 | Unique:   5172
the             | int64      | Nulls:    0 | Unique:     90
to              | int64      | Nulls:    0 | Unique:     72
ect             | int64      | Nulls:    0 | Unique:     90
and             | int64      | Nulls:    0 | Unique:     53
for             | int64      | Nulls:    0 | Unique:     42
of              | int64      | Nulls:    0 | Unique:     59
a               | int64      | Nulls:    0 | Unique:    372
you             | int64      | Nulls:    0 | Unique:     39
hou             | int64      | Nulls:    0 | Unique:     60
in              | int64      | Nulls:    0 | Unique:    133
on              | int64      | Nulls:    0 | Unique:    123
is              