In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download NLTK resources (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Enhanced text preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove whitespaces
    text = text.strip()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)

# Function to detect spam in a file
def detect_spam_in_file(file_path, model=None, vectorizer=None):
    """
    Detect spam messages in a file using a trained model
    
    Parameters:
    file_path (str): Path to the file containing messages
    model: Trained model for prediction (if None, will train one)
    vectorizer: Fitted vectorizer (if None, will create one)
    
    Returns:
    DataFrame: Original messages with spam predictions
    """
    # Read the file
    try:
        # Try different formats
        try:
            # Try CSV format with header
            df = pd.read_csv(file_path)
            if 'message' not in df.columns and 'text' not in df.columns:
                # If no obvious message column, use the first column
                df.columns = ['message'] + list(df.columns[1:])
        except:
            # Try tab-separated without header
            df = pd.read_csv(file_path, sep='\t', header=None)
            if len(df.columns) >= 2:
                # Assume first column is label, second is message
                df.columns = ['label', 'message'] + [f'col{i}' for i in range(2, len(df.columns))]
            else:
                # Assume only message column
                df.columns = ['message']
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    
    # Check if we have a message column
    if 'message' not in df.columns:
        print("No 'message' column found in the file")
        return None
    
    # Check if we need to train a model
    if model is None or vectorizer is None:
        print("Training a new model...")
        model, vectorizer = train_spam_detector()
    
    # Preprocess messages
    df['cleaned_message'] = df['message'].apply(preprocess_text)
    
    # Transform messages
    X = vectorizer.transform(df['cleaned_message'])
    
    # Make predictions
    predictions = model.predict(X)
    probabilities = model.predict_proba(X)
    
    # Add predictions to dataframe
    df['is_spam'] = predictions
    df['spam_probability'] = probabilities[:, 1]  # Probability of being spam
    df['prediction'] = df['is_spam'].apply(lambda x: 'spam' if x == 1 else 'ham')
    
    return df

# Function to train a spam detector
def train_spam_detector():
    """
    Train a spam detection model
    
    Returns:
    tuple: (trained_model, fitted_vectorizer)
    """
    # Load the training data
    features = ['label', 'message']
    try:
        sms = pd.read_csv('sms.txt', header=None, names=features, sep='\t')
    except FileNotFoundError:
        print("Training file 'sms.txt' not found. Please make sure it exists.")
        return None, None
    
    # Convert label to numerical variable
    sms['label_num'] = sms.label.map({'ham': 0, 'spam': 1})
    
    # Apply text preprocessing
    sms['cleaned_message'] = sms['message'].apply(preprocess_text)
    
    # Prepare features and target
    X = sms.cleaned_message
    y = sms.label_num
    
    # Create and fit vectorizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.5, min_df=2)
    X_vectorized = vectorizer.fit_transform(X)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Train model
    model = MultinomialNB()
    model.fit(X_train, y_train)
    
    # Evaluate model
    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Model trained with accuracy: {accuracy:.4f}")
    
    return model, vectorizer

# Function to save results
def save_results(df, output_file):
    """
    Save detection results to a file
    
    Parameters:
    df (DataFrame): Results dataframe
    output_file (str): Path to output file
    """
    # Select only relevant columns
    if 'label' in df.columns:
        result_df = df[['label', 'message', 'prediction', 'spam_probability']]
    else:
        result_df = df[['message', 'prediction', 'spam_probability']]
    
    # Save to CSV
    result_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    
    # Print summary
    spam_count = (result_df['prediction'] == 'spam').sum()
    total_count = len(result_df)
    print(f"Found {spam_count} spam messages out of {total_count} total messages")
    print(f"Spam rate: {spam_count/total_count*100:.2f}%")

# Function to display results
def display_results(df):
    """
    Display detection results
    
    Parameters:
    df (DataFrame): Results dataframe
    """
    # Print spam messages
    spam_df = df[df['prediction'] == 'spam']
    if len(spam_df) > 0:
        print("\n=== SPAM MESSAGES DETECTED ===")
        for i, row in spam_df.iterrows():
            print(f"\nMessage: {row['message']}")
            print(f"Spam probability: {row['spam_probability']:.4f}")
            if 'label' in row and pd.notna(row['label']):
                print(f"Actual label: {row['label']}")
    
    # Print summary
    spam_count = (df['prediction'] == 'spam').sum()
    total_count = len(df)
    print(f"\n=== SUMMARY ===")
    print(f"Total messages: {total_count}")
    print(f"Spam messages: {spam_count}")
    print(f"Ham messages: {total_count - spam_count}")
    print(f"Spam rate: {spam_count/total_count*100:.2f}%")

# Main execution
if __name__ == "__main__":
    # Train or load model
    model, vectorizer = train_spam_detector()
    
    if model is not None:
        # Ask user for file path
        file_path = input("Enter the path to your SMS file (or press Enter to use default 'sms.txt'): ").strip()
        if not file_path:
            file_path = "sms.txt"
        
        # Detect spam in file
        results = detect_spam_in_file(file_path, model, vectorizer)
        
        if results is not None:
            # Display results
            display_results(results)
            
            # Ask if user wants to save results
            save_option = input("\nDo you want to save the results to a file? (y/n): ").strip().lower()
            if save_option == 'y':
                output_file = input("Enter output file name (default: spam_results.csv): ").strip()
                if not output_file:
                    output_file = "spam_results.csv"
                save_results(results, output_file)

Model trained with accuracy: 0.9659

=== SPAM MESSAGES DETECTED ===

Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Spam probability: 0.9517
Actual label: spam

Message: WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Spam probability: 0.9880
Actual label: spam

Message: Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
Spam probability: 0.9037
Actual label: spam

Message: SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
Spam probability: 0.9688
Actual label: spam

Message: URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to 