In [28]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords', quiet=True)


True

In [None]:

class SpamClassifier:
    """Spam/ham e-mail classifier: TF-IDF features fed into Multinomial Naive Bayes.

    Typical use:
        1. ``load_and_prep_data(path)`` reads the CSV and builds the
           ``full_text`` / ``label_num`` columns.
        2. ``train_evaluate(df)`` cleans the text, fits the pipeline and prints
           evaluation metrics.

    The fitted pipeline is trained on text that has already gone through
    ``preprocess_text``, so new messages must be passed through
    ``preprocess_text`` before calling ``predict`` on it.
    """

    # Columns the input CSV must provide.
    REQUIRED_COLUMNS = ('Subject', 'Message', 'Spam/Ham')
    # Normalised (stripped, lower-cased) text label -> numeric target.
    LABEL_MAP = {'ham': 0, 'spam': 1}

    def __init__(self):
        """Create the stemmer and load the English stop-word set from NLTK."""
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def load_and_prep_data(self, filepath):
        """Read the e-mail CSV and derive the modelling columns.

        Adds two columns to the loaded frame:
            * ``full_text``  - ``Subject`` + " " + ``Message`` (missing parts
              are treated as empty strings).
            * ``label_num``  - 0 for ham, 1 for spam.

        Labels are matched case-insensitively and ignoring surrounding
        whitespace. Rows whose label is missing or not recognised are dropped
        (the number and values of unrecognised labels are printed), so
        ``label_num`` never contains NaN.

        Args:
            filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV.

        Returns:
            The prepared DataFrame, or ``None`` if the file could not be read,
            a required column is missing, or no labelled rows remain. The
            reason is printed in that case.
        """
        try:
            # attempt to read the file
            df = pd.read_csv(filepath)

            # Fail early with a readable message instead of a bare KeyError.
            missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
            if missing:
                raise ValueError(f"missing required column(s): {missing}")

            # 1. cleaning (drop rows where Spam/Ham is NaN)
            df = df.dropna(subset=['Spam/Ham'])

            # 2. combine subject and message for better context.
            # NaN becomes '' and everything is cast to str so that a column
            # pandas parsed as numeric cannot break the concatenation.
            df['Subject'] = df['Subject'].fillna('').astype(str)
            df['Message'] = df['Message'].fillna('').astype(str)
            df['full_text'] = df['Subject'] + " " + df['Message']

            # 3. convert labels to binary: spam=1, ham=0.
            # Normalise first so 'Spam' / ' ham ' are not silently mapped to NaN.
            labels = df['Spam/Ham'].astype(str).str.strip().str.lower()
            df['label_num'] = labels.map(self.LABEL_MAP)

            unknown = df['label_num'].isna()
            if unknown.any():
                print(
                    f"Dropping {int(unknown.sum())} row(s) with unrecognised labels: "
                    f"{sorted(labels[unknown].unique())}"
                )
                df = df.loc[~unknown].copy()

            if df.empty:
                raise ValueError("no rows with a recognised Spam/Ham label")

            # map() yields floats whenever any value was unmapped; restore ints.
            df['label_num'] = df['label_num'].astype('int64')

            print(f"Data Loaded Successfully. Shape: {df.shape}")
            print(f"Spam/Ham distribution:\n{df['label_num'].value_counts()}")

            return df
        except Exception as e:
            # Deliberately best-effort: callers test the result for None.
            print(f"Error loading data: {e}")
            return None

    def preprocess_text(self, text):
        """
        Cleans text: removes non-alphabetic chars, lowers case, removes
        stopwords and stems the remaining words.

        ``None`` and float NaN are treated as empty text and yield ``''``
        (rather than the literal tokens "none" / "nan").

        Returns:
            The cleaned words joined by single spaces.
        """
        # Missing values: NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        # 1. replace non-letters with spaces and lowercase
        text = re.sub('[^a-zA-Z]', ' ', str(text)).lower()

        # 2. tokenize on whitespace
        words = text.split()

        # 3. remove stopwords and stem what is left
        cleaned_words = [self.stemmer.stem(word) for word in words if word not in self.stop_words]

        return ' '.join(cleaned_words)

    def train_evaluate(self, df, test_size=0.2, random_state=42):
        """Fit a TF-IDF -> MultinomialNB pipeline and print evaluation metrics.

        Side effect: adds (or overwrites) a ``clean_message`` column on ``df``
        holding the preprocessed ``full_text``.

        Args:
            df: Frame with ``full_text`` and ``label_num`` columns, as produced
                by ``load_and_prep_data``.
            test_size: Passed to ``train_test_split``; defaults to 0.2.
            random_state: Seed passed to ``train_test_split``; defaults to 42.

        Returns:
            The fitted ``Pipeline``. It expects input that has already been
            passed through ``preprocess_text``.

        Raises:
            ValueError: If ``label_num`` contains missing values.
        """
        if df['label_num'].isna().any():
            raise ValueError("label_num contains missing values; clean the labels before training")

        # Apply preprocessing
        print("Preprocessing text data (this may take a moment)...")
        df['clean_message'] = df['full_text'].apply(self.preprocess_text)

        # Split Data
        X = df['clean_message']
        y = df['label_num']

        # Stratify ensures train/test split has same spam proportion as original
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # Build Pipeline: TF-IDF -> Naive Bayes
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('classifier', MultinomialNB())
        ])

        # Train
        print("Training model...")
        pipeline.fit(X_train, y_train)

        # Predict on the held-out split
        y_pred = pipeline.predict(X_test)

        # Evaluation
        print("\n--- Evaluation Results ---")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        return pipeline


In [None]:

# Driver cell: load the CSV, train/evaluate, then classify one hand-written message.
# Must be executed AFTER the cell that defines SpamClassifier.
if __name__ == "__main__":
    classifier = SpamClassifier()
    df = classifier.load_and_prep_data('es_data.csv')

    # load_and_prep_data returns None after printing the error when loading
    # fails, so training only happens when a frame actually came back.
    if df is not None:
        model = classifier.train_evaluate(df)

        # Smoke test with a made-up e-mail.
        test_msg = "Congratulations! You've won a $1000 gift card. Click here to claim."
        print("\n--- Testing Custom Message ---")

        # The pipeline was fitted on preprocessed text, so the same cleaning
        # is applied to the message before it is handed to predict().
        processed_test = classifier.preprocess_text(test_msg)
        prediction = model.predict([processed_test])

        print(f"Message: '{test_msg}'")
        print(f"Prediction: {'HAM' if prediction[0] != 1 else 'SPAM'}")

Data Loaded Successfully. Shape: (33716, 7)
Spam/Ham distribution:
label_num
1    17171
0    16545
Name: count, dtype: int64
Preprocessing text data (this may take a moment)...
Training model...

--- Evaluation Results ---
Accuracy: 0.9812

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3309
           1       0.96      1.00      0.98      3435

    accuracy                           0.98      6744
   macro avg       0.98      0.98      0.98      6744
weighted avg       0.98      0.98      0.98      6744

Confusion Matrix:
[[3182  127]
 [   0 3435]]

--- Testing Custom Message ---
Message: 'Congratulations! You've won a $1000 gift card. Click here to claim.'
Prediction: HAM
