In [1]:
# --- 1. Imports ---
import pandas as pd
import numpy as np
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

# Import NLTK components from your cleaning file (optional)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# --- 2. Load the cleaned dataset ---
print("üìÇ Loading cleaned dataset...")
df = pd.read_csv("cleaned_spam_dataset.csv")
print(f"‚úÖ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print("Label distribution:\n", df['Label'].value_counts())

üìÇ Loading cleaned dataset...
‚úÖ Dataset loaded: 28397 rows, 3 columns
Label distribution:
 Label
0    19821
1     8576
Name: count, dtype: int64


In [3]:
# --- 3. Split data ---
X = df['Body']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"üß© Training size: {X_train.shape[0]} | Testing size: {X_test.shape[0]}")


üß© Training size: 22717 | Testing size: 5680


In [4]:
# --- 4. TF-IDF Vectorization ---
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

In [5]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)
print("‚úÖ Model training complete!")

‚úÖ Model training complete!


In [7]:
y_pred = model.predict(X_test_tfidf)
print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred, digits=4))
print("\nüìà Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n‚≠ê F1 Macro Score:", round(f1_score(y_test, y_pred, average='macro'), 4))



üìä Classification Report:
              precision    recall  f1-score   support

           0     0.9627    0.9841    0.9733      3965
           1     0.9613    0.9120    0.9360      1715

    accuracy                         0.9623      5680
   macro avg     0.9620    0.9480    0.9546      5680
weighted avg     0.9623    0.9623    0.9620      5680


üìà Confusion Matrix:
[[3902   63]
 [ 151 1564]]

‚≠ê F1 Macro Score: 0.9546


In [None]:
# --- 7. Save Model and Vectorizer ---
joblib.dump(model, "spam_classifier_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("üíæ Model and TF-IDF vectorizer saved successfully!")

ValueError: Non valid compression method given: "wb". Possible values are {'zlib': <joblib.compressor.ZlibCompressorWrapper object at 0x000001C9F8DC4AD0>, 'gzip': <joblib.compressor.GzipCompressorWrapper object at 0x000001C9F8161250>, 'bz2': <joblib.compressor.BZ2CompressorWrapper object at 0x000001C9F88E5190>, 'lzma': <joblib.compressor.LZMACompressorWrapper object at 0x000001C9F8E1DC10>, 'xz': <joblib.compressor.XZCompressorWrapper object at 0x000001C9F85A57F0>, 'lz4': <joblib.compressor.LZ4CompressorWrapper object at 0x000001C9F8675010>}.

In [None]:
# --- 8. Utility: Clean Text Function (same as cleaning script) ---
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(words)


In [None]:
# --- 9. Inference Function ---
def predict_email(text):
    cleaned = clean_text(text)
    features = tfidf.transform([cleaned])
    prob = model.predict_proba(features)[0]   # probabilities for both classes
    pred = np.argmax(prob)
    confidence = round(prob[pred] * 100, 2)
    label = "Spam" if pred == 1 else "Ham"
    return label, confidence

In [None]:
# --- 10. Quick Test Prediction ---
label, conf = predict_email("Win a brand new iPhone!")
print(f"Prediction: {label} ({conf}% confidence)")


Prediction: Spam (79.78% confidence)


In [None]:
# --- 11. Full Pipeline Option (TF-IDF + Model together) ---
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

In [None]:
pipeline.fit(X_train, y_train)
pipeline_score = pipeline.score(X_test, y_test)
print(f"\nüì¶ Combined Pipeline Accuracy: {pipeline_score:.4f}")


üì¶ Combined Pipeline Accuracy: 0.9620


In [None]:
joblib.dump(pipeline, "models/spam_pipeline.pkl")
print("‚úÖ Combined pipeline saved as spam_pipeline.pkl")
print("\nüéØ Training complete ‚Äî ready for deployment!")

‚úÖ Combined pipeline saved as spam_pipeline.pkl

üéØ Training complete ‚Äî ready for deployment!
