In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
import re
import os

# 1.Data Preprocessing

In [16]:
# 1. Load dataset
df = pd.read_csv('data/processed_data.csv', encoding='latin1')

# 2. Combine subject and message, then clean text data
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [17]:
# Check label distribution before training
print('Label distribution:')
print(df['label'].value_counts())


Label distribution:
label
1    50199
0    25220
Name: count, dtype: int64


In [18]:
# Combine subject and message columns
df['combined_text'] = (df['subject'].fillna('') + ' ' + df['message'].fillna('')).apply(clean_text)


# 3. Feature extraction (TF-IDF)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['combined_text'])

# 4. Target variable
y = df['label']


In [19]:
# 4. Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# 2. Model Training

In [20]:
# 3. Feature extraction (TF-IDF) with fewer features
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['combined_text'])

# 4. Target variable
y = df['label']

# 4. Split dataset (stratified)
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Train RandomForestClassifier with class_weight
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# 6. Cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
print('Cross-validation F1 scores:', cv_scores)
print('Mean CV F1:', cv_scores.mean())

# 7. Evaluate model with default threshold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
probs = model.predict_proba(X_test)[:, 1]
threshold = 0.5
preds = (probs >= threshold).astype(int)
print(f'Using threshold: {threshold}')
print('Accuracy:', accuracy_score(y_test, preds))
print('Precision:', precision_score(y_test, preds))
print('Recall:', recall_score(y_test, preds))
print('F1 Score:', f1_score(y_test, preds))
print('\nClassification Report:\n', classification_report(y_test, preds))
print('Confusion Matrix:\n', confusion_matrix(y_test, preds))

# 8. Try a lower threshold for imbalanced data
threshold = 0.3
preds = (probs >= threshold).astype(int)
print(f'\nUsing threshold: {threshold}')
print('Accuracy:', accuracy_score(y_test, preds))
print('Precision:', precision_score(y_test, preds))
print('Recall:', recall_score(y_test, preds))
print('F1 Score:', f1_score(y_test, preds))
print('\nClassification Report:\n', classification_report(y_test, preds))
print('Confusion Matrix:\n', confusion_matrix(y_test, preds))

Cross-validation F1 scores: [0.99583618 0.99577167 0.99527128 0.99552294 0.99552239]
Mean CV F1: 0.9955848928234436
Using threshold: 0.5
Accuracy: 0.9939671174754707
Precision: 0.994532259667959
Recall: 0.996414342629482
F1 Score: 0.9954724115627643

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      5044
           1       0.99      1.00      1.00     10040

    accuracy                           0.99     15084
   macro avg       0.99      0.99      0.99     15084
weighted avg       0.99      0.99      0.99     15084

Confusion Matrix:
 [[ 4989    55]
 [   36 10004]]

Using threshold: 0.3
Accuracy: 0.9909175285070273
Precision: 0.9875923190546528
Recall: 0.9989043824701195
F1 Score: 0.9932161426095568

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      5044
           1       0.99      1.00      0.99     10040

    accuracy          

In [21]:
# 8. Save model and vectorizer
import os
import joblib
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/spam_classifier.joblib')
joblib.dump(vectorizer, 'model/tfidf_vectorizer.joblib')
print('Model and vectorizer saved in /model directory.')

Model and vectorizer saved in /model directory.
