In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pin.lyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pin.lyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pin.lyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### EDA

In [3]:
# load data

email = pd.read_csv("/Users/pin.lyu/Documents/BC_Folder/NPL/Data/emails.csv")

# make a copy of the original data for operation

email_data = email

# show data

email_data.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Check class distribution

print(email_data['spam'].value_counts())  

spam
0    4360
1    1368
Name: count, dtype: int64


In [5]:
# comment

print('This is a imbalanced dataset. Modeling results may be poor due to this reason')

This is a imbalanced dataset. Modeling results may be poor due to this reason


### Data Operation

In [7]:
# defining cleaning function 

def clean_text(text):
    
    # Remove special characters and numbers
    
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    
    text = text.lower()  # Convert to lowercase
    
    return text

In [8]:
#def clean_text(text):
    # Remove special characters
    #text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    #text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    #text = text.lower()  # Convert to lowercase

    # Tokenize the text
   # words = word_tokenize(text)

    # Remove stop words
    #stop_words = set(stopwords.words('english'))
    #words = [word for word in words if word not in stop_words]

    # Convert numbers to words
    #p = inflect.engine()
   # words = [p.number_to_words(word) if word.isdigit() else word for word in words]

    # Lemmatization
    #lemmatizer = WordNetLemmatizer()
    #words = [lemmatizer.lemmatize(word) for word in words]

    #return ' '.join(words)

In [9]:
# cleaning the 'text' column in all three datasets

email_data['text'] = email_data['text'].apply(clean_text)

In [10]:
# Initialize CountVectorizer with 2-grams

vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)  # Limit to top 5000 features

# Fit and transform the text data

# Features

X = vectorizer.fit_transform(email_data['text']).toarray() 

# Labels

y = email_data['spam']  

In [11]:
# Split into training + test (80%) and validation (20%)

X_train_test, X_val, y_train_test, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Split training + test into training (80%) and test (20%)

X_train, X_test, y_train, y_test = train_test_split(X_train_test, y_train_test, test_size=0.2, random_state=42)

### Model Building

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),  
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42)
}

In [15]:
results = {}

for name, model in models.items():
    
    print(f"Training {name}...")

    # Train the model
    
    model.fit(X_train, y_train) 

    # Predict on the test set
    
    y_pred = model.predict(X_test)  
    
    # Evaluate the model
    
    accuracy = accuracy_score(y_test, y_pred)
    
    report = classification_report(y_test, y_pred)
    
    confusion = confusion_matrix(y_test, y_pred)
    
    # Store results
    
    results[name] = {
        
        "accuracy": accuracy,
        
        "classification_report": report,
        
        "confusion_matrix": confusion
    }
    
    # Print results
    
    print(f"Results for {name}:")
    
    print(f"Accuracy: {accuracy:.4f}")
    
    print("Classification Report:")
    
    print(report)
    
    print("Confusion Matrix:")
    
    print(confusion)
    
    print("-" * 50)

Training Naive Bayes...
Results for Naive Bayes:
Accuracy: 0.9553
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       707
           1       0.90      0.91      0.90       210

    accuracy                           0.96       917
   macro avg       0.93      0.94      0.94       917
weighted avg       0.96      0.96      0.96       917

Confusion Matrix:
[[685  22]
 [ 19 191]]
--------------------------------------------------
Training Logistic Regression...
Results for Logistic Regression:
Accuracy: 0.9607
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       707
           1       0.96      0.87      0.91       210

    accuracy                           0.96       917
   macro avg       0.96      0.93      0.94       917
weighted avg       0.96      0.96      0.96       917

Confusion Matrix:
[[699   8]
 [ 28 182]]
-------------------

Based on the modeling results, logistic regression outperforms the other models for three key reasons:

1) Highest Accuracy: It achieves an accuracy score of 0.9607, the best among all models.

2) Best F1 Scores: It has the highest F1 scores for both classes (0 and 1).

3) Strong Precision and Recall:

- For spam detection (class 1), it has a precision of 0.96, meaning 96% of emails labeled as spam are truly spam.

- For non-spam emails (class 0), it has a recall of 0.99, meaning 99% of non-spam emails are correctly identified as non-spam.

This combination of high precision for spam detection and high recall for non-spam emails ensures the model effectively blocks spam while allowing legitimate emails to pass through, achieving its goal.