In [2]:
import os
import re

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Read the raw e-mail corpus into a DataFrame and preview its first rows.
raw_csv_path = '../data/Spam Email raw text for NLP.csv'
df = pd.read_csv(raw_csv_path)
print(df.head())

   CATEGORY                                            MESSAGE  \
0         1  Dear Homeowner,\n\n \n\nInterest Rates are at ...   
1         1  ATTENTION: This is a MUST for ALL Computer Use...   
2         1  This is a multi-part message in MIME format.\n...   
3         1  IMPORTANT INFORMATION:\n\n\n\nThe new domain n...   
4         1  This is the bottom line.  If you can GIVE AWAY...   

                                FILE_NAME  
0  00249.5f45607c1bffe89f60ba1ec9f878039a  
1  00373.ebe8670ac56b04125c25100a36ab0510  
2  00214.1367039e50dc6b7adb0f2aa8aba83216  
3  00210.050ffd105bd4e006771ee63cabc59978  
4  00033.9babb58d9298daa2963d4f514193d7d6  


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a trailing "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CATEGORY   5796 non-null   int64 
 1   MESSAGE    5796 non-null   object
 2   FILE_NAME  5796 non-null   object
dtypes: int64(1), object(2)
memory usage: 136.0+ KB
None


In [5]:
# Count missing values per column. The header and the Series are joined with a
# newline separator so the first row is not indented by print()'s default
# single-space separator.
print("Missing values per column:", df.isnull().sum(), sep="\n")

Missing values per column:
 CATEGORY     0
MESSAGE      0
FILE_NAME    0
dtype: int64


In [6]:
# Keep only the label and message columns (FILE_NAME is dropped).
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[['CATEGORY', 'MESSAGE']].copy()

In [7]:
# Rename by explicit mapping instead of assigning `df.columns` positionally:
# the mapping does not depend on column order, and re-running the cell is a
# no-op even after further columns have been added to the frame (a positional
# assignment would raise a length-mismatch error in that case).
df = df.rename(columns={'CATEGORY': 'label', 'MESSAGE': 'text'})

In [8]:
# Substitution passes applied in order by clean_text(); every match is replaced
# by a single space.
_CLEANING_PATTERNS = (
    re.compile(r'\S+@\S+'),   # e-mail addresses
    re.compile(r'http\S+'),   # URLs starting with http / https
    # Everything except a-z and whitespace: punctuation, symbols and digits.
    # A separate digit-only pass is unnecessary because digits are covered here
    # and whitespace runs are collapsed afterwards.
    re.compile(r'[^a-z\s]'),
)
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw message into lowercase, space-separated a-z tokens.

    Steps: lowercase, blank out e-mail addresses and http(s) URLs, blank out
    every character that is not a-z or whitespace, then collapse whitespace
    runs to one space and strip both ends.

    Parameters
    ----------
    text : str or None
        Raw message. Missing values (None, NaN, pd.NA) are treated as an empty
        message; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned message; ``''`` if nothing remains.
    """
    if not isinstance(text, str):
        # Avoid AttributeError on missing values coming from a DataFrame column.
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    # Order matters: addresses and URLs must go before punctuation is stripped,
    # otherwise '@', ':' and '/' would already be gone and the fragments kept.
    for pattern in _CLEANING_PATTERNS:
        text = pattern.sub(' ', text)
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Run every raw message through clean_text and store the result in a new column.
df['clean_text'] = df['text'].map(clean_text)

In [9]:
# Spot-check the cleaning step on the first message of the frame.
first_cleaned_message = df['clean_text'].iloc[0]
print("Sample cleaned message: ", first_cleaned_message, sep="\n")

Sample cleaned message: 
dear homeowner interest rates are at their lowest point in years we help you find the best rate for your situation by matching your needs with hundreds of lenders home improvement refinance second mortgage home equity loans and more even with less than perfect credit this service is free to home owners and new home buyers without any obligation just fill out a quick simple form and jump start your future plans today visit to unsubscribe please visit


In [11]:
# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
messages = df['clean_text']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [12]:
# TF-IDF features over unigrams and bigrams. English stop words are dropped, as
# are terms found in more than 95% of the training documents or in fewer than 3.
tfidf_options = {
    'stop_words': 'english',
    'max_df': 0.95,
    'min_df': 3,
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training split only; the
# test split is merely projected onto them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

vocabulary_size = len(vectorizer.get_feature_names_out())
print("Vocabulary size: ", vocabulary_size)

Vocabulary size:  61564


In [13]:
# Training and hyperparameter tuning.
# Grid over the inverse regularization strength C and the iteration cap, with
# the liblinear solver fixed.
# NOTE(review): X_train_tfidf was produced by a vectorizer fitted on the whole
# training split, so every validation fold has already influenced the
# vocabulary and IDF weights. Consider tuning a Pipeline(vectorizer, model)
# instead so the vectorizer is refit inside each fold.
param_grid = {
    'C': [0.1, 1, 5, 10],
    'max_iter': [200, 300],
    'solver': ['liblinear']
}

# class_weight='balanced' compensates for the unequal class sizes.
# random_state pins the data shuffling done by the liblinear solver so that
# repeated runs of the notebook produce the same fitted model.
lr = LogisticRegression(class_weight='balanced', random_state=42)

# 5-fold cross-validated search; candidates are ranked by F1 score.
grid = GridSearchCV(lr, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train_tfidf, y_train)

print("Best param found: ", grid.best_params_)
# Best parameter combination, refit on the full training split by GridSearchCV.
best_model = grid.best_estimator_

Best param found:  {'C': 10, 'max_iter': 200, 'solver': 'liblinear'}


In [None]:
# Evaluate the tuned model on the held-out test split.
y_pred = best_model.predict(X_test_tfidf)

print("Accuracy: ", accuracy_score(y_test, y_pred))

# The report and the matrix are multi-line blocks. Printing them on their own
# lines keeps the report's header row and the matrix's first row aligned with
# the rows below, instead of being shifted right by the label text.
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy:  0.9896551724137931

Classification report:                precision    recall  f1-score   support

           0       0.99      1.00      0.99       781
           1       0.99      0.97      0.98       379

    accuracy                           0.99      1160
   macro avg       0.99      0.99      0.99      1160
weighted avg       0.99      0.99      0.99      1160


Confusion matrix:  [[779   2]
 [ 10 369]]


In [15]:
# Persist the tuned classifier together with the fitted vectorizer.
# The target directory is created first so the cell also works on a fresh
# checkout where ./models does not exist yet (joblib.dump does not create it).
os.makedirs("./models", exist_ok=True)

# The vectorizer was fitted on text already processed by clean_text, so new
# messages must go through the same cleaning before being transformed.
joblib.dump(best_model, "./models/logistic_spam_model.pkl")
joblib.dump(vectorizer, "./models/tfidf_vectorizer.pkl")

print("Model + Vectorizer Saved Successfully!")

Model + Vectorizer Saved Successfully!
