step#01 import libraries

In [22]:
# import libraries for visulaization and evaluation
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,  classification_report

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\AL REHMAN
[nltk_data]     LAPTOPS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\AL REHMAN
[nltk_data]     LAPTOPS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

step#02 Load data

In [12]:
df = pd.read_csv('spam.csv', encoding='latin1')
print("data is loaded")
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

data is loaded


In [13]:
df.sample

<bound method NDFrame.sample of       label  ... Unnamed: 4
0         0  ...        NaN
1         0  ...        NaN
2         1  ...        NaN
3         0  ...        NaN
4         0  ...        NaN
...     ...  ...        ...
5567      1  ...        NaN
5568      0  ...        NaN
5569      0  ...        NaN
5570      0  ...        NaN
5571      0  ...        NaN

[5572 rows x 5 columns]>

In [14]:
df.shape

(5572, 5)

step: 02 data cleaning

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   int64 
 1   text        5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: int64(1), object(4)
memory usage: 217.8+ KB


In [16]:
df.columns

Index(['label', 'text', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [17]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df.sample(5)

Unnamed: 0,label,text
2288,0,Dont you have message offer
4163,0,"Its ok, called mom instead have fun"
2162,0,1) Go to write msg 2) Put on Dictionary mode 3...
3931,0,Sos! Any amount i can get pls.
5387,0,I will be gentle baby! Soon you will be taking...


step:3 Preprocessing function

In [27]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = str(text).lower()  # convert to string and lowercase
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)
# apply preprocessing
df['clean_text'] = df['text'].astype(str).apply(preprocess)

step:4 training\test split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)

In [33]:
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [34]:
# TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

step:5 Models

In [29]:
models = {
    "MultinomialNB": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

step:6 training and evaluation

In [35]:
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"\n{name} Accuracy: {acc * 100:.2f}%")
    print(classification_report(y_test, preds))


MultinomialNB Accuracy: 96.68%
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


LogisticRegression Accuracy: 94.89%
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.65      0.77       150

    accuracy                           0.95      1115
   macro avg       0.95      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115


RandomForest Accuracy: 97.49%
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99     

step:7 save model

In [36]:
joblib.dump(models["LogisticRegression"], "spam_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']