Email Spam detection

In [554]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay




In [555]:
#Checking versions
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("nltk version:", nltk.__version__)


pandas version: 2.2.3
numpy version: 2.2.1
nltk version: 3.8.1


Data loading and cleaning

In [556]:
df=pd.read_csv("spam_data.csv")
df.rename(columns={"label": "target"}, inplace=True)
df['target'] = df['target'].replace({'ham': 0, 'spam': 1})



  df['target'] = df['target'].replace({'ham': 0, 'spam': 1})


Deep cleaning data

In [557]:

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Cleaning function
def clean_text(text):
    text = text.lower()                              # 1. Lowercase
    text = re.sub(r'[^\w\s]', '', text)              # 2. Remove punctuation
    words = text.split()                             # 3. Split into words
    words = [word for word in words if word not in stop_words]  # 4. Remove stopwords
    return ' '.join(words)                           # 5. Re-join to sentence
 

# Apply cleaning to text column
df['text'] = df['text'].apply(clean_text)

# Preview cleaned data
df.sample(5)    


Unnamed: 0,target,text
4352,0,night ended another day morning come special w...
2332,0,home way
3655,0,come people
1479,0,think far find check google maps place dorm
4681,0,thats cool hell night lemme know youre around


Model use ==> Naive Bayes classifier

In [558]:
#Features extraction
X = df['text']            # features (messages)
y = df['target']          # labels (0 = ham, 1 = spam)

In [559]:
from sklearn.feature_extraction.text import TfidfVectorizer

Vectorizer = TfidfVectorizer()
X_vec = Vectorizer.fit_transform(X)


In [560]:
#Splitting the dataset

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)


In [561]:

#Model training and fitting 

model = MultinomialNB(alpha=0.5)
model.fit(X_train, y_train)


### Checking model accuracy

In [562]:

y_pred = model.predict(X_test)
print("Accuracy of the model:", accuracy_score(y_test, y_pred))

#confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

Accuracy of the model: 0.9802690582959641
Confusion Matrix:
[[953   1]
 [ 21 140]]


In [563]:
# str= input("Enter any text")  # raw text
text = [" You've $1000 cash prize."]  # raw text  
text_vec = Vectorizer.transform(text)                # convert to numbers
prediction = model.predict(text_vec)             # now predict
if (prediction==0):
    print("It's not a spam")                     # → [0] means ham
else:
    print("It's likely a spam")                  #  → [1] means spam

It's likely a spam


### Saving the model for web deployment

To deploy the model on a website, we need to save both the trained model and the vectorizer. The vectorizer is crucial as it transforms raw text into the same feature space used during training.

In [564]:
import pickle
import os

# Create a directory for the model files if it doesn't exist
if not os.path.exists('model'):
    os.makedirs('model')

# Save the trained model
with open('model/spam_classifier.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer (this is important as we need it to transform new text)
with open('model/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(Vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully to the 'model' directory!")

Model and vectorizer saved successfully to the 'model' directory!
