To build an email spam detection system, we will perform the following steps:

1. Load and Explore the Dataset
2. Text Preprocessing
3. Feature Extraction
4. Training a Classification Model
5. Evaluating the Model
6. Predicting New Emails

Step 1:  Load and Explore the Dataset

In [1]:
import pandas as pd

In [2]:
#  Load the data
data = "spam.csv"
df = pd.read_csv(data, encoding='latin-1')

In [3]:
#  Display the first few rows of the dataset
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
print(df.dtypes)

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object


Step 2: Text Perprocessing

In [5]:
#  Drop unnecessary columns and remname rhe relevant 
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
df = df.rename(columns={'v1':"label",'v2':"text"})

In [6]:
#  convert labels to binary (1 for spam, 0 for ham)
df['label'] =df['label'].map({'spam':1,'ham':0})

In [7]:
#  Display the processed data
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
print(df['label'].unique())

[0 1]


In [9]:
# Check for any non-string values in the text column
print(df['text'].apply(lambda x: isinstance(x, str)).all())

True


In [10]:
#  Text preprocessing
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
#  Initilize stop word and stemmer
stop_word = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [12]:
def preprocess_text(text):
    # Ensure the text is a string
    if not isinstance(text, str):
        text = str(text)
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stop words and perform stemming
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_word)
    return text

In [13]:
# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess_text)

In [14]:
# Check the first few rows of the preprocessed text
df.head()

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though


Step 3: Feature Extraction

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern=r'(?u)\b\w\w+\b',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,
    dtype='float64',
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False
)

# Transform the text data
X = vectorizer.fit_transform(df['text'])
y = df['label']




Step 4: Training a Classification Model

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)



Step 5: Evaluating the Model

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9623318385650225
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.85      0.88      0.86       150

    accuracy                           0.96      1115
   macro avg       0.91      0.93      0.92      1115
weighted avg       0.96      0.96      0.96      1115

Confusion Matrix:
[[941  24]
 [ 18 132]]


Step 6: Predicting New Emails

In [18]:
# Function to predict a new email
def predict_email(email):
    # Preprocess the email
    email = preprocess_text(email)
    # Transform the email using the vectorizer
    email_vectorized = vectorizer.transform([email])
    # Predict using the trained model
    prediction = model.predict(email_vectorized)
    return "Spam" if prediction[0] == 1 else "Ham"

In [19]:
# Test the prediction function
new_email = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
print(predict_email(new_email))

Spam


In [20]:
import pickle

In [21]:
# save the model and vectorizer to disk
with open('email_spam_model.pkl','wb') as model_file:
    pickle.dump((vectorizer, model), model_file)
print("Model and vectorizer have been saved  to email_spam_model.pkl")    

Model and vectorizer have been saved  to email_spam_model.pkl


Here's the code to load the pickle object and make predictions on new emails:

In [22]:
# Load the model and vectorizer from the pickle file
with open('email_spam_model.pkl', 'rb') as model_file:
    vectorizer, model = pickle.load(model_file)

In [23]:
def preprocess_text(text):
    import string
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

In [24]:
def predict_email(email):
    # Preprocess the email
    email = preprocess_text(email)
    # Transform the email using the vectorizer
    email_vectorized = vectorizer.transform([email])
    # Predict using the trained model
    prediction = model.predict(email_vectorized)
    return "Spam" if prediction[0] == 1 else "Ham"

In [25]:
# Test the prediction function
new_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print(predict_email(new_email))


Spam


In [26]:
new_email_ham = "Hey, can we reschedule our meeting to tomorrow afternoon?"
print(predict_email(new_email_ham))

Ham
