# Package Installation

In [1]:
#pip install string

# Importing of Packages

In [2]:
import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

import joblib

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VARSHITH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading and Knowing data

In [4]:
df = pd.read_csv('spam_ham_dataset.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
df.shape

(5171, 4)

In [7]:
df['text'] = df['text'].apply(lambda x:x.replace('\r\n' , '_'))

In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


# Data Preprocessing and Cleaning

In [24]:
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))
corpus = []
for text in df['text']:
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords_set]
    corpus.append(' '.join(text))

In [23]:
corpus[4]

'subjectindianspringsthidealbooktecopvrrevenuunderstandtecojustsenduscheckreceivanswerwhetheraprederminpriceassocidealtecoletusknowwhatwgivecontinuchasedealneed'

# Text Vectorizer and Spliting of Data

In [25]:
vectorizer = CountVectorizer(min_df=5, max_df=0.7)

X = vectorizer.fit_transform(corpus).toarray()
Y = df['label_num']

X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size = 0.2 , random_state = 42)



# Model Training

In [26]:
model = MultinomialNB()
model.fit(X_train, Y_train)

In [None]:
Y_pred = model.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


In [27]:
model.score(X_test , Y_test)

0.9565217391304348

# Saving model into .joblib file

In [None]:
joblib.dump(model , 'Spam_classifier.joblib')
print("Model is succesfully saved")