<a href="https://colab.research.google.com/github/Sayed-Ali-Raza-Naqvi/DEP_Spam_Classification_Project/blob/main/DEP_Spam_Classification_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Fetch the NLTK stop-word corpus; the preprocessing cell further down reads the
# English list from it via stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Location of the raw e-mail dataset (path as laid out in the Colab runtime).
DATA_PATH = "/content/spam_ham_dataset.csv"

# Load the whole CSV into a DataFrame; later cells clean it step by step.
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at five randomly chosen rows to see the raw column layout.
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
216,2186,ham,Subject: re : tenaska iv 10 / 00\r\ni don ' t ...,0
1381,1157,ham,"Subject: enron actuals for july 7 thru 9 , 200...",0
3327,4988,spam,Subject: people laugh at you ? nol\r\nthe only...,1
2590,1736,ham,Subject: enron / hpl actuals for september 29 ...,0
3231,339,ham,Subject: cdnow shipment confirmation\r\ndear d...,0


In [None]:
# (rows, columns) of the frame at the moment this cell was executed.
# NOTE(review): the recorded output reports 3 columns, yet the preview above still
# lists "Unnamed: 0" next to label/text/label_num and that column is only dropped in
# the following cell. This cell appears to have been run out of order; confirm with
# Restart Kernel -> Run All.
df.shape

(5171, 3)

In [None]:
# Discard the "Unnamed: 0" column (presumably a row index saved along with the CSV);
# it is not used by any later cell.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Spot-check three random rows to confirm the column drop took effect.
df.sample(n=3)

Unnamed: 0,label,text,label_num
2610,ham,Subject: january setup - - mops\r\nspecificall...,0
4642,ham,Subject: cleburne / tenaska iv outage\r\n- - -...,0
4603,ham,"Subject: hpl nom for dec . 16 - 18 , 2000\r\ns...",0


In [None]:
# Flatten "\r\n" line breaks into single spaces so every message is one line of text.
# The vectorised .str accessor replaces the per-row lambda and passes missing values
# through instead of raising; regex=False treats the pattern as a literal string,
# matching the behaviour of str.replace.
df["text"] = df["text"].str.replace("\r\n", " ", regex=False)

In [None]:
# Summarise column dtypes and non-null counts to check for missing values
# before the text is tokenised.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5171 non-null   object
 1   text       5171 non-null   object
 2   label_num  5171 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 121.3+ KB


In [None]:
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words("english"))

# Build the punctuation-stripping table once; it is identical for every message,
# so there is no reason to rebuild it on each loop iteration.
punctuation_table = str.maketrans("", "", string.punctuation)


def preprocess_text(raw_text):
  """Normalise one e-mail body into a space-joined string of stemmed tokens.

  Steps, in order: lower-case, delete every character in string.punctuation,
  split on whitespace, drop tokens found in stopwords_set, stem what is left.

  NOTE(review): punctuation is removed before the stop-word lookup, so a stop-word
  entry that itself contains punctuation can never match. Confirm this is intended.
  """
  tokens = raw_text.lower().translate(punctuation_table).split()
  return " ".join(stemmer.stem(token) for token in tokens if token not in stopwords_set)


# One cleaned document per row, in the same order as df["text"].
corpus = [preprocess_text(message) for message in df["text"]]

In [None]:
# TF-IDF vectoriser constructed with the library's default settings; it is fitted
# on the cleaned corpus in the next cell.
vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary/IDF weights from the cleaned corpus and encode every document.
# NOTE(review): the vectoriser is fitted on the full corpus before the train/test
# split below, so the IDF statistics also see the held-out documents.
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense copy of the TF-IDF matrix (presumably needed because GaussianNB does not
# accept sparse input; verify before removing, as the dense array is large).
X = tfidf_matrix.toarray()

# Numeric target column taken straight from the frame.
Y = df["label_num"]

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle so the
# split (and therefore the scores below) is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Three Naive Bayes variants, all with default hyper-parameters, to be compared on
# the same train/test split.
multi_clf, gauss_clf, bern_clf = MultinomialNB(), GaussianNB(), BernoulliNB()

In [None]:
# Train every classifier on the same training split so their test scores are
# directly comparable.
multi_clf.fit(x_train, y_train)
gauss_clf.fit(x_train, y_train)
bern_clf.fit(x_train, y_train)

In [None]:
# Held-out accuracy of the multinomial model (fraction of test rows predicted
# correctly, the same quantity multi_clf.score reports).
accuracy_score(y_test, multi_clf.predict(x_test))

0.9082125603864735

In [None]:
# Held-out accuracy of the Gaussian model (fraction of test rows predicted
# correctly, the same quantity gauss_clf.score reports).
accuracy_score(y_test, gauss_clf.predict(x_test))

0.9400966183574879

In [None]:
# Held-out accuracy of the Bernoulli model (fraction of test rows predicted
# correctly, the same quantity bern_clf.score reports).
accuracy_score(y_test, bern_clf.predict(x_test))

0.855072463768116

In [None]:
# Error breakdown for the multinomial model on the test split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
multi_pred = multi_clf.predict(x_test)
confusion_matrix(y_test, multi_pred)

array([[732,   0],
       [ 95, 208]])

In [None]:
# Error breakdown for the Gaussian model on the test split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
gauss_pred = gauss_clf.predict(x_test)
confusion_matrix(y_test, gauss_pred)

array([[702,  30],
       [ 32, 271]])

In [None]:
# Error breakdown for the Bernoulli model on the test split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
bern_pred = bern_clf.predict(x_test)
confusion_matrix(y_test, bern_pred)

array([[718,  14],
       [136, 167]])