# Problem statement: Spam filtering using naive Bayes classifiers to predict, based on its content, whether a new mail can be categorized as spam or not-spam.

In [124]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
import matplotlib.pyplot as plt

In [125]:
# Load Dataset
# 'spam.tsv' is read as tab-separated text; `names` labels the two columns
# 'Class' (the ham/spam label) and 'Message' (the raw message text).

data=pd.read_csv('spam.tsv' ,sep='\t', names=['Class','Message'])
# Bare last expression: the notebook renders a truncated preview of the frame.
data

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


In [126]:
data.head()

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [127]:
data.tail()

Unnamed: 0,Class,Message
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...
5566,ham,Rofl. Its true to its name


In [128]:
data.info()         #summary of dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [129]:
# Add a 'Length' column holding the number of characters in each message.

data['Length'] = data['Message'].map(len)

In [130]:
# view the dataset with the column 'Length' which contains the number of characters present in each mail
data.head(10)

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,ham,As per your request 'Melle Melle (Oru Minnamin...,160
6,spam,WINNER!! As a valued network customer you have...,157
7,spam,Had your mobile 11 months or more? U R entitle...,154
8,ham,I'm gonna be home soon and i don't want to tal...,109
9,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136


In [131]:
data.describe()

Unnamed: 0,Length
count,5567.0
mean,80.450153
std,59.891023
min,2.0
25%,36.0
50%,62.0
75%,122.0
max,910.0


In [132]:
data.describe(include='O')

Unnamed: 0,Class,Message
count,5567,5567
unique,2,5164
top,ham,"Sorry, I'll call later"
freq,4821,30


In [133]:
# Count how many messages fall into each class (ham vs. spam)
data['Class'].value_counts()

Class
ham     4821
spam     746
Name: count, dtype: int64

# Text Pre-Processing

In [135]:
# Manual Encoding

In [136]:
# Label convention: 'ham' is encoded as 1 (written in place into the 'Class' column).
data.loc[data['Class'].eq('ham'), 'Class'] = 1

In [137]:
# Label convention: 'spam' is encoded as 0 (written in place into the 'Class' column).
data.loc[data['Class'].eq('spam'), 'Class'] = 0

In [138]:
data.head()

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [139]:
# Let's remove all punctuation.
# `string` is already imported in the first cell; importing it again here is harmless.
import string
# Display the characters that remove_punct strips from the messages.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [140]:
# Create function to remove the punctuation

# Translation table that deletes every character in string.punctuation;
# built once so that each call is a single str.translate pass.
_PUNCT_DELETE_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text, replacement=''):
    """Return `text` with every character of string.punctuation removed.

    Parameters
    ----------
    text : str
        Message to clean.
    replacement : str, optional
        String substituted for each punctuation character. The default ''
        deletes punctuation outright (the original behaviour), which can glue
        neighbouring words together, e.g. 'So...any' -> 'Soany'. Pass ' ' to
        keep such words separated.

    Returns
    -------
    str
        The cleaned text. Characters that are not in string.punctuation
        (letters, digits, whitespace, non-ASCII characters such as 'ü')
        are left untouched.
    """
    if replacement == '':
        return text.translate(_PUNCT_DELETE_TABLE)
    # One-off table mapping each punctuation character to `replacement`.
    return text.translate(str.maketrans(dict.fromkeys(string.punctuation, replacement)))

In [141]:
# Strip punctuation from every message, keeping the cleaned strings in row order.
text = [remove_punct(message) for message in data['Message']]

In [142]:
data['Text_clean']=text

In [143]:
data.head(10)

Unnamed: 0,Class,Message,Length,Text_clean
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL
5,1,As per your request 'Melle Melle (Oru Minnamin...,160,As per your request Melle Melle Oru Minnaminun...
6,0,WINNER!! As a valued network customer you have...,157,WINNER As a valued network customer you have b...
7,0,Had your mobile 11 months or more? U R entitle...,154,Had your mobile 11 months or more U R entitled...
8,1,I'm gonna be home soon and i don't want to tal...,109,Im gonna be home soon and i dont want to talk ...
9,0,"SIX chances to win CASH! From 100 to 20,000 po...",136,SIX chances to win CASH From 100 to 20000 poun...


In [144]:
# Separate the features (cleaned message text) from the target (encoded class label)
X=data['Text_clean'].values
y=data['Class'].values

In [145]:
X

array(['Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You have been wonderful and a blessing at all times',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       'Nah I dont think he goes to usf he lives around here though', ...,
       'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

In [146]:
y

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [147]:
# Datatype of y is still object. so convert it into int
y=y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

In [148]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the classes are imbalanced (4821 ham vs. 746 spam) and this split is not
# stratified — consider stratify=y; confirm first, since it changes the results below.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

In [149]:
X_train.shape

(4453,)

In [150]:
X_test.shape

(1114,)

# Bag of Words and TF-IDF

### CountVectorizer - to extract the features from text

In [175]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize the CountVectorizer (bag-of-words token counts) with
# scikit-learn's built-in English stop-word list enabled.
CV=CountVectorizer(stop_words='english')

In [154]:
# Fit the CountVectorizer on the training messages only and convert them into
# a document-term count matrix (the raw text becomes numeric feature vectors).

X_train_CV=CV.fit_transform(X_train)

In [155]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which can
# hide real problems (e.g. deprecations); consider a narrower filter.
warnings.filterwarnings('ignore')

In [156]:
CV.get_feature_names_out()

array(['008704050406', '0089my', '0121', ..., 'zyada', 'üll', '〨ud'],
      dtype=object)

# Training Model

In [158]:
NB=MultinomialNB()

In [159]:
NB.fit(X_train_CV, y_train)        #feed data to model

In [160]:
# Vectorize the test messages with the CountVectorizer already fitted on the training data.
X_test_CV=CV.transform(X_test)     # transform only (no re-fit): the test set must not influence the fitted vectorizer, which avoids data leakage

In [161]:
y_predict=NB.predict(X_test_CV)
y_predict

array([1, 1, 1, ..., 1, 1, 1])

In [162]:
# Classification report

print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       168
           1       0.99      0.99      0.99       946

    accuracy                           0.98      1114
   macro avg       0.97      0.96      0.97      1114
weighted avg       0.98      0.98      0.98      1114



In [163]:
# Confusion matrix as a cross-tabulation: rows are the true labels (y_test),
# columns are the predicted labels (y_predict); 0 = spam, 1 = ham.
pd.crosstab(y_test,y_predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,155,13
1,6,940


In [164]:
# Second model on the same count features: Bernoulli naive Bayes
bnb=BernoulliNB()

# Fit on the CountVectorizer training matrix
bnb.fit(X_train_CV, y_train)

# Predict labels for the CountVectorizer test matrix
y_hat1=bnb.predict(X_test_CV)

# Confusion matrix: rows are the true labels, columns the predicted labels (0 = spam, 1 = ham)
pd.crosstab(y_test,y_hat1)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,130,38
1,0,946


# TF-IDF

In [166]:
# Splitting X and Y

X=data['Text_clean'].values
y=data['Class'].values

In [167]:
X

array(['Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You have been wonderful and a blessing at all times',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       'Nah I dont think he goes to usf he lives around here though', ...,
       'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

In [168]:
y

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [183]:
y=y.astype('int')
y

array([1, 0, 1, ..., 1, 1, 1])

In [185]:
# NOTE(review): this split uses random_state=6, whereas the CountVectorizer experiment
# above used random_state=10, so the two experiments are scored on different test sets
# and their metrics are not directly comparable — consider reusing a single seed.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=6)

In [187]:
X_train

array(['Where u been hiding stranger',
       'Jus finish my lunch on my way home lor I tot u dun wan 2 stay in sch today',
       'Yeah like if it goes like it did with my friends imma flip my shit in like half an hour',
       ..., 'See the forwarding message for proof',
       'Dear we are going to our rubber place',
       ' and  picking them up from various points'], dtype=object)

In [193]:
y_train

array([1, 1, 1, ..., 1, 1, 1])

In [195]:
# TF-IDF vectorizer with default settings.
# NOTE(review): unlike the CountVectorizer above, no stop_words='english' is passed
# here — confirm whether that difference is intended.
tf=TfidfVectorizer()

In [201]:
# Fit TF-IDF on the training messages, then apply the fitted vectorizer to the test messages.
# NOTE: despite the `_cv` suffix these hold TF-IDF features; the names differ from the
# earlier count matrices X_train_CV / X_test_CV only by letter case.
X_train_cv=tf.fit_transform(X_train)
X_test_cv=tf.transform(X_test)

In [203]:
# Model creation: multinomial naive Bayes trained on the TF-IDF training features
nb=MultinomialNB()
nb.fit(X_train_cv,y_train)

In [205]:
y_hat=nb.predict(X_test_cv)

In [207]:
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [209]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.69      0.82       149
           1       0.95      1.00      0.98       965

    accuracy                           0.96      1114
   macro avg       0.98      0.85      0.90      1114
weighted avg       0.96      0.96      0.96      1114



In [211]:
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,103,46
1,0,965


In [215]:
# Bernoulli naive Bayes for the TF-IDF features.
# NOTE: this rebinds `nb`, replacing the MultinomialNB model fitted above.
# NOTE(review): BernoulliNB binarizes its input by default (binarize=0.0), so the TF-IDF
# weights are presumably reduced to term presence/absence here — confirm this is intended.
nb=BernoulliNB()

In [217]:
nb.fit(X_train_cv,y_train)

In [223]:
y_hat=nb.predict(X_test_cv)
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [267]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88       149
           1       0.97      1.00      0.98       965

    accuracy                           0.97      1114
   macro avg       0.98      0.89      0.93      1114
weighted avg       0.97      0.97      0.97      1114



In [269]:
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116,33
1,0,965
