# **Import Libraries**

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# **Importing the Dataset**


In [89]:
dataset = pd.read_csv('spam.csv',encoding='latin-1')[['v1', 'v2']]
dataset

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# **Downloading NLTK Resources**

In [90]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Preprocess the text data by tokenizing, removing stop words, and performing stemming**


In [91]:

def preprocess_text(text, stemming=True):
  text = text.lower()
  tokens = word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words]

  stemmer = PorterStemmer()
  tokens = [stemmer.stem(token) for token in tokens]

  return ' '.join(tokens)


dataset["preprocessed_text"] = dataset.iloc[:, 1].apply(preprocess_text)

print(dataset["preprocessed_text"])


0       go jurong point , crazi .. avail bugi n great ...
1                           ok lar ... joke wif u oni ...
2       free entri 2 wkli comp win fa cup final tkt 21...
3             u dun say earli hor ... u c alreadi say ...
4              nah n't think goe usf , live around though
                              ...                        
5567    2nd time tri 2 contact u. u å£750 pound prize ...
5568                           ì_ b go esplanad fr home ?
5569                        piti , * mood . ... suggest ?
5570    guy bitch act like 'd interest buy someth els ...
5571                                     rofl . true name
Name: preprocessed_text, Length: 5572, dtype: object


# **Split the dataset Into X,Y**

In [92]:
X = dataset["preprocessed_text"]
y = dataset.iloc[:, 0]

# **Split Train Data into Train and Validation sets**


In [93]:
Xtrain,Xval, Ytrain,Yval= train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=42)

In [94]:
Xtrain.head()

184                                      go noth great.by
2171                                wont . wat 's wit guy
5422                ok k .. sri knw 2 siva .. tat askd ..
4113    ? ? stand away ? n't heart ach without ? n't w...
4588                             finish work yet someth ?
Name: preprocessed_text, dtype: object

# **Feature extraction using TF-IDF**

In [70]:
vectorizer = TfidfVectorizer(max_features=2000)
X_features_train = vectorizer.fit_transform(Xtrain)
X_features_test = vectorizer.fit_transform(Xval)

# **Multinomial Naive Bayes model**

In [95]:
model = MultinomialNB()
model.fit(X_features_train, Ytrain)

# **Make predictions on the test set**

In [96]:
y_pred = model.predict(X_features_test)

# **Evaluate the performance of the model**

In [97]:
accuracy = accuracy_score(Yval, y_pred)
print("Accuracy:", accuracy)
precision = precision_score(Yval, y_pred ,pos_label='spam')
print("Precision:", precision)
recall = recall_score(Yval, y_pred, pos_label='spam')
print("Recall:", recall)
f1 = f1_score(Yval, y_pred, pos_label='spam')
print("F1-Score:", f1)




Accuracy: 0.8430493273542601
Precision: 0.3984375
Recall: 0.3422818791946309
F1-Score: 0.3682310469314079
