# SMS SPAM DETECTION USING NAIVE BAYES


## Import Libraries

In [24]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB #for texts related problem we use Multinomial naive bayes
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


## Load the Dataset

In [3]:
# Hard-coded absolute location of the SMS spam CSV; adjust when running elsewhere.
csv_path = "/content/spam.csv"
# latin-1 maps every byte to a character, so special characters in the
# messages cannot raise a decode error. Only the v1 and v2 columns are kept.
data = pd.read_csv(csv_path, encoding='latin-1')[['v1', 'v2']]

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Rename v1 -> label and v2 -> message using an explicit mapping. Unlike
# assigning a positional list to `data.columns`, this does not depend on
# column order and stays safe to re-run after more columns have been added.
data = data.rename(columns={'v1': 'label', 'v2': 'message'})

In [6]:
# Preview the first five rows to confirm the new column names.
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Process the data

In [7]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_codes)
# Series.map yields NaN for any value missing from the mapping; fail here with
# a clear message instead of letting NaN targets reach model training.
assert data['label_num'].notna().all(), "unexpected value in 'label' column (expected 'ham' or 'spam')"

In [23]:
# Preview the first five rows, now including the numeric label_num column.
data.head(n=5)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



## Split data into train and test

In [15]:
# Features are the raw message texts; targets are the numeric labels.
x, y = data['message'], data['label_num']

In [16]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Show the size of each split: train/test features, then train/test labels.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(4457,)
(1115,)
(4457,)
(1115,)


## Convert Text into Numerical Features

In [20]:
# Turn each message into a vector of word counts.
# stop_words='english' drops common English words (e.g. "the", "is", "was").
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training messages only...
x_train_vec = vectorizer.fit_transform(x_train)
# ...and reuse that same vocabulary for the test messages, so the test set
# has no influence on the learned features.
x_test_vec = vectorizer.transform(x_test)


## Train the Naive Bayes Model

In [21]:
# Fit a Multinomial Naive Bayes classifier on the training count vectors
# and their numeric labels.
model = MultinomialNB()
model.fit(X=x_train_vec, y=y_train)

## Make predictions

In [22]:
# Predict a label for every vectorized test message.
y_pred = model.predict(X=x_test_vec)

## Evaluate the model

In [26]:
# Compute each metric on the held-out test labels first, then report them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy', accuracy)
# NOTE(review): "matric" is a typo for "matrix"; the label is left unchanged
# so the printed text stays the same.
print('\n Confusion matric\n', conf_matrix)
print('\n Classification report\n', report)

Accuracy 0.9838565022421525

 Confusion matric
 [[959   6]
 [ 12 138]]

 Classification report
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Try it yourself

In [30]:
# Sample messages to classify with the trained model.
msg = [
    "Come tonight for Diwali party and get free drinks with your friends before 10pm",
    "I love biryani",
    "You won $1000 cash prize!",
]

# Reuse the already-fitted vectorizer so the samples are encoded with the
# same vocabulary the model was trained on.
msg_vec = vectorizer.transform(msg)
pred = model.predict(msg_vec)

# The loop variable is deliberately not named `msg`, so the list above is not
# rebound to its last string once the loop finishes.
for text, label in zip(msg, pred):
    print(f"{text} --> {'Spam' if label else 'Not Spam'}")

Come tonight for Diwali party and get free drinks with your friends before 10pm --> Not Spam
I love biryani --> Not Spam
You won $1000 cash prize! --> Spam
