In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score , confusion_matrix, classification_report

In [2]:
df = pd.read_csv('SMSSpamCollection.csv', sep='\t', names=['label', 'messages'])

In [3]:
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     5572 non-null   object
 1   messages  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.sample(5)

Unnamed: 0,label,messages
4947,ham,I'm already back home so no probably not
5312,ham,Here got ur favorite oyster... N got my favori...
4657,ham,"K, I'll work something out"
3417,ham,Uhhhhrmm isnt having tb test bad when youre sick
867,ham,"Same here, but I consider walls and bunkers an..."


In [6]:
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [8]:
df['label'] = encoder.fit_transform(df['label'])

In [9]:
df.sample(5)

Unnamed: 0,label,messages
4808,1,PRIVATE! Your 2004 Account Statement for 07849...
1956,0,"Aight I'll grab something to eat too, text me ..."
4694,0,Tessy..pls do me a favor. Pls convey my birthd...
1110,0,S s..first time..dhoni rocks...
2121,0,"Argh my 3g is spotty, anyway the only thing I ..."


In [10]:
df.isnull().sum()

label       0
messages    0
dtype: int64

 df.duplicated().sum()

In [11]:
df = df.drop_duplicates(keep='first')

In [12]:
df.duplicated().sum()

0

# EDA 

In [13]:
import nltk

In [14]:
!pip install nltk



In [None]:
nltk.download('punkt')

In [16]:
df['num_characters'] = df['messages'].apply(len)

In [17]:
df.head()

Unnamed: 0,label,messages,num_characters
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [18]:
df['num_words'] = df['messages'].apply(lambda x:len(nltk.word_tokenize(x)))

In [19]:
df.sample(5)

Unnamed: 0,label,messages,num_characters,num_words
4347,0,You still around? I could use a half-8th,40,9
282,0,"Wen u miss someone, the person is definitely s...",141,30
2606,0,HELLO U.CALL WEN U FINISH WRK.I FANCY MEETIN U...,132,30
549,0,Wait &lt;#&gt; min..,22,10
975,0,Eh u send wrongly lar...,24,6


In [20]:
df[['num_characters','num_words']].describe()

Unnamed: 0,num_characters,num_words
count,5169.0,5169.0
mean,79.344554,18.591023
std,58.437457,13.40284
min,2.0,1.0
25%,36.0,9.0
50%,61.0,15.0
75%,119.0,26.0
max,910.0,220.0


In [21]:
df[df['label'] == 0][['num_characters','num_words']].describe()

Unnamed: 0,num_characters,num_words
count,4516.0,4516.0
mean,70.90589,17.264836
std,56.715046,13.587852
min,2.0,1.0
25%,34.0,8.0
50%,53.0,13.0
75%,91.0,22.0
max,910.0,220.0


# model Building

In [22]:
X_train, X_test, y_train, y_test = tts(df['messages'], df['label'], test_size=0.2, random_state=1)     

In [23]:
X_train

335     Valentines Day Special! Win over £1000 in our ...
3647    Carlos says we can pick up from him later so y...
2341    I will take care of financial problem.i will h...
32                          K tell me anything about you.
4112    URGENT! Your Mobile number has been awarded a ...
                              ...                        
2904                          Tell me pa. How is pain de.
925     Actually i deleted my old website..now i m blo...
4249    accordingly. I repeat, just text the word ok o...
239     New Theory: Argument wins d SITUATION, but los...
5560                    Anything lor. Juz both of us lor.
Name: messages, Length: 4135, dtype: object

In [24]:
X_test

3131                               Ok. But i finish at 6.
4077    87077: Kick off a new season with 2wks FREE go...
3238    Ron say fri leh. N he said ding tai feng cant ...
3420    Do you want a new Video phone? 600 anytime any...
2225    I prefer my free days... Tues, wed, fri oso ca...
                              ...                        
3812          Excellent! Wish we were together right now!
1741    UR GOING 2 BAHAMAS! CallFREEFONE 08081560665 a...
1098    Don't fret. I'll buy the ovulation test strips...
3405    Then ü ask dad to pick ü up lar... Ü wan 2 sta...
2118    Wish u many many returns of the day.. Happy bi...
Name: messages, Length: 1034, dtype: object

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [26]:
train_data = cv.fit_transform(X_train)

In [27]:
test_data = cv.transform(X_test)

In [28]:
Mnb = MultinomialNB()
Mnb.fit(train_data, y_train)

In [29]:
MnbPredicts = Mnb.predict(test_data)

In [30]:
print("The accuracy of our Naïve Bayes multinomial model is {} %".format(accuracy_score(y_test, MnbPredicts) * 100))
print("The Precision of our Naïve Bayes multinomial model is {} %". format(precision_score(y_test, MnbPredicts)* 100))
print("The Recall of our Naïve Bayes multinomial model is {} %" . format(recall_score(y_test, MnbPredicts)* 100))

The accuracy of our Naïve Bayes multinomial model is 99.03288201160542 %
The Precision of our Naïve Bayes multinomial model is 98.1651376146789 %
The Recall of our Naïve Bayes multinomial model is 93.04347826086956 %


In [31]:
confusionmatrix = confusion_matrix(y_test, MnbPredicts)
print("The accuracy of Naive Bayes clasifier is {}%".format(accuracy_score(y_test, MnbPredicts) * 100))
print("\n", confusionmatrix)


     

The accuracy of Naive Bayes clasifier is 99.03288201160542%

 [[917   2]
 [  8 107]]
