In [1]:
#Data handling libraries
import pandas as pd
import numpy as np
#Data Preprocessing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_table('SMSSpamCollection',names=['label','message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
le=LabelEncoder()
df['label']=le.fit_transform(df['label'])

In [6]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


##  Count Vectorizer

In [7]:
msg=['Hi I have reached home','You have won a lottery of 10 Lakh rupees','How are you','Good morning',
     'Hi this is Shivam from ML batch','Congratulations you have won a lottery','Good morning morning']


In [8]:
msg

['Hi I have reached home',
 'You have won a lottery of 10 Lakh rupees',
 'How are you',
 'Good morning',
 'Hi this is Shivam from ML batch',
 'Congratulations you have won a lottery',
 'Good morning morning']

In [9]:
cv = CountVectorizer()
cv.fit(msg)

CountVectorizer()

In [10]:
words=cv.get_feature_names()



In [11]:
matrix=cv.transform(msg).toarray()

In [12]:
df_eg=pd.DataFrame(data=matrix,columns=words)

In [13]:
df_eg

Unnamed: 0,10,are,batch,congratulations,from,good,have,hi,home,how,...,lottery,ml,morning,of,reached,rupees,shivam,this,won,you
0,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,1,0,0,1,1
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,1,0,0
5,0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
6,0,0,0,0,0,1,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


##  Count Vector Implementation on SMS data

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   int32 
 1   message  5572 non-null   object
dtypes: int32(1), object(1)
memory usage: 65.4+ KB


In [15]:
cv=CountVectorizer()

In [16]:
x = df['message']
y = df['label']

In [17]:
cv.fit(x)

CountVectorizer()

In [18]:
unique_words=cv.get_feature_names()



In [19]:
len(unique_words)

8713

In [20]:
count=cv.transform(x).toarray()

In [21]:
df_message=pd.DataFrame(data=count,columns=unique_words)

In [22]:
df_message

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
x_train,x_test,y_train,y_test=train_test_split(df_message,y,test_size=0.3,random_state=1)

In [24]:
nb = MultinomialNB()

In [25]:
nb.fit(x_train,y_train)

MultinomialNB()

In [26]:
y_pred=nb.predict(x_test)

In [27]:
accuracy_score(y_test,y_pred)

0.9838516746411483

##  Cross validation on the above model

In [28]:
crv=cross_val_score(estimator=MultinomialNB(),X=df_message,y=y,cv=10)

In [29]:
crv

array([0.98207885, 0.97849462, 0.97845601, 0.98025135, 0.97845601,
       0.97845601, 0.98204668, 0.98384201, 0.97127469, 0.99102334])

In [30]:
crv.mean()

0.9804379580831772

In [31]:
new_record=['You have won a lucky draw worth $10000']

In [32]:
transform_array=cv.transform(new_record).toarray()

In [33]:
len(transform_array[0])

8713

In [34]:
transform_array

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
tranform_record=pd.DataFrame(data=transform_array,columns=unique_words)

In [39]:
pred=nb.predict(tranform_record)

In [40]:
pred

array([1])

In [41]:
le.inverse_transform(pred)

array(['spam'], dtype=object)