In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("spams.csv")
df.head()

Unnamed: 0,title,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('title').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,1143,1110,"Sorry, I'll call later",8
spam,196,187,Free entry in 2 a wkly comp to win FA Cup fina...,2


In [4]:
    #The describe() method returns description of the data in the DataFrame.
    #need to convert the title into numbers such as whether spam or not

In [5]:
df['spam']=df['title'].apply(lambda x:1 if x=='spam' else 0) #the lambda function takes for every value x and checks if its spam or ham if it is spam then it returns 1 or else it returns 0
df.head

<bound method NDFrame.head of      title                                            Message  spam
0      ham  Go until jurong point, crazy.. Available only ...     0
1      ham                      Ok lar... Joking wif u oni...     0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3      ham  U dun say so early hor... U c already then say...     0
4      ham  Nah I don't think he goes to usf, he lives aro...     0
...    ...                                                ...   ...
1334   ham         Oh... Icic... K lor, den meet other day...     0
1335   ham  Oh ! A half hour is much longer in Syria than ...     0
1336   ham  Sometimes we put walls around our hearts,not j...     0
1337   ham  Sweet, we may or may not go to 4U to meet carl...     0
1338   ham       Then she buying today? Ü no need to c meh...     0

[1339 rows x 3 columns]>

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train,x_test,y_train,y_test=train_test_split(df.Message,df.title,test_size=0.25)

In [8]:
#Convert message column into numbers using CountVectorizer
#In order to use textual data for predictive modeling, the text must be parsed to remove certain words – this process is called tokenization. These words need to then be encoded as integers, or floating-point values, for use as inputs in machine learning algorithms. This process is called feature extraction (or vectorization).
#Scikit-learn’s CountVectorizer is used to convert a collection of text documents to a vector of term/token counts. It also enables the ​pre-processing of text data prior to generating the vector representation. This functionality makes it a highly flexible feature representation module for text.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer()
x_train_count=v.fit_transform(x_train.values)
x_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
#We use multinomialNB. 
#There are 3 types of classifiers under Naive Bayes: 1.Bernoulli NB  2.Multinomial NB  3.Gaussian NB
# 1. We use Bernoulli NB when the features are of either 0 or 1 values
# 2. We use Multinomial NB when we have discrete data 
# 3. We use Gaussian NB when there is normal distribution and all our features are continuous.

In [11]:
from sklearn.naive_bayes import MultinomialNB 
model=MultinomialNB()
model.fit(x_train_count,y_train)

MultinomialNB()

In [13]:
emails=['Ok ill tell the company','Ur balance is now £500. Ur next question is: Who sang dgusyfdb ? 2 answer txt ur ANSWER to 83600. Good luck!']
emails_count=v.transform(emails)
model.predict(emails_count)

array(['ham', 'spam'], dtype='<U4')

In [14]:
x_test_count=v.transform(x_test)
model.score(x_test_count,y_test)

0.9791044776119403

In [15]:
#Another way to do the above program is by using a pipeline. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by a '__'


In [16]:
from sklearn.pipeline import Pipeline 
clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [17]:
clf.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [18]:
clf.score(x_test,y_test)

0.9791044776119403