In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Some characters in this file cannot be decoded with pandas' default settings,
# so the encoding is set explicitly to cp1252 (fix found by searching Stack
# Overflow for the error message raised by the default read).
df = pd.read_csv('spam.csv', encoding='cp1252')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Exploration

In [3]:
# Number of missing values in each column.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Summary of the label column: row count, number of distinct labels,
# the most frequent label and how often it occurs.
df['Category'].describe()

count     5572
unique       2
top        ham
freq      4825
Name: Category, dtype: object

In [5]:
# Class balance: number of messages per label.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

Encoding 'Category' Feature

In [6]:
# Binary target column: 1 where the label is 'spam', 0 for any other value.
# A vectorised comparison replaces the row-by-row `apply(lambda ...)`; the explicit
# 'int64' cast keeps the same dtype the original column had on every platform.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Hold out 25% of the messages for testing.
# - random_state pins the shuffle, so the split (and every result derived from it)
#   is identical on each Restart & Run All.
# - stratify keeps the ham/spam ratio the same in the train and test parts, which
#   matters because spam is the minority class (747 of 5572 messages).
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    test_size=0.25,
    random_state=42,
    stratify=df['spam'],
)

Up to here, we have encoded the 'Category' column as a numeric 'spam' column.

The messages are still in text format. ML algorithms cannot work directly with words and sentences, so the text needs to be converted to numbers.

## sklearn CountVectorizer
CountVectorizer is one of the techniques for representing text as word counts.

Similar to pd.get_dummies, CountVectorizer treats every unique word in the text it is fitted on (here, the training messages) as a separate column.

For each record, it counts the occurrences of each word and builds a matrix of those counts.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Vectorizer with default settings; it is fitted on the training messages in the next cell.
v = CountVectorizer()

In [10]:
# Learn the vocabulary from the training messages only and encode them as a
# count matrix (one row per message, one column per vocabulary word).
X_train_count = v.fit_transform(X_train.values)
# Preview the first three rows. Slice the sparse matrix *before* converting to a
# dense array so only three rows are materialised, not the whole matrix.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
from sklearn.naive_bayes import MultinomialNB

## Gaussian Naive Bayes: used when all features are continuous and assumed to be normally distributed,
## i.e. they cannot be represented in terms of their occurrences.

## Multinomial Naive Bayes: used when we have discrete data (e.g. movie ratings from 1 to 5, where each rating has a certain frequency).

## Bernoulli Naive Bayes: assumes that all features are binary (e.g. 0: word does not exist in the document,
##  1: word exists in the document).

In [12]:
# Multinomial Naive Bayes with default settings; the features here are word counts.
model = MultinomialNB()

In [13]:
# Train on the training-set word-count matrix and the 0/1 spam labels.
model.fit(X_train_count, y_train)

MultinomialNB()