In [1]:
import pandas as pd

In [52]:
# Read the labelled spam/ham messages from disk and preview the first five rows.
messages = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')
messages.head(n=5)

Unnamed: 0,Label,Message
0,spam,Congratulations! You've won a free ticket to t...
1,ham,"Hey, are we still on for lunch tomorrow?"
2,spam,You have been selected to receive a $1000 Walm...
3,ham,Can you send me the report by 5 PM today?
4,spam,Exclusive offer! Get 50% off on your next purc...


In [46]:
import re

## Data Cleaning and Preprocessing

In [9]:
import nltk

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer,SnowballStemmer

In [48]:
# English stop words from NLTK's stopwords corpus; the cleaning loop below
# uses this list to filter out uninformative words.
# NOTE(review): presumably needs the 'stopwords' corpus to be downloaded
# beforehand (e.g. nltk.download('stopwords')) — confirm on a fresh environment.
stp = stopwords.words('english')
stp

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [49]:
# WordNet-based lemmatizer; the cleaning loop below calls lemma.lemmatize(word)
# on every word that survives stop-word filtering.
# NOTE(review): presumably needs the NLTK 'wordnet' corpus installed — confirm.
lemma = WordNetLemmatizer()

##### Step 1 -> Replace every character other than a-z and A-Z with a space
##### Step 2 -> Convert each message to lower case
##### Step 3 -> Split the message into a list of words
##### Step 4 -> Drop every word found in the English stop-word list and lemmatize (reduce to its root word) each remaining word
##### Step 5 -> Join the remaining words back into a single string
##### Step 6 -> Append each cleaned message to the corpus list

In [50]:
# Build the cleaned corpus: one space-joined string of lemmatized,
# non-stop-word tokens per message, in the same row order as `messages`.

# Set membership is O(1); scanning the `stp` list for every word was O(len(stp)).
stop_words = set(stp)
# Anything that is not an ASCII letter is replaced by a space.
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly instead of `messages['Message'][i]`:
# that was a label lookup driven by a positional counter, so it failed (or hit
# the wrong row) whenever the index was not the default 0..n-1 range.
# Missing messages become empty strings so they yield an empty document rather
# than crashing the regex, which keeps `corpus` aligned row-for-row with labels.
for message in messages['Message'].fillna('').astype(str):
    words = non_letters.sub(' ', message).lower().split()
    kept = [lemma.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(kept))

In [51]:
# Display the cleaned messages produced by the preprocessing loop above.
corpus

['congratulation free ticket bahamas call claim prize',
 'hey still lunch tomorrow',
 'selected receive walmart gift card click claim',
 'send report pm today',
 'exclusive offer get next purchase limited time',
 'forget meeting tomorrow',
 'urgent account compromised click link secure',
 'happy birthday hope great day',
 'win brand new iphone participate survey get chance win',
 'help presentation next meeting']

## Create a bag of words 

#### Creating bag of words model

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

#### CountVectorizer's parameters (such as `max_features`) control how the bag of words is built

In [55]:
# Bag-of-words vectorizer whose vocabulary is capped at 30 features.
cv = CountVectorizer(max_features=30)

In [60]:
# Learn the vocabulary from the cleaned corpus and convert the resulting
# count matrix to a dense array.
X = cv.fit_transform(raw_documents=corpus).toarray()

In [61]:
# Display the dense count matrix produced by cv.fit_transform(corpus).toarray().
X

array([[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

In [63]:
# Shape of the count matrix: (number of documents, number of vocabulary terms).
X.shape

(10, 30)

The `fit_transform` method combines two steps:
Fit: It learns the vocabulary of the text data. In other words, it finds the unique words in your text documents and assigns each word a unique column index (when `max_features` is set, only the most frequent terms are kept).
Transform: It converts the text documents into a matrix where each row represents a document and each column represents a word from the vocabulary. The values in the matrix are the counts of each word in the respective document.

### Bag of Words with n-grams

In [67]:
# Fitted vocabulary: maps each term to its column index in X.
cv.vocabulary_

{'ticket': 23,
 'bahamas': 1,
 'claim': 4,
 'prize': 14,
 'still': 21,
 'tomorrow': 26,
 'selected': 19,
 'receive': 16,
 'walmart': 28,
 'click': 5,
 'send': 20,
 'report': 17,
 'pm': 12,
 'today': 25,
 'offer': 10,
 'get': 6,
 'next': 9,
 'purchase': 15,
 'time': 24,
 'meeting': 7,
 'urgent': 27,
 'account': 0,
 'secure': 18,
 'birthday': 2,
 'win': 29,
 'brand': 3,
 'new': 8,
 'participate': 11,
 'survey': 22,
 'presentation': 13}

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# Vectorizer that extracts unigrams, bigrams and trigrams, with the
# vocabulary capped at 15 features.
cv = CountVectorizer(ngram_range=(1, 3), max_features=15)

In [77]:
# Refit on the cleaned corpus with the n-gram vectorizer and convert the
# resulting count matrix to a dense array.
X = cv.fit_transform(raw_documents=corpus).toarray()

In [78]:
# Display the dense count matrix produced by the n-gram vectorizer.
X

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [79]:
# Fitted vocabulary of the n-gram vectorizer: maps each term (a single word or
# a multi-word n-gram) to its column index in X.
cv.vocabulary_

{'claim': 1,
 'prize': 12,
 'tomorrow': 13,
 'click': 2,
 'pm': 8,
 'get': 3,
 'next': 5,
 'meeting': 4,
 'account': 0,
 'win': 14,
 'participate survey': 6,
 'participate survey get': 7,
 'presentation': 9,
 'presentation next': 10,
 'presentation next meeting': 11}

#### The vocabulary now also contains 2-word and 3-word combinations (bigrams and trigrams) because `ngram_range=(1, 3)` extracts n-grams of length 1 to 3