# Vectorizing Raw Data: N-Grams

In [1]:
import pandas as pd
import nltk 
import string 
import re
pd.set_option('display.max_colwidth', 100)

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
ss = nltk.SnowballStemmer('english')

In [3]:
data = pd.read_csv('SMSSpamCollection.tsv', sep='\t')
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


In [8]:
def clean_text(text):
    #stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = ' '.join([ss.stem(word) for word in tokens if word not in stopwords]) # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

In [9]:
data['cleaned_text'] = data['body_text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,body_text,cleaned_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entri 2 wkli comp win fa cup final tkts 21st may 2005 text fa 87121 receiv entri questionst...
1,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goe usf live around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend...


## Apply CountVectorizer (w/ N-Grams) 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Apply on smaller dataset to make it easy to visualize the implementation
data_sample = data[:20]

### Vectorizer outputs a sparse matrix
- A ***sparse matrix***: A matric in which most elements are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements

In [13]:
bi_gram = CountVectorizer(ngram_range=(2,2))
xcounts = bi_gram.fit_transform(data['cleaned_text'])

print(xcounts.shape)
#print(bi_gram.get_feature_names())

(5567, 31275)


In [16]:
# Sample on smaller data

bigram_sample = CountVectorizer(ngram_range=(2,2))
xcounts_sample = bigram_sample.fit_transform(data_sample['cleaned_text'])

print(xcounts_sample.shape)
print(bigram_sample.get_feature_names())

(20, 198)
['09061701461 claim', '100 20000', '100000 prize', '11 month', '12 hour', '150pday 6day', '16 tsandc', '20000 pound', '2005 text', '21st may', '4txtú120 poboxox36504w45wq', '6day 16', '81010 tc', '87077 eg', '87077 trywal', '87121 receiv', '87575 cost', '900 prize', 'aft finish', 'aid patent', 'alright way', 'anymor tonight', 'appli 08452810075over18', 'appli repli', 'ard smth', 'around though', 'brother like', 'call 09061701461', 'call mobil', 'caller press', 'callertun caller', 'camera free', 'cash 100', 'chanc win', 'claim 81010', 'claim call', 'claim code', 'click httpwap', 'click wap', 'co free', 'code kl341', 'colour mobil', 'comp win', 'copi friend', 'cost 150pday', 'credit click', 'cri enough', 'csh11 send', 'cup final', 'custom select', 'da stock', 'date sunday', 'dont miss', 'dont think', 'dont want', 'eg england', 'eh rememb', 'england 87077', 'england macedonia', 'enough today', 'entitl updat', 'entri questionstd', 'entri wkli', 'even brother', 'fa 87121', 'fa cup

### Create DataFrame 

In [18]:
df = pd.DataFrame(xcounts.toarray())
df.columns = bi_gram.get_feature_names()
df.head()

Unnamed: 0,008704050406 sp,0089mi last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,zoe 18,zoe hit,zogtorius stare,zoom cine,zouk nichol,zyada kisi,üll finish,üll submit,üll take,〨ud even
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
