# Vectorization

In [1]:
import pandas as pd
import string 
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option('display.max_colwidth', 100)

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
ss = nltk.SnowballStemmer('english')

In [3]:
data = pd.read_csv('SMSSpamCollection.tsv', sep='\t')
data.columns = ['label', 'body_text']
data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Clean up text (tokenize, remove punctuation, etc.)

In [4]:
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ''.join([word.lower() for word in text if word not in string.punctuation]) # Remove punctuation
    tokens = re.split('\W+',text) # Tokenize: Split on any character that is not alphanumeric
    text = [ss.stem(word) for word in tokens if word not in stopwords] # Remove stopwords & stem
    #text = [ss.stem(word) for word in tokenized_text] # Stemming
    
    return text

In [5]:
#ata['body_clean'] = data['body_text'].apply(lambda x: clean_text(x.lower(), stopwords))

In [6]:
cv = CountVectorizer(analyzer=clean_text)
# The model trains to create vectors and then actually transforms the data based on the fitted model
x_counts = cv.fit_transform(data['body_text']) 
print(x_counts.shape)

(5567, 8102)


In [7]:
#print(cv.get_feature_names())
data_sample = data[0:20]

cv_sample = CountVectorizer(analyzer=clean_text)
x_counts_sample = cv_sample.fit_transform(data_sample['body_text'])
print(x_counts_sample.shape)

(20, 192)


In [8]:
print(cv_sample.get_feature_names())

['08002986030', '08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txtú120', '6day', '81010', '87077', '87121', '87575', '9', '900', 'aft', 'aid', 'alreadi', 'alright', 'anymor', 'appli', 'ard', 'around', 'b', 'brother', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'claim', 'click', 'co', 'code', 'colour', 'comin', 'comp', 'copi', 'cost', 'credit', 'cri', 'csh11', 'cup', 'custom', 'da', 'date', 'dont', 'eg', 'eh', 'england', 'enough', 'entitl', 'entri', 'even', 'fa', 'feel', 'ffffffffff', 'final', 'fine', 'finish', 'first', 'free', 'friend', 'go', 'goalsteam', 'goe', 'gonna', 'gota', 'ha', 'hl', 'home', 'hour', 'httpwap', 'im', 'info', 'ive', 'jackpot', 'joke', 'k', 'kim', 'kl341', 'lar', 'latest', 'lccltd', 'like', 'link', 'live', 'lor', 'lunch', 'macedonia', 'make', 'may', 'meet', 'mell', 'membership', 'messag', 'minnaminungint', 'miss', 'mobil', 'month', 'nah', 'name', 'nati

### Sparse Matrix
- A ***sparse matrix***: A matric in which most elements are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements

In [9]:
x_counts_sample

<20x192 sparse matrix of type '<class 'numpy.int64'>'
	with 218 stored elements in Compressed Sparse Row format>

#### Document Term Matrix

In [12]:
# This is the document term matrix
x_counts_sample_df = pd.DataFrame(x_counts_sample.toarray())
x_counts_sample_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,182,183,184,185,186,187,188,189,190,191
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
x_counts_sample_df.columns = cv_sample.get_feature_names()
x_counts_sample_df.head()

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,wet,win,winner,wkli,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,yes,ü
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
