In [4]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wl = nltk.WordNetLemmatizer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

# also check spaCy and language models

## Cleaning
We remove punctuations and stopwords(too common words), we turn each document into a list of tokens(words) and we lemmatize each word, meaning we convert words into their root form.

(check: lemmatization vs stemming)

In [5]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wl.lemmatize(word) for word in tokens if word not in stopwords]
    return text
data['body_text'].apply(clean_text)

0       [free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...
1                                                         [nah, dont, think, go, usf, life, around, though]
2                                                    [even, brother, like, speak, treat, like, aid, patent]
3                                                                                            [date, sunday]
4       [per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre...
                                                       ...                                                 
5562    [2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1...
5563                                                                     [ü, b, going, esplanade, fr, home]
5564                                                                        [pity, mood, soany, suggestion]
5565       [guy, bitching, a

## Count Vectorizer
Each column represents a different word and each row are the occurrences of that word in a document

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
X_counts.shape

(5567, 8911)

In [7]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df.columns = count_vect.get_feature_names_out()
X_counts_df

Unnamed: 0,Unnamed: 1,0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtorius,zoom,zouk,zyada,é,ü,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X_counts_df.sum().sort_values(ascending=False)

u           1193
call         607
2            482
im           464
             423
            ... 
lambu          1
lambda         1
lakh           1
laidwant       1
〨ud            1
Length: 8911, dtype: int64

## N-Grams
N-Grams are tokens of multiple words, they are used to try and retain the context. "the tree of knowledge is forbidden" with n-gram = 3 gives a list ["the tree of", "tree of knowledge", "of knowledge is", "knowledge is forbidden"]


We need the untokenized clean text to produce our n-gram tokens

In [9]:
def untokenized_clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([wl.lemmatize(word) for word in tokens if word not in stopwords])
    return text

data['cleaned_text'] = data['body_text'].apply(untokenized_clean_text)
data

Unnamed: 0,label,body_text,cleaned_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
1,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think go usf life around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy fr...
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...,2nd time tried 2 contact u u 750 pound prize 2 claim easy call 087187272008 now1 10p per minute ...
5563,ham,Will ü b going to esplanade fr home?,ü b going esplanade fr home
5564,ham,"Pity, * was in mood for that. So...any other suggestions?",pity mood soany suggestion
5565,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,guy bitching acted like id interested buying something else next week gave u free


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# (x,y) where x is the starting range of produced n-grams
# and y is the end range of produced n-grams
ngram_vect = CountVectorizer(ngram_range=(2,2))
X_gram_counts = ngram_vect.fit_transform(data['cleaned_text'])
X_gram_counts.shape, ngram_vect.get_feature_names_out()

((5567, 31621),
 array(['008704050406 sp', '0089my last', '0121 2025050', ...,
        'üll submitting', 'üll take', '〨ud evening'], dtype=object))

In [11]:
X_gram_counts_df = pd.DataFrame(X_gram_counts.toarray())
X_gram_counts_df.columns = ngram_vect.get_feature_names_out()
X_gram_counts_df

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,zoe 18,zoe hit,zogtorius staring,zoom cine,zouk nichols,zyada kisi,üll finish,üll submitting,üll take,〨ud evening
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
X_gram_counts_df.sum().sort_values(ascending=False)

please call       53
call later        52
ill call          48
let know          42
sorry ill         39
                  ..
group mail         1
group liao         1
group company      1
group attached     1
〨ud evening        1
Length: 31621, dtype: int64

## Term Frequency, Inverse Document Frequency (TF-IDF)
We combine the frequency of a term in a document(text) and multiply it by the log of the inverse frequency of in how many documents that word appears.
(for example, if a word appears in every document: log(n_documents/n_appearances) = log(1) = 0; which makes sense because that word contains no information for our purpose)

This is useful to dissipate the weight of words that appear in too many documents(similar to stopwords), and also to intensify the weight of words used only in certain documents.
It's a tradeoff where document size and total n of documents matter.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

data

Unnamed: 0,label,body_text,cleaned_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
1,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think go usf life around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy fr...
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...,2nd time tried 2 contact u u 750 pound prize 2 claim easy call 087187272008 now1 10p per minute ...
5563,ham,Will ü b going to esplanade fr home?,ü b going esplanade fr home
5564,ham,"Pity, * was in mood for that. So...any other suggestions?",pity mood soany suggestion
5565,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,guy bitching acted like id interested buying something else next week gave u free


In [16]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf.shape, tfidf_vect.get_feature_names_out()

((5567, 8911),
 array(['', '0', '008704050406', ..., 'ü', 'üll', '〨ud'], dtype=object))

In [17]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names_out()
X_tfidf_df

Unnamed: 0,Unnamed: 1,0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtorius,zoom,zouk,zyada,é,ü,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32906,0.0,0.0
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0


In [18]:
X_tfidf_df.sum().sort_values(ascending=False)

u              163.291053
call           107.923967
ok              93.627056
im              92.573196
                91.705847
                  ...    
theseyours       0.096727
rememberi        0.096727
approaching      0.096727
dasara           0.096727
ugadi            0.096727
Length: 8911, dtype: float64

## tf-idf n-grams
Yes, we can combine these

In [20]:
tfidf_ngram_vect = TfidfVectorizer(ngram_range=(2,2))
X_tfidf_gram = tfidf_ngram_vect.fit_transform(data['cleaned_text'])
X_tfidf_gram.shape, tfidf_ngram_vect.get_feature_names_out()

((5567, 31621),
 array(['008704050406 sp', '0089my last', '0121 2025050', ...,
        'üll submitting', 'üll take', '〨ud evening'], dtype=object))

In [21]:
X_tfidf_gram_df = pd.DataFrame(X_tfidf_gram.toarray())
X_tfidf_gram_df.columns = tfidf_ngram_vect.get_feature_names_out()
X_tfidf_gram_df

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,zoe 18,zoe hit,zogtorius staring,zoom cine,zouk nichols,zyada kisi,üll finish,üll submitting,üll take,〨ud evening
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X_tfidf_gram_df.sum().sort_values(ascending=False)

call later            25.112877
ill call              23.761147
sorry ill             21.302681
please call           12.931733
ok lor                11.145534
                        ...    
praise herlove         0.112426
beautiful lady         0.112426
beautiful thenwill     0.112426
time dreamlove         0.112426
every breath           0.112426
Length: 31621, dtype: float64