# TP2 Machine Learning - Part 3: Text mining & classification  

*William BLAUFUKS* & *Virgile FOY*  

In [80]:
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
# nltk.download('stopwords')

## 1. Data loading

In [103]:
# Display settings: allow up to 200 rows and 150 characters per column so the
# preview below is not truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 150)

# The file is tab-separated and has no header row, so the columns are
# labelled 0 and 1 at this point.
df = pd.read_csv(
    "SMSSpamCollection.data",
    sep='\t',
    header=None,
)
df.head(100)

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845281007...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your frie...
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 ...
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 080...


## 2. CountVectorizer

We'll use the approach described in the following article: https://medium.com/@randerson112358/email-spam-detection-using-python-machine-learning-abe38c889855 (130 likes from the community after 6 months).

In [104]:
# Change the features' names according to the article:
# column 0 holds the label, column 1 holds the message text.
mapper = {
    0: 'spam',
    1: 'text'
}
df = df.rename(mapper, axis=1)

# Encode the label as an integer: ham -> 0, spam -> 1.
# An explicit map + astype is used instead of Series.replace, because replace
# relies on implicit object -> int downcasting, which pandas deprecates
# (the column could otherwise stay object-typed and be rejected by the
# classifiers as an unknown label type).
# The dtype guard keeps this cell safe to re-run: once the column is already
# integer-coded it is left untouched.
# Any label other than 'ham'/'spam' maps to NaN and makes astype raise,
# rather than being passed through silently.
if not pd.api.types.is_integer_dtype(df['spam']):
    df['spam'] = df['spam'].map({'ham': 0, 'spam': 1}).astype('int64')

In [105]:
# Tokenization (a list of tokens), will be used as the analyzer
#   1.Punctuations are [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
#   2.Stop words in natural language processing, are useless words (data).
def process_text(text):
    '''
    Tokenize a message; used as the CountVectorizer analyzer.

    Steps:
    1. Remove punctuation (every character of string.punctuation)
    2. Split on whitespace and remove English stopwords
       (case-insensitive comparison, original casing is kept)
    3. Return the list of remaining words

    Parameters:
        text (str): the raw message.

    Returns:
        list of str: the cleaned tokens, in their original order.
    '''

    #1 Delete all punctuation characters in a single pass
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    #2 Build the stopword set once per call: calling stopwords.words() inside
    #  the comprehension would rebuild the list for every single word and
    #  search it linearly.
    stop_words = set(stopwords.words('english'))
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    #3
    return clean_words

# Preview the tokenization on the first five messages (one list of tokens per message)
df['text'].head().apply(process_text)

0                                                      [Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]
1                                                                                                                           [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receive, entry, questionstd, txt, rateTCs, apply, 084528...
3                                                                                                            [U, dun, say, early, hor, U, c, already, say]
4                                                                                                     [Nah, dont, think, goes, usf, lives, around, though]
Name: text, dtype: object

### 2.1 Naive Bayes model

In [96]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# cross-validation, so the vocabulary also comes from the validation folds.
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

classifier = MultinomialNB()
# Smoothing parameter candidates: 0.1, 0.2, ..., 0.9
parameters = {
    'alpha': np.arange(.1, 1, .1)
}

# 10-fold grid search scored on accuracy. refit=False: only the CV scores are
# needed, no final model is trained.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.968):
{'alpha': 0.1}


### 2.2 K-nn model

In [101]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text.
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

classifier = KNeighborsClassifier()
# Number of neighbours to try: 1, 2, ..., 29
parameters = {'n_neighbors': np.arange(1,30)}

# 10-fold grid search scored on accuracy. refit=False: only the CV scores are
# needed, no final model is trained.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.949):
{'n_neighbors': 1}


### 2.3 CART model

In [67]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text.
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

# Fixed seed so that the 'random' splitter gives reproducible trees.
classifier = DecisionTreeClassifier(random_state=1)
parameters = {
    'criterion': ["gini", "entropy"],
    'splitter': ["best", "random"],
    # Depths 1..4, then 5, 10, ..., 100
    'max_depth': np.concatenate((np.arange(1, 5), 5*np.arange(1, 21)))
}

# 10-fold grid search. Note that this search is scored on PRECISION, not on
# accuracy like the Naive Bayes / k-NN searches on the same features.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='precision', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.845):
{'criterion': 'gini', 'max_depth': 95, 'splitter': 'random'}


### 2.4 Random Forest model

In [94]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text.
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

# max_depth=2 is only the constructor value: the grid below overrides it for
# every candidate. random_state=0 makes the forests reproducible.
classifier = RandomForestClassifier(max_depth=2, random_state=0)
# Depths to try: 1, 2, ..., 29
parameters = {
    'max_depth': np.arange(1, 30, 1)
}

# 10-fold grid search. Note that this search is scored on PRECISION, not on
# accuracy like the Naive Bayes / k-NN searches on the same features.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='precision', cv=10, refit=False)

# Warnings are silenced only while fitting, instead of a global
# warnings.filterwarnings('ignore') that would also hide the warnings of
# every cell executed afterwards.
# NOTE(review): presumably this targets the undefined-precision warnings
# raised when a fold gets no positive prediction -- confirm.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    search.fit(X, y)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.553):
{'max_depth': 29}


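The best accuracy we obtained (96.8%) comes from the Naive Bayes model. Surprisingly, the k-NN model (n_neighbors=1) also reaches a very good accuracy (94.9%). Note that the CART and Random Forest searches above are scored with precision rather than accuracy, so their scores (0.845 and 0.553) are not directly comparable with these two figures.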

## 3. Tf–idf term weighting

### 3.1 Naive Bayes model

In [74]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text ...
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

# ... re-weighted with tf-idf (no idf smoothing).
X = TfidfTransformer(smooth_idf=False).fit_transform(X)

classifier = MultinomialNB()
# Smoothing parameter candidates: 0.1, 0.2, ..., 0.9
parameters = {
    'alpha': np.arange(.1, 1, .1)
}

# 10-fold grid search. Note that this search is scored on RECALL, whereas the
# Naive Bayes search on raw counts is scored on accuracy.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='recall', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.938):
{'alpha': 0.1}


### 3.2 K-nn model

In [106]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text ...
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

# ... re-weighted with tf-idf (no idf smoothing).
X = TfidfTransformer(smooth_idf=False).fit_transform(X)

classifier = KNeighborsClassifier()
# Number of neighbours to try: 1, 2, ..., 29
parameters = {'n_neighbors': np.arange(1,30)}

# 10-fold grid search. Note that this search is scored on RECALL, whereas the
# k-NN search on raw counts is scored on accuracy.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='recall', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.596):
{'n_neighbors': 1}


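Surprisingly, the tf-idf transformation doesn't seem to help at all. It even makes the k-NN model perform very poorly (score of 0.596). Note, however, that the searches in this section are scored with recall, whereas the Naive Bayes and k-NN searches of section 2 are scored with accuracy, so the figures of the two sections are not directly comparable.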

## 4. Semantic decomposition

In [107]:
y = df['spam']

# Bag-of-words counts, tokenized with process_text.
X = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

# Truncated SVD of the count matrix, keeping 100 components.
semantic_decomp = TruncatedSVD(
             n_components=100, 
             algorithm='randomized', 
             n_iter=7, 
             # Fixed seed: the randomized solver would otherwise produce
             # different features (and CV scores) on every run.
             random_state=0, 
             tol=0.0
)
X = semantic_decomp.fit_transform(X)
print('preprocessing OK')

classifier = KNeighborsClassifier()
# Number of neighbours to try: 1, 2, ..., 29
parameters = {'n_neighbors': np.arange(1,30)}

# 10-fold grid search scored on recall. refit=False: only the CV scores are
# needed, no final model is trained.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises a TypeError), and the unweighted fold
# average it requested is the standard behaviour since 0.22.
search = GridSearchCV(classifier, parameters, scoring='recall', cv=10, refit=False)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

preprocessing OK
Best parameter (CV score=0.751):
{'n_neighbors': 1}


We can't use the Naive Bayes model after the semantic decomposition because `MultinomialNB` doesn't support negative feature values, and the components produced by the truncated SVD can be negative. The k-NN model doesn't perform very well here (recall of 0.751). In order to keep using the Naive Bayes model, we should use NMF, whose components are non-negative, instead of the semantic decomposition. In any case, the Naive Bayes model assumes that the input features are conditionally independent given the class, which is not true after the semantic decomposition.