# Optional 2: Spam classification

In [2]:
import pandas as pd
from nltk.metrics.distance import edit_distance 
from nltk.metrics import ConfusionMatrix
import numpy as np
from time import time
from datetime import timedelta

In [3]:
path = 'data/SMSSpamCollection'
corpus = pd.read_csv(path, lineterminator='\n',sep='\t', names=['label', 'sentence'], header=None)
corpus.head()

Unnamed: 0,label,sentence
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
def preprocessing(data):
    # remove the digits and puntuation
    #data['sentence'] = data['sentence'].str.replace('\d+', '')
    # convert to lowercase
    data['sentence'] = data['sentence'].str.replace('\W+', ' ')
    # replace continuous white spaces by a single one
    data['sentence'] = data['sentence'].str.replace('\s+', ' ')
    data['sentence'] =[a.strip().lower() for a in data['sentence']] 
    return data

In [5]:
corpus = preprocessing(corpus)
y = corpus['label']
X = corpus.drop('label',axis=1)
X.head()

Unnamed: 0,sentence
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor u c already then say
4,nah i don t think he goes to usf he lives arou...


In [6]:
from sklearn.model_selection import train_test_split
from nltk.metrics.scores import accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

### Handmade KNN

In [7]:
X_train_hand = X_train.copy()
X_train_hand['sentence']= X_train['sentence'].str.split()
X_train_hand.head()

Unnamed: 0,sentence
3406,"[jus, chillaxin, what, up]"
383,"[hey, leave, it, not, a, big, deal, take, care]"
852,"[i, am, real, baby, i, want, to, bring, out, y..."
1880,"[u, have, a, secret, admirer, who, is, looking..."
370,"[cool, text, me, when, you, re, ready]"


In [8]:
X_test_hand = X_test.copy()
X_test_hand['sentence']= X_test['sentence'].str.split()
X_test_hand.head()

Unnamed: 0,sentence
3245,"[squeeeeeze, this, is, christmas, hug, if, u, ..."
944,"[and, also, i, ve, sorta, blown, him, off, a, ..."
1044,"[mmm, thats, better, now, i, got, a, roast, do..."
2484,"[mm, have, some, kanji, dont, eat, anything, h..."
812,"[so, there, s, a, ring, that, comes, with, the..."


In [9]:
from nltk.metrics.distance import jaccard_distance

jaccard = lambda a,b : jaccard_distance(set(a),set(b))

def knn(ex,d=jaccard):
    try:
        dis = [d(train, ex) for train in X_train_hand['sentence']]
        index = X_train_hand['sentence'].index[np.argmin(dis)]
        label = y_train[index]
    # If the sentence is not valid (or has len 0) return a random value
    except:
        labs = ['spam','ham']
        label = labs[np.random.randint(0,2)]
    return label

def test_knn(x_test):
    preds = []
    for val in x_test['sentence'].values:
        preds.append(knn(val))
    print('Accuracy:',round(accuracy(y_test,preds),3))
    print(ConfusionMatrix(y_test.tolist(),preds).pretty_format())

In [10]:
init = time()
test_knn(X_test_hand)
print('time:', timedelta(seconds=time() - init))

Accuracy: 0.981
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2397>  19 |
spam |   34 <336>|
-----+-----------+
(row = reference; col = test)

time: 0:00:31.059042


### Sklearn version KNN

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train = cv.fit_transform(X_train['sentence'])
X_test = cv.transform(X_test['sentence'])

In [12]:
from sklearn.neighbors import KNeighborsClassifier
init = time()

clf = KNeighborsClassifier(1,metric='jaccard')
clf.fit(X_train.todense(),y_train)

preds = clf.predict(X_test.todense()).tolist()
print('Accuracy:',round(accuracy(y_test,preds),3))
print(ConfusionMatrix(y_test.tolist(),preds).pretty_format())
print('time:', timedelta(seconds=time() - init))

Accuracy: 0.978
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2398>  18 |
spam |   44 <326>|
-----+-----------+
(row = reference; col = test)

time: 0:01:23.509468


### Simple SVM

In [13]:
from sklearn.svm import SVC

clf = SVC(kernel='linear')
clf.fit(X_train,y_train)

preds = clf.predict(X_test).tolist()
print('Accuracy:',round(accuracy(y_test,preds),3))
print(ConfusionMatrix(y_test.tolist(),preds).pretty_format())
print('time:', timedelta(seconds=time() - init))           

Accuracy: 0.987
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2414>   2 |
spam |   33 <337>|
-----+-----------+
(row = reference; col = test)

time: 0:01:23.887914


### Custom Kernel SVM

In [14]:
def calculate_kernel_matrix(m1,m2):
    # m1.shape = (n_samples_1, n_features),
    # m2.shape = (n_samples_2, n_features)
    # return M (n_samples_1,n_samples2)
    n_samples_1 = m1.shape[0]
    n_samples_2 = m2.shape[0]
    M = np.zeros([n_samples_1,n_samples_2])
    k = 0
    for i in m1.index:
        l = 0
        for j in m2.index:
            M[k,l] = 2 ** len(set(m1[i]).intersection(set(m2[j])))
            l+=1
        k+=1
    return M

In [15]:
init = time()
clf2 = SVC(kernel='precomputed')
clf2.fit(calculate_kernel_matrix(X_train_hand['sentence'], X_train_hand['sentence']),y_train)
preds = clf2.predict(calculate_kernel_matrix(X_test_hand['sentence'], X_train_hand['sentence']))

In [16]:
print('Accuracy:',round(accuracy(y_test,preds),3))
print(ConfusionMatrix(list(y_test),list(preds)).pretty_format())
print('time:', timedelta(seconds=time() - init))           

Accuracy: 0.901
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2416>   . |
spam |  277  <93>|
-----+-----------+
(row = reference; col = test)

time: 0:09:17.403736
