In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("spam.csv")

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
df['spam'] = df['Category'].apply(lambda x:1 if x=="spam" else 0)

In [7]:
df.shape

(5572, 3)

In [8]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
import spacy

In [10]:
nlp = spacy.load("en_core_web_lg")

In [11]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [12]:
df["processed_msg"] = df["Message"].apply(preprocess)

In [13]:
df.shape

(5572, 4)

In [14]:
df.head()

Unnamed: 0,Category,Message,spam,processed_msg
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,U dun early hor U c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think go usf live


In [15]:
df['processed_msg'][0]

'jurong point crazy available bugis n great world la e buffet cine get amore wat'

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df.processed_msg, df.spam, test_size = 0.2)

In [20]:
X_train.shape

(4457,)

In [21]:
X_test.shape

(1115,)

In [22]:
type(X_train)

pandas.core.series.Series

In [23]:
X_train[:4]

5021                            cool little get time soon
4082                                      hurry home soup
5112    December mobile 11mths+ entitle update late co...
127     know yetunde send money send text bother send ...
Name: processed_msg, dtype: object

In [24]:
y_train[:4]

5021    0
4082    0
5112    1
127     0
Name: spam, dtype: int64

In [25]:
clf1 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,3))),
    ('MultiNB', MultinomialNB())
])

In [26]:
clf2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('MultiNB', MultinomialNB())
])

In [27]:
clf1.fit(X_train, y_train)

In [28]:
y_pred1 = clf1.predict(X_test)

In [29]:
score1 = clf1.score(X_test, y_test)

In [30]:
print(score1)

0.9811659192825112


In [31]:
clf2.fit(X_train, y_train)

In [32]:
y_pred2 = clf2.predict(X_test)
score2 = clf2.score(X_test, y_test)
print(score2)

0.9668161434977578


In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
clf3 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,3))),
    ('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean'))
])

In [35]:
clf4 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean'))
])

In [36]:
clf3.fit(X_train, y_train)

In [37]:
y_pred3 = clf3.predict(X_test)
score3 = clf3.score(X_test, y_test)
print(score3)

0.8816143497757848


In [38]:
clf4.fit(X_train, y_train)

In [39]:
y_pred4 = clf4.predict(X_test)
score4 = clf4.score(X_test, y_test)
print(score4)

0.8959641255605382


In [40]:
df['vector'] = df['processed_msg'].apply(lambda x: nlp(x).vector )

In [42]:
df.shape

(5572, 5)

In [43]:
df.head()

Unnamed: 0,Category,Message,spam,processed_msg,vector
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...,"[1.1192545, 0.980326, 0.26543233, -0.8769394, ..."
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni,"[-0.14939333, 1.0167166, 0.4778967, -1.6510634..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[-0.2184723, -2.4377646, 1.8605095, 0.9640945,..."
3,ham,U dun say so early hor... U c already then say...,0,U dun early hor U c,"[-3.6434948, 3.2535734, 4.5511975, -1.37153, 3..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think go usf live,"[0.04441598, 3.1517997, -3.422078, -0.27837402..."


In [44]:
df['vector'][0].shape

(300,)

In [45]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(df.vector.values, df.spam, test_size = 0.2, random_state = 22)

In [46]:
X_train1.shape

(4457,)

In [47]:
X_train_2d = np.stack(X_train1)

In [48]:
X_test_2d = np.stack(X_test1)

In [49]:
X_train1

array([array([-1.522218  , -3.823107  , -2.1704404 , -0.27528667, -0.29322332,
              -1.2841648 ,  2.2971082 ,  0.21488596,  0.20296057,  0.85559154,
               0.04110332, -1.969739  , -3.4464972 , -0.38096046, -1.2629168 ,
              -1.9378579 ,  2.6480806 ,  1.0139581 , -1.0237074 , -0.4054495 ,
              -0.8187585 , -0.14870428, -1.1310327 ,  2.8060417 ,  0.41730544,
               1.9699616 , -1.0538626 , -0.48041317, -0.39099902,  0.71153766,
               1.1770262 ,  1.85609   , -0.11356167, -1.9289862 ,  3.3372762 ,
               0.6743443 ,  2.7631428 ,  0.27461547, -1.1722784 ,  0.35798144,
              -0.6186242 ,  1.956289  , -2.0970578 ,  4.199608  , -0.14987756,
              -0.2617786 ,  0.38411146,  0.88764435,  1.8486418 , -0.67998934,
               0.8430682 , -1.10015   , -3.442134  , -0.8177819 , -2.6532435 ,
               1.5034018 , -1.7867792 ,  0.85583586,  0.1139581 , -1.9417485 ,
               2.3473337 ,  1.8807696 , -0.6466653 ,

In [50]:
X_train_2d

array([[-1.522218  , -3.823107  , -2.1704404 , ..., -1.6762636 ,
        -1.4322667 ,  2.1795278 ],
       [-1.3528188 , -1.8267708 , -2.094289  , ..., -1.542088  ,
        -0.19779992,  2.7439222 ],
       [ 0.40955624,  2.6260335 ,  0.8465111 , ..., -0.66098666,
         0.20266666,  0.79161114],
       ...,
       [ 0.26102787,  0.5486174 , -1.3060302 , ...,  0.627335  ,
        -1.2749283 ,  0.92560905],
       [-1.8604    , -1.1013    , -3.9087    , ..., -2.5081    ,
        -4.5749    , -1.1536    ],
       [ 0.04626051, -3.3620121 , -0.20317008, ..., -1.4716837 ,
        -1.8088847 ,  0.5984979 ]], dtype=float32)

In [51]:
from sklearn.preprocessing import MinMaxScaler

In [52]:
scaler = MinMaxScaler()

In [53]:
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

In [54]:
clf5 = MultinomialNB()

In [55]:
clf5.fit(scaled_train_embed, y_train)

In [56]:
y_pred5 = clf5.predict(scaled_test_embed)

In [57]:
score5 = clf5.score(scaled_test_embed, y_test)

In [58]:
print(score5)

0.8538116591928251


In [59]:
clf6 = KNeighborsClassifier(n_neighbors = 5, metric ='euclidean')

In [60]:
clf6.fit(X_train_2d, y_train)

In [61]:
y_pred6 = clf6.predict(X_test_2d)

In [62]:
score6 = clf6.score(X_test_2d, y_test)

In [63]:
print(score6)

0.8331838565022421


In [64]:
from sklearn.metrics import classification_report

In [66]:
print("CountVectorizer with MultinomialNB\n" , classification_report(y_pred1,y_test))

CountVectorizer with MultinomialNB
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       969
           1       0.88      0.99      0.93       146

    accuracy                           0.98      1115
   macro avg       0.94      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [68]:
print("CountVectorizer with KNeighborsClassifier\n" , classification_report(y_pred3,y_test))

CountVectorizer with KNeighborsClassifier
               precision    recall  f1-score   support

           0       1.00      0.88      0.94      1084
           1       0.19      1.00      0.32        31

    accuracy                           0.88      1115
   macro avg       0.60      0.94      0.63      1115
weighted avg       0.98      0.88      0.92      1115



In [69]:
print("TfidfVectorizer with KNeighborsClassifier\n" , classification_report(y_pred4,y_test))

TfidfVectorizer with KNeighborsClassifier
               precision    recall  f1-score   support

           0       1.00      0.89      0.94      1068
           1       0.29      1.00      0.45        47

    accuracy                           0.90      1115
   macro avg       0.64      0.95      0.70      1115
weighted avg       0.97      0.90      0.92      1115



In [70]:
print("Word Vectors with MultinomialNB\n" , classification_report(y_pred5,y_test))

Word Vectors with MultinomialNB
               precision    recall  f1-score   support

           0       1.00      0.85      0.92      1115
           1       0.00      0.00      0.00         0

    accuracy                           0.85      1115
   macro avg       0.50      0.43      0.46      1115
weighted avg       1.00      0.85      0.92      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
print("Tfidf with KNeighborsClassifier\n" , classification_report(y_pred1,y_test))

Tfidf with KNeighborsClassifier
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       969
           1       0.88      0.99      0.93       146

    accuracy                           0.98      1115
   macro avg       0.94      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



#### So, our best model from these is (CountVectorizer + MultinomialNB)