# NLP - Spam Detection

## Importing the Modules

In [144]:
import pandas as pd
import numpy as np

## Importing the Data

In [None]:
pd.options.display.max_colwidth=500
df=pd.read_table('./SMSSpamCollection',header=None,names=['class','message'])
df

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
...,...,...
5567,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other suggestions?"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free


In [115]:
print(f'Columns:{df.columns}')
print(f'Number of samples:{df.shape}')

Columns:Index(['class', 'message'], dtype='object')
Number of samples:(5572, 2)


In [102]:
df[df['class']=="spam"].shape

(747, 2)

perform frequency count on the variable 'class' .
using normalize=True, displays the numbers as percentage

In [116]:
df['class'].value_counts()

class
ham     4825
spam     747
Name: count, dtype: int64

## Partitioning the Data

In [104]:
from sklearn.model_selection import train_test_split

In [117]:
train,test=train_test_split(df,train_size=0.7,random_state=41,stratify=df['class'])
train

Unnamed: 0,class,message
5383,ham,Good day to You too.Pray for me.Remove the teeth as its painful maintaining other stuff.
4686,ham,"Alright we'll bring it to you, see you in like &lt;#&gt; mins"
2919,ham,Thanks chikku..:-) gud nyt:-*
3055,ham,What happened to our yo date?
5336,ham,Sounds better than my evening im just doing my costume. Im not sure what time i finish tomorrow but i will txt you at the end.
...,...,...
761,spam,"Romantic Paris. 2 nights, 2 flights from £79 Book now 4 next year. Call 08704439680Ts&Cs apply."
1597,spam,As a Registered Subscriber yr draw 4 a £100 gift voucher will b entered on receipt of a correct ans. When are the next olympics. Txt ans to 80062
4676,spam,"Hi babe its Chloe, how r u? I was smashed on saturday night, it was great! How was your weekend? U been missing me? SP visionsms.com Text stop to stop 150p/text"
1895,spam,"FreeMsg Hey U, i just got 1 of these video/pic fones, reply WILD to this txt & ill send U my pics, hurry up Im so bored at work xxx (18 150p/rcvd STOP2stop)"


In [118]:
print("Train Set\n",train['class'].value_counts(normalize=True))
print("Test Set\n",train['class'].value_counts(normalize=True))


Train Set
 class
ham     0.865897
spam    0.134103
Name: proportion, dtype: float64
Test Set
 class
ham     0.865897
spam    0.134103
Name: proportion, dtype: float64


count the number of unique words in the train corpus/set

In [119]:
words=[]
for text in train['message']:
    words.extend(text.split())
print("Total number of words:",len(words))
print("Total number of unique words:",len(set(words)))
words[:10]

Total number of words: 60690
Total number of unique words: 12737


['Good',
 'day',
 'to',
 'You',
 'too.Pray',
 'for',
 'me.Remove',
 'the',
 'teeth',
 'as']

## Building the document–term matrix for training
NLP Preprocessing

import the module used to handle stop words

In [121]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [125]:
print(f'ENGLISH_STOP_WORDS is of type {type(ENGLISH_STOP_WORDS)} and of length {len(ENGLISH_STOP_WORDS)}')

ENGLISH_STOP_WORDS is of type <class 'frozenset'> and of length 318


import the module used to build the document/term matrix

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

build the word vector present in the corpus

In [126]:
parser=CountVectorizer(binary=True, stop_words=list(ENGLISH_STOP_WORDS),min_df=10 )

the min_df value defines the minimum frequency required for a word to be considered.<br>
stop_words allows you to ignore many common words, called stop words.<br>
binary outputs 1 if the word appears at least once, and 0 otherwise

build the document/term matrix on the training sample using the fit_transform function

In [131]:
XTrain=parser.fit_transform(train['message'])

In [133]:
XTrain.shape

(3900, 619)

In [135]:
XTrain[0,500:550].toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])

display the words contained in the parser vector

In [141]:
parser.get_feature_names_out()

array(['000', '04', '0800', '08000930705', '10', '100', '1000', '10p',
       '11', '150', '150p', '150ppm', '16', '18', '1st', '200', '2000',
       '2003', '250', '2lands', '2nd', '50', '500', '5000', '750', '800',
       '8007', '86688', '87066', 'able', 'abt', 'account', 'actually',
       'address', 'aft', 'afternoon', 'ah', 'aight', 'alright', 'amp',
       'answer', 'anytime', 'apply', 'ard', 'area', 'ask', 'asked', 'ass',
       'attempt', 'await', 'award', 'awarded', 'away', 'awesome', 'b4',
       'babe', 'baby', 'bad', 'balance', 'beautiful', 'bed', 'believe',
       'best', 'better', 'big', 'birthday', 'bit', 'bonus', 'book',
       'bored', 'bout', 'box', 'boy', 'boytoy', 'break', 'bring',
       'brother', 'bslvyl', 'bt', 'bus', 'busy', 'buy', 'called',
       'calling', 'calls', 'camcorder', 'came', 'camera', 'car', 'card',
       'care', 'carlos', 'case', 'cash', 'cause', 'chance', 'change',
       'charge', 'charged', 'chat', 'check', 'chikku', 'choose', 'claim',
     

display the number of words contained in the parser vector

In [None]:
len(parser.get_feature_names_out())

numpy.ndarray

transform document/term matrix into a numpy matrix

In [147]:
mdtTrain=XTrain.toarray()
mdtTrain

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(3900, 619))

In [148]:
print(type(mdtTrain))
print(mdtTrain.shape)
print(mdtTrain)

<class 'numpy.ndarray'>
(3900, 619)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


count the number of occurrences of each word in the corpus and store it

In [154]:
freq_per_word=mdtTrain.sum(axis=0)
print(freq_per_word)
print(len(freq_per_word))

[ 19  10  11  11  24  31  23  13  12  12  52  27  41  34  28  10  16  11
  15  10  20  42  31  14  12  18  11  15  10  18  18  27  29  17  14  19
  25  22  18  58  12  10  21  16  11  57  20  10  14  12  16  29  20  16
  11  48  19  19  10  17  16  17  33  27  22  21  25  14  18  13  14  25
  19  11  13  18  15  11  24  21  16  41  21  17  18  10  17  22  29  13
  49  14  12  57  12  32  14  10  10  38  28  12  13  70  28  14  13  22
  24  15  11  13  52 158  14  34  11  10  15  12  40  11  10  31  10  55
  21  19  12  13  34  12  38  95  20  11  10  24  11  10 136  25  72  10
  15  15  24  20  81  34  23  18  18  10  14  12  59 101  95  13  29  11
  13  15  19  12  16  37  23  12  23  23  25  12  27  31  10  13  15  23
  13  15  14  11  14  39  16  11  30  23  14  11  20  10  13  16 163  11
  12  12  36  39  15  13  12  11  16  19  14  11  10  13  37  19  22  11
  24  16  14 102  14  44 145  10 163  17  69  10 165  32  38  26  20  27
  19  30  15  27  12  13  51  10  15  17  15  25  1

return a vector with the indexes of the words sorted in ascending order by their number of occurrences

In [195]:
indicies=  np.argsort(freq_per_word)
indicies

array([  1, 617,  85,  41,  97, 371,  98, 337, 320, 295, 361, 358, 404,
       394, 176, 400, 503, 504, 507, 288, 279, 278, 277, 276, 136, 139,
       153, 143, 488, 117, 124, 122, 552, 223, 227, 241, 210, 250, 561,
       550, 589, 584,  28, 557, 357,  15, 562,  19, 449,  47, 443,  58,
       257, 482, 248, 438, 193, 457, 399, 492, 116, 138, 110, 121, 269,
       255,  44, 191, 256, 215, 161, 135, 509, 510, 262, 500, 453,  73,
        77,   3, 349, 333, 532, 316, 321, 423, 205,  54, 187, 468, 197,
       590, 593,  26,  17, 209, 183, 581, 580, 610,   2, 323, 318, 305,
       326, 267, 396, 266, 128, 607, 381, 378, 345, 416, 606,  40, 101,
       501, 285, 271, 391, 169, 173, 390, 131, 165, 155, 119, 420, 300,
       302, 360, 364, 296, 375, 386,  92,  94, 377, 464, 465,  24, 558,
         8,   9, 460, 431,  49, 238, 258, 204, 199, 485, 467, 198, 264,
       111, 106, 102, 129, 203, 478, 474, 587,  69,   7, 435, 451, 211,
       239, 162, 194, 473, 177, 454, 180, 487, 159, 541, 366,  7

display the words sorted by their number of occurrences in the document, and store them

In [165]:
new_df={ 'term': np.asarray(parser.get_feature_names_out())[indicies], 'freq':freq_per_word[indicies]}
print(pd.DataFrame(new_df))

          term  freq
0           04    10
1           yr    10
2    camcorder    10
3      anytime    10
4       charge    10
..         ...   ...
614       know   176
615         ll   177
616         ok   197
617         ur   232
618       just   248

[619 rows x 2 columns]


## Logistic Regression

### Constructing the Model

In [166]:
from sklearn.linear_model import LogisticRegression

In [168]:
first_model=LogisticRegression()

In [170]:
first_model.fit(mdtTrain,train['class'])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


display the coefficients of each token in the model and the intercept (constant)

In [171]:
print(first_model.coef_.shape)
print(first_model.coef_)
print(first_model.intercept_)

(1, 619)
[[ 3.82837011e-01  2.73826915e-01  1.85065946e+00  1.34645941e-01
   1.74884746e-01  1.31905693e+00  9.89882090e-01  9.21307159e-01
   2.36732696e-01  1.56515889e-01  1.81841007e+00  5.46046131e-01
   5.49572241e-01  1.24209249e+00  5.94620687e-01  5.58864245e-01
   2.30567771e-01  8.07058630e-01  4.81577967e-01  2.44408812e-01
   1.29284882e-01  1.97998805e+00  7.53759468e-01  5.12871335e-01
   8.48836226e-01  8.34263903e-01  5.35134483e-01  3.90631467e-01
   3.70619556e-02 -3.21891108e-01 -2.67976540e-01  6.70358846e-01
  -4.73419403e-01 -3.48538996e-01 -1.21093764e-01 -1.42430912e-01
  -2.96854942e-01 -3.91239409e-01 -3.12095468e-01 -8.14351524e-01
   2.10004045e-01  6.54308743e-02  1.04469149e+00 -3.29778540e-01
   5.21050399e-01 -6.44852709e-01  2.19266332e-01 -1.88383795e-01
   3.43326510e-01  2.20349170e-01  1.31362214e+00  1.20874877e+00
   9.68126959e-02 -3.10877802e-01  3.35127661e-01 -1.09150045e-01
  -1.17267720e-01  2.35527007e-01  5.81567651e-01 -1.91766531e-01
 

### Evaluating the Model

In [201]:
mdtTest=parser.transform(test['message'])

display the dimensions of the document/term matrix

In [173]:
mdtTest.shape

(1672, 619)

deploy the model on the test sample

In [177]:
predicted = first_model.predict(mdtTest)
predicted

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'ham'],
      shape=(1672,), dtype=object)

In [178]:
first_model.classes_

array(['ham', 'spam'], dtype=object)

calculate the confusion matrix

In [185]:
from sklearn.metrics import confusion_matrix
print(pd.crosstab(index=test['class'],columns=predicted))

cm=pd.DataFrame(confusion_matrix(test['class'],predicted),
                columns=['ham','spam'],index=['ham','spam'])
cm

col_0   ham  spam
class            
ham    1444     4
spam     26   198


Unnamed: 0,ham,spam
ham,1444,4
spam,26,198


import the methods used to display the model evaluation metrics

In [186]:
from sklearn.metrics import classification_report
print(classification_report(test['class'],predicted))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       0.98      0.88      0.93       224

    accuracy                           0.98      1672
   macro avg       0.98      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



### Model improvement without unnecessary variables

display only the coefficients of each token

In [202]:
'''
The coeffient shows how much the term contributes to predicitng spam or ham
Large positive or negative coefficients --> strong influence
Coefficients close to 0 -> little or no influence
We calculate the absolute to focus on the magnitude of importance
'''

coef_abs=np.abs(first_model.coef_[0,:])
print(len(coef_abs))
coef_abs

619


array([3.82837011e-01, 2.73826915e-01, 1.85065946e+00, 1.34645941e-01,
       1.74884746e-01, 1.31905693e+00, 9.89882090e-01, 9.21307159e-01,
       2.36732696e-01, 1.56515889e-01, 1.81841007e+00, 5.46046131e-01,
       5.49572241e-01, 1.24209249e+00, 5.94620687e-01, 5.58864245e-01,
       2.30567771e-01, 8.07058630e-01, 4.81577967e-01, 2.44408812e-01,
       1.29284882e-01, 1.97998805e+00, 7.53759468e-01, 5.12871335e-01,
       8.48836226e-01, 8.34263903e-01, 5.35134483e-01, 3.90631467e-01,
       3.70619556e-02, 3.21891108e-01, 2.67976540e-01, 6.70358846e-01,
       4.73419403e-01, 3.48538996e-01, 1.21093764e-01, 1.42430912e-01,
       2.96854942e-01, 3.91239409e-01, 3.12095468e-01, 8.14351524e-01,
       2.10004045e-01, 6.54308743e-02, 1.04469149e+00, 3.29778540e-01,
       5.21050399e-01, 6.44852709e-01, 2.19266332e-01, 1.88383795e-01,
       3.43326510e-01, 2.20349170e-01, 1.31362214e+00, 1.20874877e+00,
       9.68126959e-02, 3.10877802e-01, 3.35127661e-01, 1.09150045e-01,
      

display certain percentiles to check whether most variables have a coefficient close to 0

In [191]:
thresholds=np.percentile(coef_abs,[25,50,75])
thresholds

array([0.1670313 , 0.28553195, 0.56944247])

remove the 25% of tokens that have the lowest scores

In [207]:

index= np.where(coef_abs > thresholds[0])

In [208]:
index

(array([  0,   1,   2,   4,   5,   6,   7,   8,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,  29,
         30,  31,  32,  33,  36,  37,  38,  39,  40,  42,  43,  44,  45,
         46,  47,  48,  49,  50,  51,  53,  54,  57,  58,  59,  60,  62,
         63,  64,  66,  67,  68,  69,  70,  71,  72,  75,  76,  78,  79,
         80,  81,  82,  84,  87,  88,  91,  93,  94,  95,  96,  97,  98,
         99, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113,
        114, 115, 118, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 137, 139, 140, 141, 142, 143, 145, 146,
        147, 148, 149, 150, 151, 152, 154, 156, 157, 159, 160, 163, 164,
        165, 166, 167, 168, 170, 171, 172, 173, 174, 175, 178, 180, 181,
        182, 183, 185, 187, 188, 189, 190, 191, 192, 195, 196, 197, 198,
        199, 201, 202, 203, 205, 206, 207, 209, 211, 212, 213, 214, 215,
        216, 217, 219, 220, 221, 224, 226, 228, 229

display the number of tokens that will be excluded from the document/term matrix of the training sample

In [209]:
len(index[0])

464

duplicate the two document/term matrices without the unnecessary variables

In [210]:
mdtTrainTer=mdtTrain[:,index[0]]
mdtTestTer=mdtTest[:,index[0]]
mdtTrainTer.shape

(3900, 464)

Build a new model without the unnecessary variables

In [212]:
second_model=LogisticRegression()
second_model.fit(mdtTrainTer,train['class'])
predicted_2=second_model.predict(mdtTestTer)

calculate the new metrics and compare them with the initial model

In [213]:
from sklearn import metrics

display the metrics of the retained variables

In [229]:
cm_2=metrics.confusion_matrix(test['class'],predicted_2)
print(cm_2)
print("Recall:",metrics.recall_score(test['class'],predicted_2,pos_label='spam'))
print("Precision:",metrics.precision_score(test['class'],predicted_2,pos_label='spam'))
print("F1:",metrics.f1_score(test['class'],predicted_2,pos_label='spam'))
print("Accuracy:",metrics.accuracy_score(test['class'],predicted_2))

[[1444    4]
 [  25  199]]
Recall: 0.8883928571428571
Precision: 0.9802955665024631
F1: 0.9320843091334895
Accuracy: 0.9826555023923444


display the coefficient of the selected variables

In [242]:
print(second_model.coef_)
print(second_model.intercept_)


[[ 0.38417403  0.27576987  1.83057222  0.17635274  1.32448607  1.01116564
   0.92513513  0.25078501  1.81202519  0.55262682  0.55261579  1.25191372
   0.60972015  0.55521198  0.2258688   0.81114704  0.4930443   0.24326191
   2.02944037  0.76082967  0.5366317   0.86457625  0.84603668  0.55321649
   0.39540756 -0.32547693 -0.26932056  0.67831905 -0.47789803 -0.34448041
  -0.31273072 -0.38970666 -0.32202888 -0.82601741  0.20911169  1.04940465
  -0.33012581  0.54454499 -0.63678848  0.23019592 -0.18915056  0.36559985
   0.22363813  1.32508746  1.22302373 -0.2998409   0.36126101  0.22981043
   0.55165886 -0.21900972 -0.23774328 -0.33754164  0.23572093  0.15568052
  -0.32802083  0.47578735  1.03157876 -0.26308758  0.34016919  0.6127356
  -0.26555474 -0.21134262 -0.2817419  -0.21735404 -0.25532066 -0.21121101
  -0.38617251 -0.22858792  1.04178182  1.07771605 -0.34375351 -0.1835383
   0.79611316 -0.19885251  0.43790226 -0.27789882  0.24607529  0.86974475
   1.66333284 -0.21841082  0.62191953  1

In [None]:
selected_terms=np.asarray(parser.get_feature_names_out())[index[0]]

in increasing order, a list of the indices of the coefficients

In [243]:
sorted_indicies=np.argsort(second_model.coef_[0,:])
sorted_indicies

array([243, 176, 285, 199, 184, 233, 238, 430,  33, 246, 220, 125, 106,
       198, 157, 386, 355, 284, 374, 174, 456,  90,  38, 124, 268, 299,
       240, 173, 209, 334, 178, 234, 164, 304, 450, 347, 362, 216, 113,
       322, 385, 427, 384, 459, 119,  28, 175, 463,  98, 278, 338, 429,
       272, 143, 206, 448, 235, 145, 357, 192, 315, 396, 212, 376, 424,
       327, 331, 449,  31,  88, 228,  66, 325, 311, 263, 152, 393, 169,
       202, 208, 108, 389, 147, 301, 158,  29,  70, 148,  51, 390, 242,
       428, 297,  36,  54,  25, 130, 117, 436,  32, 187, 201, 120, 134,
       461, 179, 223,  30, 181, 255, 400, 294, 244, 383, 167,  45, 144,
        96, 341, 363,  92, 435,  82, 180, 253, 439, 286,  62,  75, 420,
       356, 335, 305, 365, 254,  26, 172, 186, 132, 394, 422,  60,  57,
       397, 227, 224, 109,  64, 359, 136, 339, 214, 342, 387, 317, 226,
       149, 116, 293, 446, 378, 111, 445, 295, 267, 388, 151, 185,  50,
       337, 371, 375, 121, 409, 366, 391, 219, 107, 100,  67, 35

display the words sorted by their number of occurrences in the document, and store them

In [246]:
df_2=pd.DataFrame({'term':np.asarray(selected_terms)[sorted_indicies],'coef':second_model.coef_[0,:][sorted_indicies]})
df_2

Unnamed: 0,term,coef
0,lt,-1.191726
1,gt,-1.177067
2,ok,-1.095221
3,hope,-0.988663
4,happy,-0.942935
...,...,...
459,ringtone,2.003019
460,50,2.029440
461,new,2.101163
462,service,2.144184


### Test the Model

test the model on an arbitrary sentence

In [247]:
doc=['SIX chances to win CASH! from 100 to 20,000 pounds']

use the model’s parser to build the document/term matrix

In [254]:
term_matrix=parser.transform(doc)
term_array=term_matrix.toarray()
term_array.shape

(1, 619)

keep only the terms that are present in the model

In [259]:
term=term_array[:,index[0]]
term.shape

(1, 464)

use the predict function to deploy the model

In [261]:
y_hat=second_model.predict(term)
y_hat

array(['spam'], dtype=object)

display the probabilities of belonging to each class

In [262]:
predicted_proba=second_model.predict_proba(term)
print(predicted_proba)

[[0.38105665 0.61894335]]
