## Text Preprocessing Steps

- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- expanding abbreviations
- removing stop words, sparse terms, and particular words
- applying lemmatization

In [146]:
import pandas as pd
import numpy as np

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# To ignore warning
import warnings
warnings.filterwarnings('ignore')

#expanding the dispay of text sms column
pd.set_option('display.max_colwidth', -1)

In [147]:
enronSpamSubset = pd.read_csv('enronSpamSubset.csv')
completeSpamAssassin = pd.read_csv('completeSpamAssassin.csv')
lingSpam = pd.read_csv('lingSpam.csv')

train = enronSpamSubset[['Body','Label']]
val = lingSpam[['Body','Label']]
test = completeSpamAssassin[['Body','Label']]

test.dropna(inplace=True)

train.drop_duplicates(keep='first',inplace=True)
val.drop_duplicates(keep='first',inplace=True)
test.drop_duplicates(keep='first',inplace=True)

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)


In [148]:
contractions_dict = { "ain't": "are not", "'s":" is", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
 "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", 
 "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
 "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
 "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "isn't": "is not", 
 "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "let's": "let us", "ma'am": "madam", 
 "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", 
 "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", 
 "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 
 "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "should've": "should have", 
 "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "that'd": "that would", "that'd've": "that would have", 
 "there'd": "there would", "there'd've": "there would have", "they'd": "they would", "they'd've": "they would have","they'll": "they will",
 "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", 
 "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
 "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what've": "what have", "when've": "when have", 
 "where'd": "where did", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who've": "who have", 
 "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", 
 "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", 
 "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
 "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def preprocessing(data,text_col,target_col):
    dataframe = pd.DataFrame(columns=[text_col,target_col])
    data_size = data.shape[0]

    lemmatizer = WordNetLemmatizer()
    en = spacy.load('en_core_web_sm')
    sw_spacy = en.Defaults.stop_words
    
    contractions_re = re.compile('(%s)'%'|'.join(contractions_dict.keys()))
    def expand_contractions(s, contractions_dict=contractions_dict):
        def replace(match):
            return contractions_dict[match.group(0)]
        return contractions_re.sub(replace, s)
    
    for i in range(data_size):
        sent = data[text_col][i].lower()
        sent = expand_contractions(sent)
        sent = re.sub("http://\S+|https://\S+,http[s]?://\S+,http\S+"," ",sent)
        sent = re.sub("b[w-]+?@w+?.w{2,4}b", " ",sent)
        sent = re.sub("[^a-zA-Z]"," ",sent)
        sent = word_tokenize(sent)
        sent = [w for w in sent if not w in sw_spacy and len(w)>3]
        dataframe.loc[i] = [[lemmatizer.lemmatize(word) for word in sent],data[target_col][i]]
        
    return dataframe

In [149]:
df_train = preprocessing(train,'Body','Label')
df_val = preprocessing(val,'Body','Label')
df_test = preprocessing(test,'Body','Label')

In [178]:
len_train = []
for i in range(df_train.shape[0]):
    len_train.append(len(df_train['Body'][i]))
len_val = []
for i in range(df_val.shape[0]):
    len_val.append(len(df_val['Body'][i]))
len_test = []
for i in range(df_test.shape[0]):
    len_test.append(len(df_test['Body'][i]))

In [213]:
import plotly.express as ex
ex.box(len_train)

In [250]:
df_comp = df_train.append([df_test,df_val])
df_comp = df_comp.reset_index(drop=True)

# Word2vec

Word2Vec is a neural network model, that provides a numerical vector representation for a given word. This numerical vector is often called as "Word Embedding". Despite other methods of Bag of Words and TF-IDF, Word2Vec takes the context of the word into consideration while converting a word to a numerical vector

In [252]:
from gensim.models import Word2Vec

# Training Model with custom data
model = Word2Vec(sentences=df_comp.Body, vector_size=200,workers=-1)
vocab=list(model.wv.key_to_index.keys())

In [303]:
def avg_w2vec(sentences):
    transformed=[]
    for i in range(sentences.shape[0]):
        count=0
        vector=np.zeros(200)
        for word in sentences[i]:
            if word in vocab:
                a = model.wv.get_vector(word)
                vector+=a
                count+=1
        if count!=0:
            vector/=count
            transformed.append(vector)
        else:
            print(i)
    return np.array(transformed)

In [304]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(df_comp['Body'],df_comp['Label'],random_state=142,test_size = 0.3)

In [305]:
X_train_transformed=avg_w2vec(X_train.reset_index(drop=True))
X_test_transformed=avg_w2vec(X_test.reset_index(drop=True))

3613
4186
9089
9294
9978
10502
11807
12189


In [306]:
y_train = y_train.reset_index(drop=True)
y_train.drop([3613,4183,9089,9294,9978,10502,11807,12189],inplace=True)

In [307]:
from sklearn.naive_bayes import GaussianNB
gnb_model = GaussianNB()
gnb_model.fit(X_train_transformed,y_train)
print(gnb_model.score(X_train_transformed,y_train))


0.8011553168985437


In [309]:
gy_pred = gnb_model.predict(X_test_transformed)

print('Gaussian NB model:')
print('Accuracy Score :',accuracy_score(y_test,gy_pred))
print('Precision Score :',precision_score(y_test,gy_pred))
print('Confusion Matrix: \n',confusion_matrix(gy_pred,y_test))
print('Classification Report: \n',classification_report(y_test,gy_pred))

Gaussian NB model:
Accuracy Score : 0.7943854324734446
Precision Score : 0.7554112554112554
Confusion Matrix: 
 [[2792  632]
 [ 452 1396]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      3244
           1       0.76      0.69      0.72      2028

    accuracy                           0.79      5272
   macro avg       0.79      0.77      0.78      5272
weighted avg       0.79      0.79      0.79      5272



In [310]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [312]:
grid_params = { 'n_neighbors' : [10,20,30,40,50,60],
               'metric' : ['manhattan']}
knn=KNeighborsClassifier()
clf = RandomizedSearchCV(knn, grid_params, random_state=0,n_jobs=-1,verbose=1)
clf.fit(X_train_transformed,y_train)
clf.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'n_neighbors': 10, 'metric': 'manhattan'}

In [316]:
knn_model=KNeighborsClassifier(n_neighbors= 10, metric= 'manhattan')
knn_model.fit(X_train_transformed,y_train)
knn_pred = knn_model.predict(X_test_transformed)

print('KNN model:')
print('Accuracy Score :',accuracy_score(y_test,gy_pred))
print('Precision Score :',precision_score(y_test,gy_pred))
print('Confusion Matrix: \n',confusion_matrix(gy_pred,y_test))
print('Classification Report: \n',classification_report(y_test,gy_pred))

KNN model:
Accuracy Score : 0.7943854324734446
Precision Score : 0.7554112554112554
Confusion Matrix: 
 [[2792  632]
 [ 452 1396]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      3244
           1       0.76      0.69      0.72      2028

    accuracy                           0.79      5272
   macro avg       0.79      0.77      0.78      5272
weighted avg       0.79      0.79      0.79      5272



In [205]:
tfidf = TfidfVectorizer(max_features = 4000)

X_train = tfidf.fit_transform([' '.join(text) for text in df_train['Body']]).toarray()
y_train = df_train['Label']

X_val = tfidf.fit_transform([' '.join(text) for text in df_val['Body']]).toarray()
y_val = df_val['Label']

X_test = tfidf.fit_transform([' '.join(text) for text in df_test['Body']]).toarray()
y_test = df_test['Label']

In [206]:
vectorizer = CountVectorizer(max_features=4000)

X_train = vectorizer.fit_transform([' '.join(text) for text in df_train['Body']]).toarray()
y_train = df_train['Label']

X_val = vectorizer.fit_transform([' '.join(text) for text in df_val['Body']]).toarray()
y_val = df_val['Label']

X_test = vectorizer.fit_transform([' '.join(text) for text in df_test['Body']]).toarray()
y_test = df_test['Label']

In [207]:
from sklearn.naive_bayes import GaussianNB
gnb_model = GaussianNB()
gnb_model.fit(X_train,y_train)
print(gnb_model.score(X_train,y_train))


from sklearn.naive_bayes import MultinomialNB
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)
print(mnb_model.score(X_train,y_train))

0.9393000929080211
0.9708888200681326


In [208]:
gy_pred = gnb_model.predict(X_val)

print('Gaussian NB model:')
print('Accuracy Score :',accuracy_score(y_val,gy_pred))
print('Precision Score :',precision_score(y_val,gy_pred))
print('Confusion Matrix: \n',confusion_matrix(gy_pred,y_val))
print('Classification Report: \n',classification_report(y_val,gy_pred))

Gaussian NB model:
Accuracy Score : 0.5059822462369742
Precision Score : 0.19545131485429992
Confusion Matrix: 
 [[1036  148]
 [1132  275]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.48      0.62      2168
           1       0.20      0.65      0.30       423

    accuracy                           0.51      2591
   macro avg       0.54      0.56      0.46      2591
weighted avg       0.76      0.51      0.57      2591



In [209]:
my_pred = mnb_model.predict(X_val)

print('Multinominal NB model:')
print('Accuracy Score :',accuracy_score(y_val,my_pred))
print('Precision Score :',precision_score(y_val,my_pred))
print('Confusion Matrix: \n',confusion_matrix(my_pred,y_val))
print('Classification Report: \n',classification_report(y_val,my_pred))


Multinominal NB model:
Accuracy Score : 0.6703975299112311
Precision Score : 0.2700106723585913
Confusion Matrix: 
 [[1484  170]
 [ 684  253]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.68      0.78      2168
           1       0.27      0.60      0.37       423

    accuracy                           0.67      2591
   macro avg       0.58      0.64      0.57      2591
weighted avg       0.79      0.67      0.71      2591



In [210]:
gy_pred1 = gnb_model.predict(X_test)

print('Accuracy Score :',accuracy_score(y_test,gy_pred1))
print('Precision Score :',precision_score(y_test,gy_pred1))
print('Confusion Matrix: \n',confusion_matrix(gy_pred1,y_test))
print('Classification Report: \n',classification_report(y_test,gy_pred1))

Accuracy Score : 0.39731721141129794
Precision Score : 0.20755326016785022
Confusion Matrix: 
 [[1460  735]
 [2455  643]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.37      0.48      3915
           1       0.21      0.47      0.29      1378

    accuracy                           0.40      5293
   macro avg       0.44      0.42      0.38      5293
weighted avg       0.55      0.40      0.43      5293



In [211]:
my_pred1 = mnb_model.predict(X_test)

print('Accuracy Score :',accuracy_score(y_test,my_pred1))
print('Precision Score :',precision_score(y_test,my_pred1))
print('Confusion Matrix: \n',confusion_matrix(my_pred1,y_test))
print('Classification Report: \n',classification_report(y_test,my_pred1))

Accuracy Score : 0.447194407708294
Precision Score : 0.18510984540276648
Confusion Matrix: 
 [[1912  923]
 [2003  455]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.49      0.57      3915
           1       0.19      0.33      0.24      1378

    accuracy                           0.45      5293
   macro avg       0.43      0.41      0.40      5293
weighted avg       0.55      0.45      0.48      5293

