In [1]:
#importing the required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#loading the dataset
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("validation.csv")
test_df = pd.read_csv("test.csv")

#splitting the datframe into X and y
X_train = train_df['Message']
y_train = train_df['Label']
X_val = val_df['Message']
y_val = val_df['Label']
X_test = test_df['Message']
y_test = test_df['Label']

In [3]:
# get tf-idf values of training data

tfidf = TfidfVectorizer()

train_tfidf = tfidf.fit_transform(X_train)
train_tfidf

<3342x6674 sparse matrix of type '<class 'numpy.float64'>'
	with 44354 stored elements in Compressed Sparse Row format>

In [4]:
#get the tf-idf values of testing data
test_tfidf = tfidf.transform(X_test)
test_tfidf

<1115x6674 sparse matrix of type '<class 'numpy.float64'>'
	with 13927 stored elements in Compressed Sparse Row format>

In [5]:
#get the tf-idf values of validation data
val_tfidf = tfidf.transform(X_val)
val_tfidf

<1115x6674 sparse matrix of type '<class 'numpy.float64'>'
	with 13535 stored elements in Compressed Sparse Row format>

## Applying Naive Bayes

In [6]:
#use Naive Bayes classifier to fit the training data
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(train_tfidf, y_train)

#predict the results on test data
y_pred = naive_bayes_classifier.predict(test_tfidf)

#accuracy score of Naive Bayes
score1 = metrics.accuracy_score(y_test, y_pred)
print("The accuracy score of Naive Bayes is : ",score1)

#Confusion matrix of Naive Bayes

print(metrics.classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(test_df["Label"], y_pred))

The accuracy score of Naive Bayes is :  0.9587443946188341
              precision    recall  f1-score   support

         Ham       0.95      1.00      0.98       968
        Spam       1.00      0.69      0.81       147

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.90      1115
weighted avg       0.96      0.96      0.96      1115

[[968   0]
 [ 46 101]]


## Applying SVM

In [7]:
from sklearn import svm

#fitting SVM
clf = svm.SVC()
clf.fit(train_tfidf,y_train)
y_pred_svm = clf.predict(test_tfidf)

#accuracy score of SVM
score2 = metrics.accuracy_score(y_test, y_pred_svm)
print("The accuracy score of SVM is : ",score2)

#Confusion matrix of SVM

print(metrics.classification_report(y_test, y_pred_svm, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(test_df["Label"], y_pred_svm))

The accuracy score of SVM is :  0.9847533632286996
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       968
        Spam       1.00      0.88      0.94       147

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.99      0.98      0.98      1115

[[968   0]
 [ 17 130]]


#### Hyperparameter Tuning for SVM

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid={"C":[0.1, 1, 10, 100], "kernel":("linear", "rbf")}
grid= GridSearchCV(clf, param_grid, refit=True, verbose=3)
grid.fit(train_tfidf,y_train)
y_pred_grid = grid.predict(val_tfidf)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.897 total time=   0.2s
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.895 total time=   0.2s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.894 total time=   0.2s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.894 total time=   0.2s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.895 total time=   0.2s
[CV 1/5] END .................C=0.1, kernel=rbf;, score=0.857 total time=   0.4s
[CV 2/5] END .................C=0.1, kernel=rbf;, score=0.857 total time=   0.4s
[CV 3/5] END .................C=0.1, kernel=rbf;, score=0.858 total time=   0.4s
[CV 4/5] END .................C=0.1, kernel=rbf;, score=0.858 total time=   0.4s
[CV 5/5] END .................C=0.1, kernel=rbf;, score=0.858 total time=   0.4s
[CV 1/5] END ................C=1, kernel=linear;, score=0.966 total time=   0.2s
[CV 2/5] END ................C=1, kernel=linear;,

In [9]:
#accuracy score of SVM
score2 = metrics.accuracy_score(y_val, y_pred_grid)
print("The accuracy score of SVM is : ",score2)

#Confusion matrix of SVM

print(metrics.classification_report(y_val, y_pred_grid, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(val_df["Label"], y_pred_grid))

The accuracy score of SVM is :  0.9928251121076234
              precision    recall  f1-score   support

         Ham       0.99      1.00      1.00       992
        Spam       0.99      0.94      0.97       123

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

[[991   1]
 [  7 116]]


In [10]:
grid.best_params_

{'C': 10, 'kernel': 'linear'}

In [11]:
#fitting SVM after hyperparameter tuning
clf = svm.SVC(C=10, kernel ='linear')
clf.fit(train_tfidf,y_train)
y_pred_svm = clf.predict(test_tfidf)

#accuracy score of SVM
score2 = metrics.accuracy_score(y_test, y_pred_svm)
print("The accuracy score of SVM is : ",score2)

#Confusion matrix of SVM

print(metrics.classification_report(y_test, y_pred_svm, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(test_df["Label"], y_pred_svm))

The accuracy score of SVM is :  0.9910313901345291
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       968
        Spam       0.98      0.95      0.97       147

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

[[965   3]
 [  7 140]]


## Neural Network

In [12]:
#Importing the required libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [13]:
# Defining pre-processing hyperparameters
max_len = 50 
trunc_type = "post" 
padding_type = "post" 
oov_tok = "<OOV>" 
vocab_size = 500
embeding_dim = 16
n_dense = 24

#Tokenizing training data
tokenizer = Tokenizer(500)
tokenizer.fit_on_texts(X_train)

In [14]:
# Sequencing and padding on training and testing 
train_seq = tokenizer.texts_to_sequences(X_train)
train_pad = pad_sequences(train_seq, maxlen = max_len)
val_seq = tokenizer.texts_to_sequences(X_val)
val_pad = pad_sequences(val_seq, maxlen = max_len)
test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_seq, maxlen = max_len)

In [15]:
#Creating the neural network
model = Sequential()
model.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [16]:
# fitting the model to our data
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=30, validation_data=(val_pad, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [17]:
# Model performance on validation data 
model.evaluate(val_pad, y_val)



[0.05018720403313637, 0.9892376661300659]

#### Hyperparameter tuning for Neural Network

In [18]:
#Changing the loss function
# fitting the model on our data
model.compile(loss='binary_crossentropy',optimizer='SGD' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=30, validation_data=(val_pad, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [19]:
# Model performance on validation data 
model.evaluate(val_pad, y_val)



[0.05253727734088898, 0.9874439239501953]

In [20]:
# Changing the number of epochs

# fitting a dense spam detector model
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=50, validation_data=(val_pad, y_val))
# Model performance on validation data 
model.evaluate(val_pad, y_val)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.09667690098285675, 0.9847533702850342]

As we can see the best accuracy is observed for Adam and number of epochs 30, so we predict the model on the test data.

In [21]:
# fitting the model 
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=30, validation_data=(test_pad, y_test))

# Model performance on test data 
results = model.evaluate(test_pad, y_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [23]:
print("Neural Network gives us an accuracy of :", results[1])

Neural Network gives us an accuracy of : 0.9865471124649048
