<a href="https://colab.research.google.com/github/SameerR007/Spam_Classifier_ANN/blob/main/Spam_Classifier_ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Spam Classifier using an Artificial Neural Network

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Loading the data from a CSV file into a pandas DataFrame

In [None]:
# Read the mail dataset CSV into a DataFrame; this raw frame is the input to every later step.
raw_mail_data = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it with
# its rich table display instead of the plain-text print() form.
raw_mail_data

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


# Data Preprocessing

In [None]:
# Count the missing entries in each column (isna is the same check as isnull)
raw_mail_data.isna().sum()

Category    0
Message     0
dtype: int64

1. There are no null values in the dataset.

In [None]:
# Work on an independent copy: the label-encoding cell below writes into
# mail_data in place, and a plain assignment would only alias raw_mail_data,
# so the raw frame would be modified as well.
mail_data = raw_mail_data.copy()

In [None]:
# Preview the top five records of the working dataframe
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Dimensions of the working dataframe as a (rows, columns) tuple
mail_data.shape

(5572, 2)

1. The dataset has 5572 records, each with two columns: Category and Message.

In [None]:
# Encode the target column in place: spam -> 0, ham -> 1
for category_name, category_code in (('spam', 0), ('ham', 1)):
    is_category = mail_data['Category'] == category_name
    mail_data.loc[is_category, 'Category'] = category_code

In [None]:
# Features are the message texts; targets are the encoded categories
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Show the texts and then the labels, one after the other
print(X, Y, sep="\n")

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Splitting the data into training and test sets

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=3,
    test_size=0.2,
)

In [None]:
# Sizes of the full, training and test text collections, one per line
print(X.shape, X_train.shape, X_test.shape, sep="\n")

(5572,)
(4457,)
(1115,)


In [None]:
# Cast the full, training and test texts to the pandas "string" dtype
docs, docs_train, docs_test = (
    texts.astype("string") for texts in (X, X_train, X_test)
)

In [None]:
from keras.preprocessing.text import Tokenizer

# Tokenizer with no cap on vocabulary size (num_words=None is the default);
# it is fitted on the training texts in the next cell.
tokenizer = Tokenizer(num_words=None)

In [None]:
# Build the word index from the training messages only
tokenizer.fit_on_texts(texts=docs_train)

In [None]:
# Preview the first 20 (word, index) pairs instead of dumping the whole
# vocabulary, which runs to thousands of entries, into the cell output.
list(tokenizer.word_index.items())[:20]

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'my': 10,
 'me': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'have': 16,
 'call': 17,
 '2': 18,
 'on': 19,
 'that': 20,
 'now': 21,
 'are': 22,
 'so': 23,
 'not': 24,
 'but': 25,
 'do': 26,
 'or': 27,
 'can': 28,
 'at': 29,
 'will': 30,
 'if': 31,
 'be': 32,
 'get': 33,
 'ur': 34,
 "i'm": 35,
 'with': 36,
 'no': 37,
 'just': 38,
 'we': 39,
 'this': 40,
 '4': 41,
 'gt': 42,
 'lt': 43,
 'up': 44,
 'when': 45,
 'how': 46,
 'all': 47,
 'from': 48,
 'free': 49,
 'out': 50,
 'go': 51,
 'ok': 52,
 'know': 53,
 'what': 54,
 'good': 55,
 'like': 56,
 'was': 57,
 'got': 58,
 'time': 59,
 'then': 60,
 'am': 61,
 'day': 62,
 'come': 63,
 'its': 64,
 'love': 65,
 'only': 66,
 'send': 67,
 'there': 68,
 'text': 69,
 'one': 70,
 'want': 71,
 'by': 72,
 'as': 73,
 'going': 74,
 'ü': 75,
 'home': 76,
 'he': 77,
 'about': 78,
 'txt': 79,
 'need': 80,
 'sorry': 81,
 "i'll": 82,
 'stop': 83,
 'r': 84,
 'see'

In [None]:
# Number of distinct words the tokenizer learned from the training corpus
train_word_index = tokenizer.word_index
len(train_word_index)

7951

In [None]:
# Replace each message with the list of integer ids of its words,
# using the word index fitted on the training texts
sequences_train, sequences_test = map(
    tokenizer.texts_to_sequences, (docs_train, docs_test)
)

In [None]:
# Show only the first few encoded messages; displaying the full list writes
# thousands of token lists into the cell output.
sequences_train[:5]

[[746, 53, 1, 1688, 127, 138, 1123],
 [26,
  3,
  53,
  156,
  303,
  1453,
  1454,
  403,
  13,
  1455,
  23,
  20,
  70,
  132,
  9,
  461,
  12,
  3,
  576,
  163,
  2027,
  391,
  2028,
  72,
  1263,
  13,
  747,
  36,
  65],
 [3861, 550, 6, 318, 50, 1124],
 [290, 35, 49],
 [3862,
  3863,
  26,
  3,
  71,
  4,
  103,
  192,
  2029,
  577,
  109,
  2601,
  36,
  1689,
  49,
  421,
  2,
  105,
  104,
  698,
  49,
  422,
  49,
  1012,
  98,
  27,
  17,
  926],
 [1,
  93,
  53,
  6,
  7,
  6,
  93,
  53,
  11,
  67,
  225,
  2,
  663,
  21,
  7,
  1264,
  171,
  632,
  240,
  66,
  183,
  127,
  1013,
  927,
  748,
  749,
  602,
  862,
  1125,
  285,
  521,
  27,
  199],
 [2030, 1, 166, 105, 928, 36, 2031, 50, 5, 392],
 [121, 60, 13, 109, 2602, 11, 25, 14, 3864],
 [158, 251, 2603, 11, 404, 6, 223, 405, 141, 633],
 [121, 121, 2604, 364, 863, 3865, 19, 406, 87, 634, 308],
 [37, 332, 36, 5, 2605, 1, 407, 26, 14, 140, 462, 25, 1, 88, 53, 209, 393],
 [3, 264, 351, 54, 3, 154, 635, 1, 33, 20

In [None]:
import numpy as np
import statistics

# Median length of the training messages, where len() counts characters per message.
# NOTE(review): this value is later used as the padded length in tokens —
# confirm that a character-based median is what is intended.
train_message_lengths = docs_train.apply(len)
statistics.median(train_message_lengths.values)

61

In [None]:
from keras.utils import pad_sequences

# Bring every encoded message to exactly 61 ids, padding at the end
pad_options = dict(padding='post', maxlen=61)
sequences_train = pad_sequences(sequences_train, **pad_options)
sequences_test = pad_sequences(sequences_test, **pad_options)

In [None]:
#Creating ANN model
from keras import Sequential
from keras.layers import Dense,Embedding,Flatten

In [None]:
# Size the embedding from the fitted tokenizer and the padded training data
# instead of hard-coding 7952 and 61, so the model still matches if the
# dataset or the split changes.
vocab_size = len(tokenizer.word_index) + 1  # +1 because word ids start at 1 and 0 is the padding value
sequence_length = sequences_train.shape[1]  # width of the padded training matrix

# Small feed-forward classifier: 2-d embedding -> flatten -> two 5-unit ReLU
# layers -> one sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, 2, input_length=sequence_length))
model.add(Flatten())
model.add(Dense(5, activation="relu"))
model.add(Dense(5, activation="relu"))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# From here on the model inputs are the padded id matrices, not the raw texts
X_train, X_test = sequences_train, sequences_test

In [None]:
# The label columns have object dtype; cast them to integers
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Convert the label Series to plain NumPy arrays
Y_train, Y_test = Y_train.to_numpy(), Y_test.to_numpy()

In [None]:
# Inspect the padded, integer-encoded test messages
X_test

array([[196,  13, 104, ...,   0,   0,   0],
       [ 52,   1,  30, ...,   0,   0,   0],
       [629,  18, 104, ...,   0,   0,   0],
       ...,
       [  1, 746, 520, ...,   0,   0,   0],
       [206,   1,  53, ...,   0,   0,   0],
       [101,  26,   1, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Turn the test labels into a single-column 2-D array
Y_test = Y_test.reshape((-1, 1))

In [None]:
# Turn the training labels into a single-column 2-D array
Y_train = Y_train.reshape((-1, 1))

In [None]:
# Confirm the training labels now have one column
Y_train.shape

(4457, 1)

In [None]:
from keras import callbacks

# Stop once the validation loss has not improved for 5 epochs and restore
# the weights from the best epoch.
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
                                        mode="min", patience=5,
                                        restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# Validate on a slice of the training data rather than on the test set:
# early stopping chooses the final weights from the validation loss, so
# passing (X_test, Y_test) here would leak the held-out set into model selection.
model.fit(X_train, Y_train, epochs=50, validation_split=0.2, callbacks=[earlystopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


<keras.callbacks.History at 0x7f6183b47880>

In [None]:
from keras.utils import pad_sequences

# Retrain from scratch on every message; this replaces the tokenizer and
# the model created above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)
sequences = pad_sequences(sequences,padding='post',maxlen=61)
voc_size=len(tokenizer.word_index)

# Same architecture as the first model; the embedding gets voc_size+1 rows
# because word ids start at 1 and 0 is the padding value.
model = Sequential()
model.add(Embedding(voc_size+1,2,input_length=61))
model.add(Flatten())
model.add(Dense(5,activation="relu"))
model.add(Dense(5,activation="relu"))
model.add(Dense(1, activation='sigmoid'))

X=sequences
# np.asarray accepts both the original pandas Series and an ndarray, so this
# cell can be re-run; Y.to_numpy() would fail on a second run because Y has
# already been replaced by an array.
Y=np.asarray(Y).astype("int").reshape(-1,1)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# NOTE(review): epochs=13 presumably comes from the early-stopped run above
# (18 epochs recorded, patience 5) — confirm before changing.
model.fit(X,Y,epochs=13)

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<keras.callbacks.History at 0x7f616ecbd540>

# Entering a message to be classified as spam or non-spam

In [None]:
# Read one message from the user; it is wrapped in a list because the tokenizer takes a list of texts
input_mail = [input()]
# Encode and pad the message the same way as the training data
seq = tokenizer.texts_to_sequences(input_mail)
inp = pad_sequences(seq, maxlen=61, padding='post')
a = model.predict(inp)
value = a[0][0]
# Labels were encoded as ham = 1 and spam = 0, so a score above 0.5 means non-spam
print('Non-spam message' if value > 0.5 else 'Spam message')

Congrats,  Rs.1500 SPECIAL Bonus Cash for you Ind vs Aus ODI on My11Circle Prize pool - 1,61,00,000 Entry Fee - Rs.49 Join Now - http://gmg.im/bYkEYf
Spam message
