<a href="https://colab.research.google.com/github/Ajay-Sai-Kiran/Natural-Language-Processing/blob/main/Performing_Email_Spam_Detection_Using_ELECTRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 9.4 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [2]:
import tensorflow_hub as hub

import pandas as pd

import tensorflow_text as text

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf

import numpy as np

In [3]:
# load data
df = pd.read_csv("/content/spam_data.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# check count and unique and top values and their frequency
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


In [6]:
# creating 2 new dataframe as df_ham , df_spam

df_spam = df[df['Category']=='spam']

df_ham = df[df['Category']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 2)
Spam Dataset Shape: (747, 2)


In [7]:

# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [8]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])

In [9]:
df_balanced['Category'].value_counts()

ham     747
spam    747
Name: Category, dtype: int64

In [10]:
df_balanced.sample(10)

Unnamed: 0,Category,Message
939,spam,Urgent! call 09061749602 from Landline. Your c...
3229,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
1227,spam,Reply with your name and address and YOU WILL ...
584,ham,my ex-wife was not able to have kids. Do you w...
1912,ham,Lol grr my mom is taking forever with my presc...
2535,ham,Ok enjoy . R u there in home.
2046,ham,Okay... I booked all already... Including the ...
1446,ham,I am in a marriage function
5289,ham,Hey! Congrats 2u2. id luv 2 but ive had 2 go h...
589,ham,Ya srsly better than yi tho


In [11]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [12]:
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
1311,ham,"I.ll always be there, even if its just in spir...",0
4710,ham,"Haha awesome, I might need to take you up on t...",0
3698,spam,You are a winner you have been specially selec...,1
1082,ham,Can u get pic msgs to your phone?,0


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

In [14]:
#bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
#bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [17]:
preprocess = hub.KerasLayer('https://hub.tensorflow.google.cn/tensorflow/bert_en_uncased_preprocess/3')
electra = hub.KerasLayer('https://hub.tensorflow.google.cn/google/electra_large/2')

In [18]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = preprocess(text_input)
embeed = electra(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)

In [19]:
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['Inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [21]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]

In [22]:
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [23]:
history = model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Evaluating performance
model.evaluate(X_test,y_test)



[0.15200963616371155,
 0.9679144620895386,
 0.9781420826911926,
 0.9572192430496216]

In [25]:
# getting y_pred by predicting over X_text and flattening it
y_pred = model.predict(X_test)
y_pred = y_pred.flatten()

In [None]:
from sklearn.metrics import confusion_matrix , classification_report

# creating confusion matrix 

cm = confusion_matrix(y_test,y_pred)

cm

In [41]:
predict_text = [
                # Spam
                 
                'Netflix is sending you a refund of $12.99. Please reply with your bank account and routing number to verify and get your refund',
                 
                #ham
                'The article was published on 18th August itself'
                
]

In [43]:
test_results = model.predict(predict_text)
test_results

array([[0.7171719 ],
       [0.22484104]], dtype=float32)

In [44]:
output = np.where(test_results>0.3,'spam', 'ham')
output

array([['spam'],
       ['ham']], dtype='<U4')