In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
import pandas as pd

df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
747/4825 #Only 15% is spam email and rest is ham emails. That is one Imbalanced Dataset

0.15481865284974095

In [5]:
df_spam = df[df['Category']=='spam']
df_ham = df[df['Category']=='ham']

In [6]:
df_spam.head()

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [7]:
df_ham.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [8]:
#We are gonna use dropout method to balance the dataset

df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [9]:
df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape

(1494, 2)

In [10]:
df_balanced['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [11]:
df_balanced['spam'] = df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_balanced['spam'].value_counts()

spam
0    747
1    747
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'], stratify=df_balanced['spam'])

In [13]:
X_train.head()

3089    Wnevr i wana fal in luv vth my books, My bed f...
1458    CLAIRE here am havin borin time & am now alone...
5030    I'd like to tell you my deepest darkest fantas...
4529    HOW ARE U? I HAVE MISSED U! I HAVENT BEEN UP 2...
5052       With my sis lor... We juz watched italian job.
Name: Message, dtype: object

In [14]:
bert_preprocess = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3")

bert_encoder = hub.KerasLayer(
    'https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/3',
    trainable=True
)

In [15]:
def get_sentence_embeding(sentence):
    preprocessed_text = bert_preprocess(sentence)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    'Free! Free! Free! New mega sale',
    'Hey this is Awwab',
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8567158 , -0.44839314, -0.71187186, ..., -0.266167  ,
        -0.68420464,  0.87120867],
       [-0.796976  , -0.25925672,  0.18617256, ...,  0.17042439,
        -0.54326385,  0.85316294]], dtype=float32)>

In [16]:
em = get_sentence_embeding([
    'banana',
    'orange',
    'apple',
    'Cristiano Ronaldo',
    'Lionel Messi',
    'Elon Musk',
    'Jeff Bezoz'
])

em

<tf.Tensor: shape=(7, 768), dtype=float32, numpy=
array([[-0.76069176, -0.14219391,  0.4960463 , ...,  0.4216533 ,
        -0.532214  ,  0.8031217 ],
       [-0.83630794, -0.23830129,  0.38453633, ...,  0.45564768,
        -0.6078616 ,  0.8278892 ],
       [-0.8196457 , -0.296096  ,  0.20951825, ...,  0.25593394,
        -0.58742994,  0.8434556 ],
       ...,
       [-0.94586456, -0.4398571 , -0.69372   , ..., -0.62341523,
        -0.62076056,  0.9259536 ],
       [-0.75041324, -0.268126  , -0.2668956 , ...,  0.02839448,
        -0.5938097 ,  0.7974985 ],
       [-0.85063255, -0.44003707, -0.67120284, ..., -0.09733316,
        -0.5980282 ,  0.8084023 ]], dtype=float32)>

In [17]:
#It shows how similar are the two words
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([em[0]], [em[1]])

array([[0.9841549]], dtype=float32)

In [18]:
# Bert Layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
output = bert_encoder(preprocessed_text)

# Neural Layer
l = tf.keras.layers.Dropout(0.1, name='dropout')(output['pooled_output'])
l = tf.keras.layers.Dense(1, name='output')(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [20]:
len(X_train)

1120

In [21]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [22]:
 tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [25]:
with tf.device("/gpu:0"):
    model.fit(X_train, y_train, epochs=5)    

Epoch 1/5


AlreadyExistsError: Graph execution error:

2 root error(s) found.
  (0) ALREADY_EXISTS:  Resource __per_step_0/Adam/gradients/StatefulPartitionedCall/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/bert_encoder/StatefulPartitionedCall_grad/StatefulPartitionedCall/gradients/AddN_63/tmp_var/frame:0/iter:0/struct tensorflow::TemporaryVariableOp::TmpVar
	 [[{{node Adam/gradients/StatefulPartitionedCall/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/bert_encoder/StatefulPartitionedCall_grad/StatefulPartitionedCall/gradients/AddN_63/tmp_var}}]]
	 [[model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_pack_inputs/PartitionedCall/map/while/body/_10302/map/while/strided_slice/_72]]
  (1) ALREADY_EXISTS:  Resource __per_step_0/Adam/gradients/StatefulPartitionedCall/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/bert_encoder/StatefulPartitionedCall_grad/StatefulPartitionedCall/gradients/AddN_63/tmp_var/frame:0/iter:0/struct tensorflow::TemporaryVariableOp::TmpVar
	 [[{{node Adam/gradients/StatefulPartitionedCall/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1066/gradients/bert_encoder/StatefulPartitionedCall_grad/StatefulPartitionedCall/gradients/AddN_63/tmp_var}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_84469]

In [44]:
model.evaluate(X_test, y_test)



[7.712474346160889, 0.5]

In [45]:
reviews = [
    'Reply to win Â£100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)



array([[-2.9854891],
       [-2.985491 ],
       [-2.9854925],
       [-2.9854872],
       [-2.9854965]], dtype=float32)