In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("SMSCollection.csv")
df.head()

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.groupby("Class").describe()

Unnamed: 0_level_0,sms,sms,sms,sms
Unnamed: 0_level_1,count,unique,top,freq
Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [5]:
df.Class.unique()

array(['ham', 'spam'], dtype=object)

In [6]:
df_spam = df[df.Class == "spam"]
df_spam.shape

(747, 2)

In [7]:
df_ham = df[df.Class == "ham"]
df_ham.shape

(4825, 2)

In [8]:
df_spam.shape[0]

747

In [9]:
all_ham = df_ham.sample(df_spam.shape[0])
all_ham.shape

(747, 2)

In [10]:
df_balanced = pd.concat([df_spam, all_ham])
df_balanced.Class.value_counts()

spam    747
ham     747
Name: Class, dtype: int64

In [11]:
df_balanced.sample(5)

Unnamed: 0,Class,sms
4355,spam,important information 4 orange user 0789xxxxxx...
3272,spam,You have 1 new voicemail. Please call 08719181...
415,spam,100 dating service cal;l 09064012103 box334sk38ch
3587,spam,I am hot n horny and willing I live local to y...
1342,ham,Just chill for another 6hrs. If you could slee...


In [12]:
df_balanced["spam"] = df_balanced["Class"].apply(lambda x : 1 if x == "spam" else 0)

In [13]:
df_balanced.sample(5)

Unnamed: 0,Class,sms,spam
956,ham,Sorry i now then c ur msg... Yar lor so poor t...,0
319,spam,December only! Had your mobile 11mths+? You ar...,1
3908,ham,No that just means you have a fat head,0
5520,ham,No. I dont want to hear anything,0
134,spam,Sunshine Quiz Wkly Q! Win a top Sony DVD playe...,1


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced["sms"],df_balanced["Class"], stratify=df_balanced["spam"])

In [15]:
X_train.head()

3906    Do you want a new video handset? 750 anytime a...
1722    Am watching house – very entertaining – am get...
273     HMV BONUS SPECIAL 500 pounds of genuine HMV vo...
1203    Me also da, i feel yesterday night  wait til 2...
4112    URGENT! Your Mobile number has been awarded a ...
Name: sms, dtype: object

In [16]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [17]:
# def get_sentence_embeding(sentences):
#     preprocessed_text = bert_preprocess(sentences)
#     return bert_encoder(preprocessed_text)["pooled_output"]

# get_sentence_embeding([
#     "500$ discount. hurry up",
#     "Bhavin, are you up for a volleyball game tommorow?"
# ])

In [18]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351695, -0.5132727 , -0.88845736, ..., -0.74748874,
        -0.75314736,  0.91964495],
       [-0.87208354, -0.50543964, -0.94446677, ..., -0.8584749 ,
        -0.7174534 ,  0.88082975]], dtype=float32)>

In [19]:
e = get_sentence_embeding([
    "banana", 
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
]
)

In [20]:
e

<tf.Tensor: shape=(6, 768), dtype=float32, numpy=
array([[-0.7606918 , -0.1421939 ,  0.49604574, ...,  0.42165288,
        -0.5322141 ,  0.8031218 ],
       [-0.8602324 , -0.21242957,  0.4915687 , ...,  0.39797997,
        -0.6050631 ,  0.84471667],
       [-0.7128861 , -0.154639  ,  0.38401738, ...,  0.35278767,
        -0.50991327,  0.734741  ],
       [-0.82533467, -0.35550553, -0.5906963 , ..., -0.01613665,
        -0.61417556,  0.872303  ],
       [-0.7504136 , -0.2681262 , -0.26689762, ...,  0.0283935 ,
        -0.59380996,  0.7974989 ],
       [-0.78544384, -0.29949707,  0.4102738 , ...,  0.5222538 ,
        -0.49573562,  0.81507534]], dtype=float32)>

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
cosine_similarity([e[1]], [e[2]])

array([[0.9850744]], dtype=float32)

In [23]:
cosine_similarity([e[1]], [e[3]])

array([[0.87739784]], dtype=float32)

In [24]:
cosine_similarity([e[3]], [e[4]])

array([[0.98720354]], dtype=float32)

In [25]:
# text_input = tensorflow.Keras.layers.Input(shape(), dtype = tf.string, name = "text") 
# preprocessed_text = bert_preprocess(text_input)
# outputs = bert_encoder(preprocessec_text)


# l = tensorflow.keras.layers.Dropout(0.1, name = "dropout")(outputs["pooled_output"])
# l = tensorflow.keras.layers.Dense(1, activation="sigmoid", name = "output")(1)

# model = tensorflow.keras.Model(inputs = [text_input], outputs = [l])

In [26]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [28]:
len(X_train)


1120

In [29]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [30]:
model.fit(X_train, y_train, epochs=10)


Epoch 1/10


UnimplementedError: Graph execution error:

Detected at node 'binary_crossentropy/Cast' defined at (most recent call last):
    File "C:\Users\nikhi\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\nikhi\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\nikhi\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\nikhi\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\nikhi\anaconda3\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\nikhi\anaconda3\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\nikhi\anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\nikhi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\nikhi\AppData\Local\Temp/ipykernel_9244/154610302.py", line 1, in <module>
      model.fit(X_train, y_train, epochs=10)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 890, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\training.py", line 948, in compute_loss
      return self.compiled_loss(
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\losses.py", line 139, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\losses.py", line 243, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\nikhi\anaconda3\lib\site-packages\keras\losses.py", line 1920, in binary_crossentropy
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'binary_crossentropy/Cast'
Cast string to float is not supported
	 [[{{node binary_crossentropy/Cast}}]] [Op:__inference_train_function_70816]