In [None]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
!pip install tensorflow_text

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:

import pandas as pd

# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
column_names = ['Target', 'Text']
interviews_df = pd.read_csv(
    train_file_path,
    names=column_names,
    sep='\t',
)

# Work on a copy so the frame as loaded stays available in `interviews_df`.
df = interviews_df.copy()
df

In [None]:
# Per-label summary statistics (count, unique, top, freq) of the message text.
df.groupby(by='Target').describe()

Here we can see that there is a class imbalance between the ham and spam messages.

In [None]:
# Keep only the rows labelled "spam" and report how many there are.
df_spam = df.loc[df['Target'] == "spam"]
print(df_spam.shape)


In [None]:
# Downsample the "ham" rows to the number of "spam" rows so that both classes
# are equally represented. sample() draws without replacement, so this
# requires at least as many ham rows as spam rows.
df_ham = df[df.Target == "ham"]
print("Before down sampleing:")
print(df_ham.shape)
print("After down sampleing:")
# A fixed seed keeps the balanced dataset (and every metric computed from it)
# identical across Restart & Run All; 42 matches the train/test split seed.
df_ham = df_ham.sample(n=df_spam.shape[0], random_state=42)
print(df_ham.shape)

Downsampling the ham messages so that both classes have the same amount of data, which lets the model train on a balanced dataset.

In [None]:
# Stack the spam rows and the downsampled ham rows into one frame, then
# confirm that both labels now occur equally often.
df_balanced = pd.concat(objs=[df_spam, df_ham])
df_balanced['Target'].value_counts()

In [None]:
# Encode the string labels as integers for the binary classifier:
# spam -> 1, ham -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them a second execution would map the already-encoded integers to NaN,
# because Series.map yields NaN for any value missing from the dict.
df_balanced.Target = df_balanced.Target.map({'spam': 1, 'ham': 0, 1: 1, 0: 0})
df_balanced

In [None]:
# Model inputs (message text) and targets (encoded label).
X = df_balanced['Text']
y = df_balanced['Target']

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced data for testing. Stratifying on the label
# keeps the spam/ham ratio the same in both splits; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=df_balanced.Target,
)

In [None]:
# TF-Hub layer that turns raw strings into the inputs the BERT encoder takes.
bert_preprocess = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
# TF-Hub BERT encoder (uncased English); its output dict is indexed with
# 'pooled_output' further down.
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [None]:
def preprocessed_text(pred_text):
  """Return the BERT 'pooled_output' for `pred_text`.

  `pred_text` is passed through `bert_preprocess`, the result is fed to
  `bert_encoder`, and the encoder's 'pooled_output' entry is returned.
  """
  encoder_inputs = bert_preprocess(pred_text)
  encoder_outputs = bert_encoder(encoder_inputs)
  return encoder_outputs['pooled_output']


In [None]:
# BERT layers: raw strings -> preprocessed encoder inputs -> encoder outputs.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
# Named `encoder_inputs` rather than `preprocessed_text` so that this cell does
# not overwrite the `preprocessed_text` helper function defined earlier.
encoder_inputs = bert_preprocess(text_input)
outputs = bert_encoder(encoder_inputs)

# Classification head on top of BERT's pooled output:
# dropout -> 16-unit sigmoid layer -> single sigmoid unit.
dropout_out = tf.keras.layers.Dropout(0.1, name="Dropout")(outputs['pooled_output'])
hidden_out = tf.keras.layers.Dense(16, activation='sigmoid', name='middle')(dropout_out)
spam_probability = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(hidden_out)

# End-to-end model: a batch of strings in, one sigmoid score per string out.
model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])

In [None]:
# Print a layer-by-layer overview of the assembled model.
model.summary()

In [None]:
# Metrics reported during training and evaluation.
Metric = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=Metric,
)

In [None]:
# Train on the training split for 10 passes over the data.
model.fit(x=X_train, y=y_train, epochs=10)

In [None]:
# Report the loss and the compiled metrics on the held-out split.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Score the held-out messages and turn the scores into hard labels:
# 1 where the flattened model output exceeds 0.7, otherwise 0.
y_predicted = np.where(model.predict(X_test).flatten() > 0.7, 1, 0)

In [None]:
# Hand-written sample messages for a quick sanity check of the trained model.
test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

# Stored under their own names: `y_predicted` holds the X_test predictions that
# the confusion matrix and classification report compare against `y_test`, so
# overwriting it with these 7 values would make those cells fail on a length
# mismatch.
sample_scores = model.predict(test_messages).flatten()
print(sample_scores)

# Hard labels for the samples: 1 where the score exceeds 0.5, otherwise 0.
sample_labels = np.where(sample_scores > 0.5, 1, 0)
print(sample_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the true labels, columns the predicted labels. `y_predicted` must
# hold one prediction per entry of `y_test`.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
import seaborn as sn
from matplotlib import pyplot as plt

# Draw the confusion matrix as an annotated heatmap with integer cell labels.
sn.heatmap(data=cm, fmt='d', annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_predicted))

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  hello = {}
  y_predicted = model.predict( test_messages)

  y_predicted = y_predicted.flatten()
  y_predicted = np.where(y_predicted > 0.3, 'spam', 'ham')

  return y_predicted

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True
  i = 0
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[i] != ans:
      passed = False
    i = i+1

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
