In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build (the `!` shell escape only works in IPython/Colab).
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2022-12-29 13:53:24--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2022-12-29 13:53:25 (8.13 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2022-12-29 13:53:25--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2022-12-29 13:53:25 (4.73 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [None]:
# load data
# Both files are tab-separated with two columns: the ham/spam label and the message text.
# The first column must be named 'label' because the dataset-preparation cell pops it
# by that name; naming it 'class' raises a KeyError there on a fresh kernel.
COLUMN_NAMES = ['label', 'value']


def load_sms_tsv(path):
  """Read one SMS TSV file into a DataFrame with the columns 'label' and 'value'."""
  # NOTE(review): header = 0 treats the first line of the file as a header row and
  # replaces it with COLUMN_NAMES. If the files have no header line this silently
  # drops the first message, and header = None should be used instead -- confirm.
  return pd.read_csv(path, sep = '\t', header = 0, names = COLUMN_NAMES)


train_data = load_sms_tsv(train_file_path)
test_data = load_sms_tsv(test_file_path)

train_data.tail()

Unnamed: 0,label,value
4173,ham,just woke up. yeesh its late. but i didn't fal...
4174,ham,what do u reckon as need 2 arrange transport i...
4175,spam,free entry into our £250 weekly competition ju...
4176,spam,-pls stop bootydelious (32/f) is inviting you ...
4177,ham,tell my bad character which u dnt lik in me. ...


In [None]:
# prepare datasets
# Fixed label encoding shared by both splits. pd.factorize numbers labels in order of
# first appearance, so calling it separately on each file can give 'ham' and 'spam'
# opposite codes in train and test, depending on which label each file starts with.
LABEL_TO_ID = {'ham': 0, 'spam': 1}


def make_dataset(frame):
  """Build (features, labels, dataset) from a frame with 'value' and 'label' columns.

  Returns the message texts, the integer-encoded labels (int64 array) and an
  unbatched tf.data.Dataset yielding (text, label) pairs.
  Raises ValueError if a label outside LABEL_TO_ID is present.
  """
  features = frame['value']
  encoded = frame['label'].map(LABEL_TO_ID)
  # .map leaves NaN for labels missing from the mapping; fail loudly instead of
  # letting them turn the label column into floats
  if encoded.isna().any():
    unknown = sorted(frame.loc[encoded.isna(), 'label'].astype(str).unique())
    raise ValueError(f"unexpected label(s): {unknown}")
  labels = encoded.to_numpy(dtype = 'int64')
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  return features, labels, dataset


train_features, train_labels, train_dataset = make_dataset(train_data)
test_features, test_labels, test_dataset = make_dataset(test_data)

train_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [None]:
# shuffle data for training and create batches
BUFFER_SIZE = 10000  # size of the shuffle buffer
BATCH_SIZE = 64      # elements per batch

# training pipeline: shuffle -> batch -> prefetch
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# evaluation pipeline: same batching and prefetching, but no shuffling
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# text vectorization layer
# Use the stable `keras.layers.TextVectorization` path: the
# `layers.experimental.preprocessing` alias is a deprecated location for this layer
# and is not guaranteed to exist in the tf-nightly build this notebook installs.
MAX_TOKENS = 1000        # cap on the vocabulary size
SEQUENCE_LENGTH = 1000   # every message is padded/truncated to this many token ids

vectorization_layer = keras.layers.TextVectorization(
    max_tokens = MAX_TOKENS,
    output_mode = 'int',
    output_sequence_length = SEQUENCE_LENGTH
    )
# build the vocabulary from the training texts only (the labels are dropped)
vectorization_layer.adapt(train_dataset.map(lambda text, label: text))

In [None]:
# look at the first 20 entries of the vocabulary built by the vectorization layer
vocab = np.array(
    vectorization_layer.get_vocabulary()
)
vocab[:20]

array(['', '[UNK]', 'to', 'i', 'you', 'a', 'the', 'u', 'and', 'in', 'is',
       'me', 'my', 'for', 'your', 'of', 'it', 'call', 'have', 'on'],
      dtype='<U15')

In [None]:
# create and compile model
# Layers, in order: raw text -> token ids -> 32-d embeddings (id 0 is masked)
# -> bidirectional LSTM -> 32-unit ReLU layer -> single output unit.
model = tf.keras.Sequential()
model.add(vectorization_layer)
model.add(tf.keras.layers.Embedding(
    input_dim = len(vectorization_layer.get_vocabulary()),
    output_dim = 32,
    mask_zero = True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(32, activation = 'relu'))
model.add(tf.keras.layers.Dense(1))

# the output layer has no activation, so the loss is configured to take logits
model.compile(
    optimizer = tf.keras.optimizers.Adam(1e-4),
    loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
    metrics = ['accuracy'])

In [None]:
# train model
EPOCHS = 6

# validation_steps is left unset so that every epoch evaluates on the whole
# validation dataset. A fixed step count either truncates the evaluation or, when
# it exceeds the number of batches the dataset can provide, makes Keras run out
# of validation data.
history = model.fit(train_dataset,
                    epochs = EPOCHS,
                    validation_data = test_dataset)


Epoch 1/6



Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

# Decision threshold applied to the model's raw output (a logit, because the final
# Dense layer has no activation): outputs below it are 'ham', everything else 'spam'.
SPAM_LOGIT_THRESHOLD = -0.5


def predict_message(pred_text):
  """Classify a single SMS message as 'ham' or 'spam'.

  Args:
    pred_text: the message text, a single string.

  Returns:
    [probability, label]: probability is the sigmoid of the model output as a Python
    float between 0 and 1 (higher means more spam-like); label is 'ham' or 'spam'.
  """
  # wrap the string in a batch of one and pass it as an explicit string tensor
  # rather than a bare Python list
  logits = model.predict(tf.constant([pred_text]))
  # pull out the scalar so the comparison below does not depend on the truth value
  # of a (1, 1) array
  logit = float(logits[0][0])
  if logit < SPAM_LOGIT_THRESHOLD:
    predicted_class = 'ham'
  else:
    predicted_class = 'spam'
  # report a probability instead of the unbounded logit, as in the example above
  probability = float(tf.sigmoid(logit))
  return [probability, predicted_class]

pred_text = "how are you doing today"
prediction = predict_message(pred_text)

print(prediction)

[-3.504548, 'ham']


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Run predict_message on seven fixed messages and print whether every label matched.

  Prints the success message only if the label returned for each message equals the
  expected 'ham'/'spam' answer; otherwise prints the "keep trying" message.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # expected labels, in the same order as test_messages
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # only the label (index 1) is compared; the numeric value at index 0 is ignored
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
