In [1]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tensorflow
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.18.0


In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-03-14 21:39:30--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-03-14 21:39:30 (8.57 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-03-14 21:39:30--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-03-14 21:39:30 (3.65 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [3]:
df_train_dataset = pd.read_csv(train_file_path, sep='\t', names=['label', 'text'])
df_test_dataset = pd.read_csv(test_file_path, sep='\t', names=['label', 'text'])
train_dataset = df_train_dataset['text'].to_numpy()
test_dataset = df_test_dataset['text'].to_numpy()
train_label = df_train_dataset['label'].map({'ham':0, 'spam':1}).to_numpy()
test_label = df_test_dataset['label'].map({'ham':0, 'spam':1}).to_numpy()

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

In [4]:
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset)

In [5]:
train_dataset = encoder(train_dataset).numpy()
test_dataset = encoder(test_dataset).numpy()

In [6]:
train_dataset = sequence.pad_sequences(train_dataset, MAXLEN)
test_dataset = sequence.pad_sequences(test_dataset, MAXLEN)

In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['accuracy'])

In [8]:
history = model.fit(train_dataset, train_label, epochs=10, batch_size=BATCH_SIZE, validation_split=0.2, verbose=1)

Epoch 1/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 458ms/step - accuracy: 0.8446 - loss: 0.4873 - val_accuracy: 0.9701 - val_loss: 0.1220
Epoch 2/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 408ms/step - accuracy: 0.9802 - loss: 0.0851 - val_accuracy: 0.9833 - val_loss: 0.0667
Epoch 3/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 434ms/step - accuracy: 0.9897 - loss: 0.0411 - val_accuracy: 0.9833 - val_loss: 0.0600
Epoch 4/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 456ms/step - accuracy: 0.9927 - loss: 0.0257 - val_accuracy: 0.9880 - val_loss: 0.0535
Epoch 5/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 418ms/step - accuracy: 0.9960 - loss: 0.0136 - val_accuracy: 0.9569 - val_loss: 0.1395
Epoch 6/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 449ms/step - accuracy: 0.9989 - loss: 0.0097 - val_accuracy: 0.9904 - val_loss: 0.0787
Epoch 7/10
[1m53/53[

In [9]:
results = model.evaluate(test_dataset, test_label)
print(results)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 116ms/step - accuracy: 0.9845 - loss: 0.0941
[0.08127982914447784, 0.9841954112052917]


In [10]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):

  pred_text = encoder([pred_text]).numpy()
  pred_text = sequence.pad_sequences(pred_text, MAXLEN)
  pred_text = model.predict(pred_text)[0][0]
  label = 'spam' if pred_text >= 0.5 else 'ham'

  return [float(pred_text), label]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[2.2950785023567732e-06, 'ham']


In [11]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
You passed the challenge. Great job!
