### FCC Challenge 5
This notebook solves the fifth challenge of the freeCodeCamp Machine Learning with Python course: classifying SMS messages as "ham" or "spam" with a recurrent neural network.

Reference - https://www.tensorflow.org/text/tutorials/text_classification_rnn

In [None]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.18.0


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-06-25 12:10:31--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-06-25 12:10:31 (9.16 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-06-25 12:10:31--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-06-25 12:10:32 (6.00 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



### Data Inspection

In [None]:
# Both files are header-less, tab-separated, with the label first and the message second.
train_df = pd.read_csv(
    train_file_path,
    names = ["class","text"],
    header = None,
    sep = "\t",
)
test_df = pd.read_csv(
    test_file_path,
    names = ["class","text"],
    header = None,
    sep = "\t",
)

In [None]:
# First five rows of the training frame
train_df.head(5)

Unnamed: 0,class,text
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [None]:
# Five randomly drawn training messages (unseeded, so the draw differs per run)
train_df["text"].sample(5).values

array(["alright. i'm out--have a good night!",
       'remember to ask alex about his pizza',
       'hey no i ad a crap nite was borin without ya 2 boggy with me u boring biatch! thanx but u wait til nxt time il ave ya',
       "ok. every night take a warm bath drink a cup of milk and you'll see a work of magic. you still need to loose weight. just so that you know",
       'mom wants to know where you at'], dtype=object)

In [None]:
# First five rows of the test frame
test_df.head(5)

Unnamed: 0,class,text
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?


In [None]:
# Five randomly drawn test messages (unseeded, so the draw differs per run)
test_df["text"].sample(5).values

array(['and stop being an old man. you get to build snowman snow angels and snowball fights.',
       "joy's father is john. then john is the name of joy's father. mandan",
       'i want  &lt;#&gt;  rs da:)do you have it?',
       "i call you later, don't have network. if urgnt, sms me.",
       'and popping &lt;#&gt; ibuprofens was no help.'], dtype=object)

### Splitting the dataset

In [None]:
#Put the message text first and its label second in both frames
column_order = ["text","class"]
train_df = train_df[column_order]
test_df = test_df[column_order]

In [None]:
#Converting the text labels into integers. 0 for ham and 1 for spam
mapping = {"ham":np.int8(0), "spam":np.int8(1)}

def encode_labels(df):
  """Return a copy of ``df`` whose 'class' column holds the codes from ``mapping``.

  ``Series.map`` turns every value that is not a key of ``mapping`` into NaN
  without complaint. That happens for unexpected labels in the file and also
  when this cell is run a second time on frames that are already encoded, so
  the result is checked explicitly instead of letting NaN labels reach training.

  Args:
    df: DataFrame with a 'class' column containing "ham" / "spam" strings.

  Returns:
    A new DataFrame; the input frame is not modified.

  Raises:
    ValueError: if any 'class' value is not a key of ``mapping``.
  """
  encoded = df['class'].map(mapping)
  unmapped = encoded.isna()
  if unmapped.any():
    unknown = sorted(set(df.loc[unmapped, 'class'].astype(str)))
    raise ValueError(
        f"Unexpected values in 'class' column: {unknown}. "
        f"Expected only {sorted(mapping)} (was this cell already run?)"
    )
  # 'class' is a Python keyword, hence the dict unpacking.
  return df.assign(**{'class': encoded})

train_df = encode_labels(train_df)
test_df = encode_labels(test_df)

In [None]:
#Hold out 20% of the training rows (fixed seed) as a validation frame.
#Note: train_df is reassigned from itself, so running this cell again
#removes a further 20% from the already reduced training frame.
val_df = train_df.sample(random_state = 47, frac = 0.2)
held_out_rows = val_df.index
train_df = train_df.drop(index = held_out_rows)

In [None]:
#Helper that turns a dataframe into a tensorflow dataset of (text, label) pairs
def df_to_ds(df):
  """Build a ``tf.data.Dataset`` from the 'text' and 'class' columns of ``df``.

  Each element is a ``(text, label)`` pair; the label gets a trailing axis
  added via ``tf.expand_dims(..., -1)``, so every label is a length-1 vector.
  """
  messages = df['text'].values
  targets = tf.expand_dims(df['class'].values, -1)
  return tf.data.Dataset.from_tensor_slices((messages, targets))

train_ds, val_ds, test_ds = (
    df_to_ds(frame) for frame in (train_df, val_df, test_df)
)

In [None]:
# Show the first (text, label) element of the training dataset
for message, target in train_ds.take(1):
  print(message.numpy())
  print(target.numpy())

b'ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it.'
[0]


In [None]:
# Show the first (text, label) element of the validation dataset
for message, target in val_ds.take(1):
  print(message.numpy())
  print(target.numpy())

b"u wake up already? thanx 4 e tau sar piah it's quite nice."
[0]


In [None]:
# Show the first (text, label) element of the test dataset
for message, target in test_ds.take(1):
  print(message.numpy())
  print(target.numpy())

b'i am in hospital da. . i will return home in evening'
[0]


In [None]:
buffer_size = 10000
batch_size = 64

# Only the training data is shuffled. Validation and test data keep their
# original order: evaluation metrics do not depend on element order, and an
# unshuffled dataset keeps per-row predictions aligned with val_df / test_df.
train_ds = train_ds.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

### Data Preprocessing

In [None]:
#Text standardization used by the TextVectorization layer
def preprocess_text(input):
  """Lower-case the text and apply a fixed sequence of regex clean-ups.

  In order: ":-)" is replaced by a space, the HTML entities "&lt;" and "&gt;"
  are removed, and every run of whitespace is collapsed to a single space.
  Nothing here strips punctuation.
  """
  substitutions = (
      (r":-\)", " "),
      (r"&lt;", ""),
      (r"&gt;", ""),
      (r"\s+", " "),
  )
  cleaned = tf.strings.lower(input)
  for pattern, replacement in substitutions:
    cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
  return cleaned

In [None]:
#Creating a TextVectorization layer
max_features = 10000    #Maximum vocabulary size
sequence_len = 100      #Length of the integer sequence produced for each text

#The vectorizer turns each text into an array of integers, one integer per token.
#Every array is brought to the same length (sequence_len) by padding short texts
#and truncating long ones.
#The vocabulary is capped at max_features entries.
#The resulting integer sequences are what the Embedding layer consumes.

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens = max_features,
    standardize = preprocess_text,
    output_sequence_length = sequence_len,
    output_mode = 'int', #Unique integer for each token
)

#Learn the vocabulary from the training texts only; the labels are dropped
vectorize_layer.adapt(train_ds.map(lambda message, _label: message))

In [None]:
# First 20 entries of the vocabulary learned by adapt()
vocab = np.array(vectorize_layer.get_vocabulary())
vocab[0:20]

array(['', '[UNK]', 'to', 'i', 'you', 'a', 'the', 'u', 'and', 'in', 'is',
       'me', 'my', 'for', 'your', 'of', 'it', 'call', 'have', 'on'],
      dtype='<U15')

### Creating the model

In [None]:
#RNN text classifier, following the tensorflow text classification reference.
#Layers are added in order: raw string -> token ids -> embeddings -> BiLSTM -> dense head.
model = tf.keras.Sequential()
model.add(vectorize_layer)
model.add(
    tf.keras.layers.Embedding(
        input_dim = len(vectorize_layer.get_vocabulary()),
        output_dim = 64,
        mask_zero = True,       # Use masking to handle the variable sequence lengths
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation = "relu"))
#Sigmoid output: a single value in (0, 1), read as the probability of spam
model.add(tf.keras.layers.Dense(1, activation = "sigmoid"))

In [None]:
#Compiling the model
#from_logits=False because the last layer already applies a sigmoid
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
#Train for 5 epochs, reporting loss/accuracy on the validation split after each one
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

Epoch 1/5
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 119ms/step - accuracy: 0.7998 - loss: 0.6839 - val_accuracy: 0.8804 - val_loss: 0.6381
Epoch 2/5
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 97ms/step - accuracy: 0.8642 - loss: 0.6051 - val_accuracy: 0.8804 - val_loss: 0.4526
Epoch 3/5
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 111ms/step - accuracy: 0.8614 - loss: 0.4372 - val_accuracy: 0.8804 - val_loss: 0.3234
Epoch 4/5
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 180ms/step - accuracy: 0.8791 - loss: 0.3029 - val_accuracy: 0.9342 - val_loss: 0.2181
Epoch 5/5
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 105ms/step - accuracy: 0.9474 - loss: 0.2031 - val_accuracy: 0.9749 - val_loss: 0.1582


In [None]:
#Loss and accuracy on the held-out test dataset
test_loss, test_acc = model.evaluate(x=test_ds)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9470 - loss: 0.1922


In [None]:
#Sanity check: run the model on a single hand-written message
sample_text = 'Hello, who dis'
sample_batch = tf.constant([sample_text])   #predict expects a batch, so wrap in a list
predictions = model.predict(sample_batch)
print(predictions[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[0.3502392]


The next cell defines a function called `predict_message` that takes a message string as an argument and returns a two-element list. The first element is a number between zero and one: the model's output for the message, where values near 0 indicate "ham" and values near 1 indicate "spam". The second element is the word "ham" or "spam", depending on which is more likely.

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single message as ham or spam using the trained ``model``.

  Args:
    pred_text: The message to classify, as a string.

  Returns:
    A two-element list ``[probability, label]``. ``probability`` is the model's
    sigmoid output as a plain Python float; ``label`` is "ham" when the
    probability is <= 0.5 and "spam" otherwise.
  """
  # The model works on batches, so wrap the single message in a 1-element tensor.
  batch = tf.constant([pred_text])
  # verbose=0 keeps Keras from printing a progress bar for every message.
  scores = model.predict(batch, verbose=0)
  # One message in, one sigmoid unit out: pull the scalar out and convert it to
  # a built-in float so callers get a plain number rather than a NumPy scalar.
  probability = float(scores[0][0])
  label = "ham" if probability <= 0.5 else "spam"
  return [probability, label]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[np.float32(0.115458935), 'ham']


In [None]:
#Print the [probability, label] prediction for each of the challenge's test messages
test_messages = [
    "how are you doing today",
    "sale today! to stop texts call 98912460324",
    "i dont want to go. can we try it a different day? available sat",
    "our new mobile video service is live. just install on your phone to start watching.",
    "you have won £1000 cash! call to claim your prize.",
    "i'll bring it tomorrow. don't forget the milk.",
    "wow, is your arm alright. that happened to me one time too",
]
for message in test_messages:
  result = predict_message(message)
  print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[np.float32(0.0960649), 'ham']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[np.float32(0.5805825), 'spam']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[np.float32(0.0038926147), 'ham']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[np.float32(0.5642972), 'spam']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[np.float32(0.5898609), 'spam']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[np.float32(0.035307754), 'ham']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[np.float32(0.014360937), 'ham']


### Testing

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Only the label (second element of each prediction) is compared with the
  expected answer; the numeric score is ignored. Prints a success message when
  every label matches, otherwise a "keep trying" message. Returns None.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole challenge; the loop still runs every message.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
You passed the challenge. Great job!
