In [1]:
# Install dependencies and import libraries.
try:
  # Installs the nightly TensorFlow build through a shell escape.
  # NOTE(review): the previous comment here referred to the Colab-only
  # `%tensorflow_version` magic, which this cell does not use. A failing
  # `!pip` command presumably does not raise a Python exception, so the
  # `except` branch below is likely never reached — confirm and consider
  # removing the try/except and pinning a version.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# Get data files: download the training and validation TSV files from the
# freeCodeCamp CDN into the current working directory.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local filenames of the downloaded files. Note that `test_file_path` points
# at the *validation* file (valid-data.tsv).
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-06-14 11:26:29--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.2.33, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-06-14 11:26:29 (8.31 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-06-14 11:26:29--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.2.33, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-06-14 11:26:30 (3.44 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [4]:
# Load the tab-separated SMS data. The files are read as headerless
# (header=None), so the two column names are supplied explicitly.
# The paths come from the variables set in the download cell, so each
# filename is defined in exactly one place.
train_df = pd.read_csv(train_file_path, sep='\t', header=None, names=["label", "message"])
valid_df = pd.read_csv(test_file_path, sep='\t', header=None, names=["label", "message"])


In [23]:
# Quick look at the data: the first training rows (plain-text print) and the
# last validation rows (shown as the cell's final expression).
# NOTE(review): the saved output shows numeric labels, which suggests this
# cell was last executed after the label-encoding cell; on a fresh
# top-to-bottom run the labels here would presumably still be 'ham'/'spam'.
print(train_df.head())
valid_df.tail()

   label                                            message
0      0  ahhhh...just woken up!had a bad dream about u ...
1      0                           you can never do nothing
2      0  now u sound like manky scouse boy steve,like! ...
3      0  mum say we wan to go then go... then she can s...
4      0  never y lei... i v lazy... got wat? dat day ü ...


Unnamed: 0,label,message
1387,0,true dear..i sat to pray evening and felt so.s...
1388,0,"what will we do in the shower, baby?"
1389,0,where are you ? what are you doing ? are yuou ...
1390,1,ur cash-balance is currently 500 pounds - to m...
1391,1,not heard from u4 a while. call 4 rude chat pr...


In [5]:
# Encode labels: 'ham' -> 0, 'spam' -> 1
LABEL_TO_ID = {'ham': 0, 'spam': 1}


def encode_labels(labels):
  """Return `labels` encoded as int64 with 'ham' -> 0 and 'spam' -> 1.

  Values that are already 0 or 1 pass through unchanged, so re-running this
  cell is safe. A plain `.map(dict)` would turn already-encoded labels into
  NaN on a second run.

  Raises:
    ValueError: if any value is neither a known label string nor an
      already-encoded id (this includes missing values).
  """
  encoded = labels.map(lambda value: LABEL_TO_ID.get(value, value))
  unexpected = set(encoded.unique()) - set(LABEL_TO_ID.values())
  if unexpected:
    raise ValueError(f"Unexpected label values: {sorted(unexpected, key=str)}")
  return encoded.astype('int64')


train_df['label'] = encode_labels(train_df['label'])
valid_df['label'] = encode_labels(valid_df['label'])

In [6]:
# Text vectorization
from tensorflow.keras.layers import TextVectorization

# Vocabulary cap and fixed per-message token length.
max_vocab_size = 10000
max_sequence_length = 100

vectorizer = TextVectorization(
    max_tokens=max_vocab_size,
    output_sequence_length=max_sequence_length,
)
# Build the vocabulary from the training messages only.
vectorizer.adapt(train_df['message'].values)

In [7]:
# Turn both splits into model inputs (token-id tensors) and label arrays.
# The generator expressions keep the train/validation handling identical and
# evaluate in the order train, then validation.
X_train, X_test = (
    vectorizer(frame['message'].values) for frame in (train_df, valid_df)
)
y_train, y_test = (
    np.array(frame['label'].values) for frame in (train_df, valid_df)
)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

In [9]:
# Small averaged-embedding classifier: token ids -> 16-d embeddings ->
# mean over the sequence -> one hidden layer -> sigmoid spam probability.
model = Sequential([
    # `input_dim` uses the same vocabulary cap as the text vectorizer.
    # The former `input_length=max_sequence_length` argument was removed:
    # it is deprecated in current Keras and only triggers a warning; the
    # sequence length is inferred from the data on first call.
    Embedding(input_dim=max_vocab_size, output_dim=16),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [10]:
# Binary classification setup: sigmoid output paired with binary
# cross-entropy, optimized with Adam, tracking accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Train the model for 10 epochs; metrics on the validation split are
# reported after every epoch and kept in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8660 - loss: 0.3868 - val_accuracy: 0.8657 - val_loss: 0.3688
Epoch 2/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8660 - loss: 0.3651 - val_accuracy: 0.8657 - val_loss: 0.3609
Epoch 3/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8660 - loss: 0.3551 - val_accuracy: 0.8657 - val_loss: 0.3551
Epoch 4/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8660 - loss: 0.3398 - val_accuracy: 0.8657 - val_loss: 0.3288
Epoch 5/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8660 - loss: 0.2969 - val_accuracy: 0.8671 - val_loss: 0.2684
Epoch 6/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8933 - loss: 0.2218 - val_accuracy: 0.9418 - val_loss: 0.1878
Epoch 7/10
[1m131/131[0m

In [12]:
# Evaluate the model.
# NOTE(review): X_test / y_test are built from valid-data.tsv, the same split
# passed as `validation_data` during training, so this is a validation score
# rather than a score on unseen test data.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9727 - loss: 0.0806
Test Accuracy: 0.9727


In [14]:
# Check the predicted probabilities for the validation messages.
# The model ends in a single sigmoid unit, so each row holds one value in
# [0, 1]; with the 'spam' -> 1 encoding, higher means more spam-like.
y_pred = model.predict(X_test)
y_pred

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


array([[4.5609634e-04],
       [1.0971003e-02],
       [3.6143669e-04],
       ...,
       [2.2226570e-02],
       [9.5251441e-01],
       [9.1706181e-01]], dtype=float32)

In [15]:
# Sanity-check the prediction array: its overall shape, then the first row.
print(y_pred.shape)
print(y_pred[0])

(1392, 1)
[0.0004561]


In [19]:
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    # Preprocess the input message
    vec_msg = vectorizer(tf.convert_to_tensor([msg]))
    prediction = model.predict(vec_msg)[0][0]  # Get the scalar prediction
    label = "spam" if prediction >= 0.5 else "ham"

    print(f"Message: {msg}\nPrediction: {label}, Actual: {ans}\n")

    if label != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")


In [24]:
def predict_message(pred_text):
  """Classify one SMS text, returning the string "spam" or "ham".

  The text is wrapped in a one-element batch, vectorized with the fitted
  `vectorizer`, and scored by `model`. A probability of 0.5 or more is
  reported as "spam".
  """
  single_batch = tf.convert_to_tensor([pred_text])
  token_ids = vectorizer(single_batch)

  # First row, first column: the only probability in a one-message batch.
  spam_probability = model.predict(token_ids)[0][0]

  if spam_probability >= 0.5:
    return "spam"
  return "ham"

# Example usage: classify a single message and print the predicted label.
# Note that `prediction` here holds the label string returned by
# predict_message, not a probability.
pred_text = "ur cash-balance is currently 500 pounds - to maximize ur cash-in now send cash to 86688 only 150p/msg. cc: 08708800282 hg/suite342/2lands row/w1j6hl"
prediction = predict_message(pred_text)
print(prediction)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
spam
