<a href="https://colab.research.google.com/github/Vishaal-batcoderda/SMS-Text-Classification-Using-Neural-Networks/blob/main/SMS_Text_Classification_Using_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
try:
  %tensorflow_version
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
# !pip install tensorflow-datasets
# import tensorflow_datasets as tfds (not used)
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
2.19.0


In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2026-01-02 05:55:03--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2026-01-02 05:55:03 (23.1 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2026-01-02 05:55:03--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2026-01-02 05:55:03 (15.2 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [3]:
# First i created training and testing dataframe
train_data = pd.read_csv(train_file_path, sep="\t", header=None, names=['label', 'message'])
test_data = pd.read_csv(test_file_path, sep="\t", header=None, names=['label','message'])

# I mapped every ham as 0 and spam as 1 in both the dataframes
train_data['label'] = train_data['label'].map({'ham': 0, 'spam': 1})
test_data['label'] = test_data['label'].map({'ham': 0, 'spam': 1})

from tensorflow.keras.preprocessing.text import Tokenizer

VOCAB_SIZE = 10000

# Tokenized every word, and characterized unseen words ( if occurs during prediction) as <OOV> format token
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['message'])

train_sequences = tokenizer.texts_to_sequences(train_data['message'])
test_sequences = tokenizer.texts_to_sequences(test_data['message'])

train_labels = train_data['label'].values
test_labels = test_data['label'].values

# Then i padded all data to the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAXLEN = 250

train_padded = pad_sequences(train_sequences, maxlen=MAXLEN)
test_padded = pad_sequences(test_sequences, maxlen=MAXLEN)

# I built the model with 10000 input nodes (of embedding layer) of output dimensions 32, then an LSTM layer to remember context, then finally an output layer
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

# Fitting and training 80 % and testing 20%
history = model.fit(train_padded, train_labels, epochs=10, validation_split=0.2)

model.summary()

Epoch 1/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - acc: 0.8734 - loss: 0.3689 - val_acc: 0.9557 - val_loss: 0.1375
Epoch 2/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - acc: 0.9784 - loss: 0.0927 - val_acc: 0.9856 - val_loss: 0.0618
Epoch 3/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - acc: 0.9883 - loss: 0.0503 - val_acc: 0.9916 - val_loss: 0.0472
Epoch 4/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - acc: 0.9909 - loss: 0.0340 - val_acc: 0.9892 - val_loss: 0.0439
Epoch 5/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - acc: 0.9958 - loss: 0.0251 - val_acc: 0.9892 - val_loss: 0.0403
Epoch 6/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - acc: 0.9950 - loss: 0.0198 - val_acc: 0.9868 - val_loss: 0.0577
Epoch 7/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/

In [4]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

def predict_message(pred_text):
  sequence = tokenizer.texts_to_sequences([pred_text])

  padded = pad_sequences(sequence, maxlen=MAXLEN)
  pred = model.predict(padded)[0][0]  # Extract predicted value from a nested numpy array

  label = 'spam' if pred > 0.5 else 'ham'
  prediction = float(pred), label
  return (prediction)

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
(0.00037854857509955764, 'ham')


In [5]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
You passed the challenge. Great job!
