<a href="https://colab.research.google.com/github/OmarGhoz/FreeCodeCamp-Machine-Learning-Projects/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build (the `!` shell escape only works in IPython/Colab).
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
# Download the training and validation TSV files with wget.
# NOTE(review): re-running this cell downloads the files again — confirm that is acceptable.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Relative paths of the two downloaded files. The validation file is what the
# rest of the notebook loads as its "test" data.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Load the training and test datasets into pandas DataFrames.
# The files are tab-separated with no header row, so the column names are
# supplied explicitly: the first column is the label, the second the message text.
train_data = pd.read_csv(train_file_path, sep='\t', header=None, names=['label', 'message'])
test_data = pd.read_csv(test_file_path, sep='\t', header=None, names=['label', 'message'])

# Inspect the first few rows of the training data.
# A notebook cell only displays its *last* bare expression, so this preview
# has to be printed explicitly or it is silently discarded.
print(train_data.head())

# Check the distribution of labels in the training data
# (last expression of the cell, so it is displayed automatically).
train_data['label'].value_counts()


In [None]:
# Convert 'ham' to 0 and 'spam' to 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them, mapping already-encoded labels a second time would turn every label
# into NaN.
label_encoding = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
train_data['label'] = train_data['label'].map(label_encoding)
test_data['label'] = test_data['label'].map(label_encoding)

# Values missing from the mapping become NaN; fail here rather than training
# on invalid targets.
assert train_data['label'].notna().all(), "unexpected label value in training data"
assert test_data['label'].notna().all(), "unexpected label value in test data"

# Tokenize the messages (convert words into integers).
# The tokenizer is fitted on the training messages only; the test messages are
# encoded with the same word index.
VOCAB_SIZE = 5000  # value passed as the tokenizer's num_words limit
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_data['message'])

# Convert text to sequences (list of integers)
train_sequences = tokenizer.texts_to_sequences(train_data['message'])
test_sequences = tokenizer.texts_to_sequences(test_data['message'])

# Pad sequences to ensure consistent input length.
# Sequences are padded and truncated at the end ('post') to exactly max_length entries.
max_length = 150  # You can adjust this value
train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')


In [None]:
# Define the model architecture
model = tf.keras.Sequential([
    # Explicit input specification: one padded sequence of max_length token ids.
    # This replaces the Embedding layer's deprecated `input_length` argument and
    # builds the model up front, so model.summary() can report output shapes.
    tf.keras.Input(shape=(max_length,)),
    # Embedding layer to convert words into dense vectors.
    # input_dim is read from the tokenizer so the vocabulary size is defined in
    # one place only.
    # NOTE(review): the sequences are zero-padded at the end and index 0 is not
    # masked here, so the LSTM also steps over the padding. Consider
    # mask_zero=True after verifying it on the target runtime.
    tf.keras.layers.Embedding(input_dim=tokenizer.num_words, output_dim=64),
    # LSTM layer for sequential data
    tf.keras.layers.LSTM(64),
    # Fully connected layer with ReLU activation
    tf.keras.layers.Dense(64, activation='relu'),
    # Output layer with sigmoid activation to predict 0 or 1 (ham or spam)
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, accuracy reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary to inspect the architecture
model.summary()


In [None]:
# Fit the model for five epochs in mini-batches of 32, evaluating on the
# test sequences after every epoch.
history = model.fit(
    train_sequences,
    train_data['label'],
    epochs=5,
    batch_size=32,
    validation_data=(test_sequences, test_data['label']),
)

# Draw the per-epoch accuracy curves for the training and validation data
# on a single set of axes.
fig, ax = plt.subplots()
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
ax.set_title('Training vs Validation Accuracy')
plt.show()


In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(message):
    """Classify a single SMS message as ham or spam.

    Args:
        message: The raw message text (a single string).

    Returns:
        A two-element list ``[probability, label]``. ``probability`` is the
        model's sigmoid output as a plain Python float, where values near 0
        indicate ham and values near 1 indicate spam. ``label`` is ``"spam"``
        when the probability is at least 0.5 and ``"ham"`` otherwise.
    """
    # Preprocess the message exactly like the training data: encode it with the
    # fitted tokenizer, then pad/truncate at the end to max_length.
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # One message through a single-unit output layer gives a (1, 1) array;
    # unwrap it to a plain float so callers get a number, not a NumPy array.
    # verbose=0 suppresses the per-call progress bar.
    spam_probability = float(model.predict(padded_sequence, verbose=0)[0][0])

    # Always return the raw model output so the first element keeps the same
    # meaning for both labels (low for ham, high for spam).
    label = "spam" if spam_probability >= 0.5 else "ham"
    return [spam_probability, label]


In [None]:
# Smoke-test predict_message on one hand-written example and show the result.
sample_message = "Congratulations, you've won a free iPhone! Click here to claim."
prediction = predict_message(sample_message)
print("Prediction: {}".format(prediction))


In [None]:
# Save the model
# Writes the trained model to "sms_text_classifier.h5" (a relative path).
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 file format — confirm
# that this is the intended format for the installed TensorFlow version.
model.save("sms_text_classifier.h5")


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven messages with known labels.

  Calls predict_message on each message and compares the second element of
  the result with the expected label. Prints a success message only when all
  seven labels match; otherwise prints a "keep trying" message. Returns None.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is compared; the numeric score (index 0) is ignored.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Run the check immediately when the cell is executed.
test_predictions()
