In [56]:
# import libraries
#!pip install tensorflow-datasets
import tensorflow_datasets as tfds

#!pip install tensorflow
import os
import urllib.request

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import layers

print(tf.__version__)

2.15.0


In [57]:
# get data files

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# Where each local file is fetched from when it is missing.
data_urls = {
    train_file_path: "https://cdn.freecodecamp.org/project-data/sms/train-data.tsv",
    test_file_path: "https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv",
}

# Check every file on its own so that only the missing ones are fetched.
missing_paths = [path for path in data_urls if not os.path.exists(path)]

if not missing_paths:
    print("The files exist. Skipping downloads")
else:
    print("The files do not exist. Downloading...")
    for path in missing_paths:
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that passes the exists() check.
        partial_path = path + ".part"
        urllib.request.urlretrieve(data_urls[path], partial_path)
        os.replace(partial_path, path)
    print("Files downloaded.")


The files exist. Skipping downloads


In [58]:
# Load Train and Valid TSV files into Pandas dataframes
train_df = pd.read_csv(train_file_path, sep='\t', names=['Label', 'Text'])
valid_df = pd.read_csv(test_file_path, sep='\t', names=['Label', 'Text'])

# Encode 'spam' to 1 and every other label (i.e. 'ham') to 0
train_df['Label'] = (train_df['Label'] == 'spam').astype('int64')
valid_df['Label'] = (valid_df['Label'] == 'spam').astype('int64')

print(train_df)
print(valid_df)

# Stack both frames into one; ignore_index renumbers the rows 0..n-1, so the
# index can double as a unique file name below.
df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)
print(df)

# Create folders if they don't exist
folders = ['train/neg', 'train/pos']
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Write one text file per message: label 0 goes to train/neg, anything else to
# train/pos, named after the row index.
for index, label, text in zip(df.index, df['Label'], df['Text']):
    label_folder = 'train/neg' if label == 0 else 'train/pos'
    file_path = os.path.join(label_folder, f'{index}.txt')

    # Explicit UTF-8: the messages contain non-ASCII characters (e.g. '£', 'ü'),
    # and the platform default encoding is not UTF-8 everywhere.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

print("Text files created based on labels.")

      Label                                               Text
0         0  ahhhh...just woken up!had a bad dream about u ...
1         0                           you can never do nothing
2         0  now u sound like manky scouse boy steve,like! ...
3         0  mum say we wan to go then go... then she can s...
4         0  never y lei... i v lazy... got wat? dat day ü ...
...     ...                                                ...
4174      0  just woke up. yeesh its late. but i didn't fal...
4175      0  what do u reckon as need 2 arrange transport i...
4176      1  free entry into our £250 weekly competition ju...
4177      1  -pls stop bootydelious (32/f) is inviting you ...
4178      0  tell my  bad character which u dnt lik in me. ...

[4179 rows x 2 columns]
      Label                                               Text
0         0  i am in hospital da. . i will return home in e...
1         0         not much, just some textin'. how bout you?
2         0  i probably won't 

In [59]:
batch_size = 32

# Arguments common to both subsets: the same directory, batch size, split
# fraction and seed are passed to each call; only `subset` differs.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

raw_train_ds = keras.utils.text_dataset_from_directory(
    "train", subset="training", **split_options
)
raw_val_ds = keras.utils.text_dataset_from_directory(
    "train", subset="validation", **split_options
)

# Report how many batches each dataset holds.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")

Found 5571 files belonging to 2 classes.
Using 4457 files for training.
Found 5571 files belonging to 2 classes.
Using 1114 files for validation.
Number of batches in raw_train_ds: 140
Number of batches in raw_val_ds: 35


In [60]:
# Peek at the first few (text, label) pairs of one training batch.
preview_count = 5
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once, and slice instead of indexing up to a
    # fixed count so a batch with fewer examples does not raise IndexError.
    preview_texts = text_batch.numpy()[:preview_count]
    preview_labels = label_batch.numpy()[:preview_count]
    for text, label in zip(preview_texts, preview_labels):
        print(text)
        print(label)

b'the word checkmate in chess comes from the persian phrase shah maat which means; the king is dead.. goodmorning.. have a good day..:)'
0
b'not yet chikku..wat abt u?'
0
b'k, wat s tht incident?'
0
b'free 1st week entry 2 textpod 4 a chance 2 win 40gb ipod or \xc2\xa3250 cash every wk. txt pod to 84128 ts&cs www.textpod.net custcare 08712405020.'
1
b'nite nite pocay wocay luv u more than n e thing 4eva i promise ring u 2morrowxxxx'
0


In [61]:
# Model constants.
max_features = 20000   # passed to the vectorizer as max_tokens and to the embedding as its input size
embedding_dim = 128    # width of each embedding vector
sequence_length = 500  # passed to the vectorizer as output_sequence_length

# Vectorization layer: standardizes the text with "lower" and emits integer
# token ids, with the output length set to `sequence_length`.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize="lower",
    output_mode="int",
    output_sequence_length=sequence_length,
)

# `adapt` needs text only, so drop the labels from the training dataset first
# and then let the layer adapt to the resulting strings.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

In [62]:
def vectorize_text(text, label):
    """Run `vectorize_layer` over `text` and pass `label` through unchanged.

    Args:
        text: the raw text value(s) to vectorize.
        label: the matching label(s); returned as-is.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # Append a trailing axis before handing the text to the layer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize each split, then cache the result and prefetch up to 10 elements
# so input preparation can overlap with training.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [63]:
# An integer input carrying a variable-length sequence of vocab indices.
inputs = keras.Input(shape=(None,), dtype="int64")

# Layers applied in order between the input and the output unit:
#   embedding into `embedding_dim` dimensions -> dropout
#   -> two strided Conv1D layers -> global max pooling
#   -> a plain hidden Dense layer -> dropout
hidden_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]

x = inputs
for hidden_layer in hidden_layers:
    x = hidden_layer(x)

# Single-unit output squashed with a sigmoid.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

# Binary crossentropy loss, adam optimizer, accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [64]:
epochs = 2

# Fit the model on the training dataset, using the validation dataset as
# validation data. The returned History object is kept in `history` instead of
# being left as the cell's bare output.
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x795972f11d80>

In [65]:
# Wrap the vectorizer and the trained model into one model that accepts raw
# strings: string input -> vocab indices -> predictions.
inputs = keras.Input(shape=(1,), dtype="string")
indices = vectorize_layer(inputs)
outputs = model(indices)

# Our end to end model
end_to_end_model = keras.Model(inputs=inputs, outputs=outputs)
end_to_end_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [66]:
def predict_message(pred_text):
    """Classify a single message as ham or spam using `end_to_end_model`.

    Args:
        pred_text: the message text to classify.

    Returns:
        A two-element list ``[score, label]``: ``score`` is the first value of
        the model's prediction for the message, and ``label`` is ``"ham"`` when
        ``score < .5`` and ``"spam"`` otherwise
        (ex. [0.008318834938108921, 'ham']).
    """
    # The message is passed as a one-element list; the score is read back from
    # position [0][0] of the prediction result.
    score = end_to_end_model.predict([pred_text])[0][0]
    label = "ham" if score < .5 else "spam"
    return [score, label]
# Try the helper on one sample message and print the resulting [score, label] list.
pred_text = "how are you doing today"

prediction = predict_message(pred_text)
print(prediction)

[0.0030478453, 'ham']


In [69]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check `predict_message` against seven messages with known labels.

  Each message is passed to `predict_message`; the returned list is printed and
  its second element is compared with the expected "ham"/"spam" answer. A
  pass/fail line is printed at the end; nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(prediction) # REMOVE WHEN DONE
    # A single wrong label fails the whole run; the loop still visits every message.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[0.0030478453, 'ham']
[0.9871975, 'spam']
[0.0030447172, 'ham']
[0.99958855, 'spam']
[0.9996754, 'spam']
[0.0030541674, 'ham']
[0.0041368036, 'ham']
You passed the challenge. Great job!
