In [1]:
# Colab-only: open a browser file picker and upload files into the current
# working directory. The setup cell that follows expects the uploaded file to be
# named kaggle.json (the Kaggle API credentials).
from google.colab import files
data_to_load = files.upload()

Saving kaggle.json to kaggle.json


In [2]:
! mkdir ~/.kaggle
! mv kaggle.json ~/.kaggle/
! chmod 600 /root/.kaggle/kaggle.json

In [3]:
# Download the spam/ham dataset archive (spam-ham-filter.zip) from Kaggle into
# the current working directory.
! kaggle datasets download -d augustinkpadonou/spam-ham-filter

Downloading spam-ham-filter.zip to /content
  0% 0.00/934k [00:00<?, ?B/s]
100% 934k/934k [00:00<00:00, 104MB/s]


In [None]:
! unzip spam-ham-filter.zip

In [5]:
import os 
import pathlib 
import shutil
import random

base_dir = pathlib.Path("spam-filter-data")
val_dir = base_dir / "val"
train_dir = base_dir / "train"


def make_validation_split(train_dir, val_dir, categories,
                          val_fraction=0.2, seed=1337):
  """Move a reproducible fraction of each category from train to val.

  The function is idempotent: files already sitting in the validation folder
  are first moved back to the training folder, then the same seeded split is
  applied again, so running it twice yields the same train/val partition.

  Args:
    train_dir: pathlib.Path of the training root (one sub-folder per category).
    val_dir: pathlib.Path of the validation root; created when missing.
    categories: iterable of category sub-folder names.
    val_fraction: share of each category's files to move to validation.
    seed: seed of the per-category shuffle.

  Returns:
    dict mapping each category to the number of files now in validation.

  Raises:
    FileNotFoundError: if a category folder is missing under `train_dir`.
  """
  moved = {}
  for category in categories:
    train_cat = train_dir / category
    val_cat = val_dir / category
    os.makedirs(val_cat, exist_ok=True)

    # Undo any earlier split so a re-run does not carve a further 20% out of
    # the training set. os.replace overwrites a same-named file in train/.
    for fname in os.listdir(val_cat):
      os.replace(val_cat / fname, train_cat / fname)

    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle picks the same files on every machine and every run.
    fnames = sorted(os.listdir(train_cat))
    random.Random(seed).shuffle(fnames)

    num_val_samples = int(val_fraction * len(fnames))
    # Slice with a non-negative start: fnames[-0:] would select *every* file
    # when num_val_samples is 0 (tiny categories).
    val_fnames = fnames[len(fnames) - num_val_samples:]
    for fname in val_fnames:
      shutil.move(train_cat / fname, val_cat / fname)
    moved[category] = len(val_fnames)
  return moved


val_counts = make_validation_split(train_dir, val_dir, ("spam", "ham"))

In [6]:
import tensorflow as tf
from tensorflow import keras 
from keras import layers  
from keras.layers import TextVectorization

In [7]:
batch_size = 32

# One directory per split, listed in the order the datasets are unpacked below.
split_dirs = ("spam-filter-data/train",
              "spam-filter-data/val",
              "spam-filter-data/test")

# Build one batched text dataset per split; every split uses the same batch size.
train_ds, val_ds, test_ds = [
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in split_dirs
]

Found 560 files belonging to 2 classes.
Found 140 files belonging to 2 classes.
Found 260 files belonging to 2 classes.


In [47]:
# Display the shapes and dtypes of the first batch, plus its first sample.

# take(1) yields just the first batch, so no explicit break is needed.
for inputs, targets in train_ds.take(1):
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  # Index 0 so the printed element matches the "inputs[0]" / "targets[0]" labels.
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'consolidate debt home improvement is limitedtime offer homeowners remove instruction bottom email homeowner are entitled free consultation are interest home improvement consolidate monthly bill reduce payment unanticipate need least reason are owner occupant single family free stand residence day s important ever most own home help ve create variety unique finance program option remember valuable information absolutely free learn hour preapproval equity consolidate bills home improvements pay college buy car boat even nd home save thousands yearly interest payments even re selfemployed heavy debt been previously declined t prove income plus certain program s prepayment penalty appraisal required thousand already respond help however offer available indefinitely act receive complimentary confidential consultation click highlight link below http www assetmax com

## Processing words as a set: The bag-of-words approach

In [13]:
# Bag-of-words encoder: each text becomes a multi-hot vector over a vocabulary
# capped at 10000 tokens.
text_vectorization = TextVectorization(output_mode="multi_hot", max_tokens=10000)

# The vocabulary is learned from the training texts only, so strip the labels
# before handing the dataset to adapt().
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)

In [14]:
def vectorize_batch(texts, labels):
  """Encode a batch of raw strings with `text_vectorization`; labels pass through."""
  return text_vectorization(texts), labels


# Apply the same encoding to all three splits (4 parallel map calls each).
binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = (
    split_ds.map(vectorize_batch, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [37]:
# Peek at the first vectorized batch: shapes, dtypes and the first sample.
for inputs, targets in binary_1gram_train_ds.take(1):
  batch_report = (
      ("inputs.shape:", inputs.shape),
      ("inputs.dtype:", inputs.dtype),
      ("targets.shape:", targets.shape),
      ("targets.dtype:", targets.dtype),
      ("inputs[0]:", inputs[0]),
      ("targets[0]:", targets[0]),
  )
  for caption, value in batch_report:
    print(caption, value)

inputs.shape: (32, 10000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 0. 1. ... 0. 0. 0.], shape=(10000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [20]:

def get_model(max_tokens=10000, hidden_dense=16, dropout_rate=0.5):
  """Build and compile a small dense binary classifier over multi-hot vectors.

  Args:
    max_tokens: length of each input vector (the vectorizer's vocabulary size).
    hidden_dense: number of units in the single hidden Dense layer.
    dropout_rate: fraction of hidden activations dropped during training.

  Returns:
    A compiled keras.Model with one sigmoid output unit, trained with
    binary cross-entropy (rmsprop) and reporting accuracy.
  """
  # Reset Keras' global state before building, so repeated calls start clean.
  tf.keras.backend.clear_session()

  inputs = keras.Input(shape=(max_tokens,))
  x = layers.Dense(hidden_dense, activation="relu")(inputs)
  x = layers.Dropout(dropout_rate)(x)
  outputs = layers.Dense(1, activation="sigmoid")(x)

  model = keras.Model(inputs, outputs)
  model.compile(optimizer="rmsprop",
                loss="binary_crossentropy",
                metrics=["accuracy"])
  return model

In [21]:
# Build the classifier with its default sizes and print its layer/parameter table.
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Checkpoint to "binary_1gram.keras"; save_best_only keeps only the best epoch
# according to the callback's monitored quantity.
checkpoint_cb = keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                                save_best_only=True)
callbacks = [checkpoint_cb]

# Cache the vectorized splits so the text encoding is not recomputed each epoch.
cached_train_ds = binary_1gram_train_ds.cache()
cached_val_ds = binary_1gram_val_ds.cache()

model.fit(cached_train_ds,
          validation_data=cached_val_ds,
          epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7ed617cbb0>

In [40]:
# Reload the checkpointed model and score it on the held-out test split.
model = keras.models.load_model("binary_1gram.keras")
# evaluate() returns [loss, accuracy] for the metrics this model was compiled with.
test_loss, test_acc = model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.996


In [None]:
# Read one message to classify from the keyboard (blocks until Enter is pressed).
prompt = input("Enter a prompt :")

In [68]:
import numpy as np

# Keep the raw text under its own name. `prompt` is rebound to the vectorized
# batch below, so without this guard a second run of the cell would feed that
# array back into the text vectorizer.
if isinstance(prompt, str):
  prompt_text = prompt

# Encode the single message and add a leading batch axis for model.predict.
# NOTE(review): presumably a lone string encodes to shape (max_tokens,), giving
# (1, max_tokens) after expand_dims — confirm.
prompt = np.expand_dims(text_vectorization(prompt_text), axis=0)

In [None]:
# Score the vectorized prompt; the model's single sigmoid unit outputs a value in [0, 1].
# NOTE(review): class indices presumably follow the alphabetical order of the
# class folder names (0 = ham, 1 = spam) — confirm with train_ds.class_names.
model.predict(prompt)