In [1]:
! pip install -q tensorflow_datasets

In [118]:
!git clone https://github.com/CQCL/qnlp_lorenz_etal_2021_resources
!mv qnlp_lorenz_etal_2021_resources/datasets mc_rp_dataset

fatal: destination path 'qnlp_lorenz_etal_2021_resources' already exists and is not an empty directory.
mv: cannot stat 'qnlp_lorenz_etal_2021_resources/datasets': No such file or directory


In [184]:
from pathlib import Path
import pandas as pd
from joblib import load, dump
from sklearn.model_selection import train_test_split

In [120]:
# Collect every .txt file of the dataset folder and split the paths by the
# task prefix of the file name ("mc" vs "rp").
mc_rp_sets_path = Path("mc_rp_dataset")
mc_rp_sets = [txt_file for txt_file in mc_rp_sets_path.glob("*.txt")]
mc_datasets = [txt_file for txt_file in mc_rp_sets if txt_file.name.startswith("mc")]
rp_datasets = [txt_file for txt_file in mc_rp_sets if txt_file.name.startswith("rp")]

In [121]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

import matplotlib.pyplot as plt

In [153]:
def read_process_mcrp(datapaths: list):
    """Load MC/RP dataset files into DataFrames keyed by file name.

    Every non-blank line is parsed as ``<label> <token> <token> ...``: the
    first whitespace-separated field is the integer label and the rest is the
    sentence. Each token is cut at its first ``_`` (e.g. ``cooks_V`` becomes
    ``cooks``); tokens without an underscore are kept unchanged.

    Args:
        datapaths: iterable of ``pathlib.Path`` objects pointing at the files.

    Returns:
        dict mapping each file name (up to its first ".") to a DataFrame with
        the columns ``label`` (int) and ``text`` (str).

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    def rm(text):
        # partition() keeps a token whole when it has no "_"; slicing with
        # str.find() would use -1 there and drop the token's last character.
        return " ".join(token.partition("_")[0] for token in text.split())

    retval = {}
    for datapath in datapaths:
        records = []
        for line in datapath.read_text().splitlines():
            # Blank lines (e.g. from a trailing newline) carry no sample.
            if not line.strip():
                continue
            # Splitting on whitespace copes with one or several separators
            # after the label, so no per-task character offset is needed.
            fields = line.split(None, 1)
            sentence = fields[1] if len(fields) > 1 else ""
            records.append([int(fields[0]), rm(sentence)])
        retval[datapath.name.split(".")[0]] = pd.DataFrame(records, columns=['label', 'text'])
    return retval

mc_data, rp_data = read_process_mcrp(mc_datasets), read_process_mcrp(rp_datasets)

In [168]:
rp_data.keys()

dict_keys(['rp_test_data', 'rp_train_data'])

# MC Task

In [154]:
mc_data.keys()

dict_keys(['mc_dev_data', 'mc_train_data', 'mc_test_data'])

In [155]:
ds=tf.data.Dataset.from_tensor_slices((
            tf.cast(mc_data['mc_train_data'].text.values, tf.string),
            tf.cast(mc_data['mc_train_data'].label.values, tf.int64)
        ))

In [156]:
v_data=tf.data.Dataset.from_tensor_slices((
            tf.cast(mc_data['mc_test_data'].text.values, tf.string),
            tf.cast(mc_data['mc_test_data'].label.values, tf.int64)
        ))

In [157]:
BUFFER_SIZE = 100
BATCH_SIZE = 10


ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
v_data = v_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [158]:
for example, label in ds.take(1):
    print(f"Texts : {example.numpy()[:3]} \n")
    print(f"Labels: {label.numpy()[:3]} \n")
    print("----")

Texts : [b'woman runs useful application' b'man cooks tasty meal'
 b'person cooks meal'] 

Labels: [0 1 1] 

----


In [161]:
VOCAB_SIZE = 20
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())

In [159]:
example.numpy()[:3]

array([b'woman runs useful application', b'man cooks tasty meal',
       b'person cooks meal'], dtype=object)

In [160]:
encoded_example = encoder(example)[:].numpy()
encoded_example

array([[ 4, 11, 13, 18],
       [ 2,  8, 10,  7],
       [ 3,  8,  7,  0],
       [ 2,  6, 13, 18],
       [ 2, 11, 14,  0],
       [ 5,  2, 11, 14],
       [ 5,  3,  9,  7],
       [ 3,  6, 14,  0],
       [ 4,  9, 12,  0],
       [ 4,  8, 10, 12]])

In [162]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[ 4, 11, 13, 18],
       [ 2,  8, 10,  7],
       [ 3,  8,  7,  0]])

In [163]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [164]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_6 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_9 (Embedding)     (None, None, 2)           38        
                                                                 
 bidirectional_9 (Bidirecti  (None, 16)                704       
 onal)                                                           
                                                                 
 dense_18 (Dense)            (None, 8)                 136       
                                                                 
 dense_19 (Dense)            (None, 1)                 9         
                                                                 
Total params: 887 (3.46 KB)
Trainable params: 887 (3.4

In [165]:
# The loss is configured with from_logits=True, so predictions are raw logits:
# accuracy has to threshold them at 0.0 (sigmoid(0) == 0.5). The 'accuracy'
# shortcut would compare the logits against 0.5 instead.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [166]:
# validation_steps is left unset so every epoch validates on the complete
# validation set; a fixed count of 10 batches can exceed what v_data provides.
history = model.fit(ds, epochs=180,
                    validation_data=v_data)

Epoch 1/180
1/7 [===>..........................] - ETA: 55s - loss: 0.6919 - accuracy: 0.4000



Epoch 2/180
Epoch 3/180
Epoch 4/180
Epoch 5/180
Epoch 6/180
Epoch 7/180
Epoch 8/180
Epoch 9/180
Epoch 10/180
Epoch 11/180
Epoch 12/180
Epoch 13/180
Epoch 14/180
Epoch 15/180
Epoch 16/180
Epoch 17/180
Epoch 18/180
Epoch 19/180
Epoch 20/180
Epoch 21/180
Epoch 22/180
Epoch 23/180
Epoch 24/180
Epoch 25/180
Epoch 26/180
Epoch 27/180
Epoch 28/180
Epoch 29/180
Epoch 30/180
Epoch 31/180
Epoch 32/180
Epoch 33/180
Epoch 34/180
Epoch 35/180
Epoch 36/180
Epoch 37/180
Epoch 38/180
Epoch 39/180
Epoch 40/180
Epoch 41/180
Epoch 42/180
Epoch 43/180
Epoch 44/180
Epoch 45/180
Epoch 46/180
Epoch 47/180
Epoch 48/180
Epoch 49/180
Epoch 50/180
Epoch 51/180
Epoch 52/180
Epoch 53/180
Epoch 54/180
Epoch 55/180
Epoch 56/180
Epoch 57/180
Epoch 58/180
Epoch 59/180
Epoch 60/180
Epoch 61/180
Epoch 62/180
Epoch 63/180
Epoch 64/180
Epoch 65/180
Epoch 66/180
Epoch 67/180
Epoch 68/180
Epoch 69/180
Epoch 70/180
Epoch 71/180
Epoch 72/180
Epoch 73/180
Epoch 74/180
Epoch 75/180
Epoch 76/180
Epoch 77/180
Epoch 78/180
Epoch 7

# RP Task

In [170]:
ds=tf.data.Dataset.from_tensor_slices((
            tf.cast(rp_data['rp_train_data'].text.values, tf.string),
            tf.cast(rp_data['rp_train_data'].label.values, tf.int64)
        ))

In [171]:
v_data=tf.data.Dataset.from_tensor_slices((
            tf.cast(rp_data['rp_test_data'].text.values, tf.string),
            tf.cast(rp_data['rp_test_data'].label.values, tf.int64)
        ))

In [172]:
BUFFER_SIZE = 100
BATCH_SIZE = 10


ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
v_data = v_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [173]:
for example, label in ds.take(1):
    print(f"Texts : {example.numpy()[:3]} \n")
    print(f"Labels: {label.numpy()[:3]} \n")
    print("----")

Texts : [b'person that join movement' b'quality that church teach'
 b'player that strike batter'] 

Labels: [0 1 0] 

----


In [175]:
VOCAB_SIZE = 100
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())

In [176]:
encoded_example = encoder(example)[:].numpy()
encoded_example

array([[ 9,  2, 14, 37],
       [24,  2, 12, 63],
       [ 8,  2, 21, 54],
       [ 8,  2, 96, 72],
       [ 3,  2, 71, 14],
       [10,  2, 30, 21],
       [ 6,  2, 68, 48],
       [ 4,  2, 15, 11],
       [56,  2, 53, 23],
       [ 3,  2, 47, 12]])

In [177]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [178]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_7 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_10 (Embedding)    (None, None, 2)           196       
                                                                 
 bidirectional_10 (Bidirect  (None, 16)                704       
 ional)                                                          
                                                                 
 dense_20 (Dense)            (None, 8)                 136       
                                                                 
 dense_21 (Dense)            (None, 1)                 9         
                                                                 
Total params: 1045 (4.08 KB)
Trainable params: 1045 (

In [179]:
# The loss is configured with from_logits=True, so predictions are raw logits:
# accuracy has to threshold them at 0.0 (sigmoid(0) == 0.5). The 'accuracy'
# shortcut would compare the logits against 0.5 instead.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [180]:
# validation_steps is left unset so every epoch validates on the complete
# validation set; a fixed count of 10 batches can exceed what v_data provides.
history = model.fit(ds, epochs=300,
                    validation_data=v_data)

Epoch 1/300



Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 7

In [181]:
!wget https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip
!unzip -o sentiment+labelled+sentences.zip
!rm "sentiment labelled sentences/readme.txt"

--2023-10-30 20:44:01--  https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘sentiment+labelled+sentences.zip’

sentiment+labelled+     [  <=>               ]  82.21K   407KB/s    in 0.2s    

2023-10-30 20:44:01 (407 KB/s) - ‘sentiment+labelled+sentences.zip’ saved [84188]

Archive:  sentiment+labelled+sentences.zip
   creating: sentiment labelled sentences/
  inflating: sentiment labelled sentences/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/sentiment labelled sentences/
  inflating: __MACOSX/sentiment labelled sentences/._.DS_Store  
  inflating: sentiment labelled sentences/amazon_cells_labelled.txt  
  inflating: sentiment labelled sentences/imdb_labelled.txt  
  inflating: __MACOSX/sentiment labelled sente

In [185]:
# Separating Amazon, IMDb, Yelp dataset paths
datasets_path = Path("sentiment labelled sentences")
datasets = list(datasets_path.glob("*.txt"))

# Reading every "<sentence>\t<label>" file into a DataFrame keyed by file name
def read_process(datapaths: list):
    """Parse tab-separated sentiment files into DataFrames.

    Each line is expected to be ``<sentence>\\t<label>``. Lines without a tab
    (such as the empty string after a trailing newline) are skipped.

    Args:
        datapaths: iterable of ``pathlib.Path`` objects pointing at the files.

    Returns:
        dict mapping each file name (up to its first ".") to a DataFrame with
        the columns ``text`` (str) and ``label`` (int).

    Raises:
        ValueError: if the field after the last tab is not an integer.
    """
    retval = {}
    for datapath in datapaths:
        records = []
        for line in datapath.read_text().split("\n"):
            # Split on the LAST tab: a tab inside the sentence must not shift
            # the label into a third column.
            text, sep, label = line.rpartition("\t")
            if not sep:
                continue  # no label on this line, nothing to keep
            records.append([text, int(label)])
        retval[datapath.name.split(".")[0]] = pd.DataFrame(records, columns=['text', 'label'])
    return retval

def ttsplit(data: pd.DataFrame, test_size=0.2):
    """Split a labelled frame into stratified train and test parts.

    Args:
        data: DataFrame with a ``label`` column used for stratification.
        test_size: value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(train_data, test_data)``, each with a fresh 0..n-1 index.
    """
    train_data, test_data = train_test_split(
        data, test_size=test_size, stratify=data['label'], random_state=42
    )
    # drop=True discards the old index directly, so this also works when the
    # index is named or the frame already owns a column called "index".
    return train_data.reset_index(drop=True), test_data.reset_index(drop=True)



datadict = read_process(datasets)
amazon_data = datadict['amazon_cells_labelled']
imdb_data = datadict['imdb_labelled']
yelp_data = datadict['yelp_labelled']

amazon_train, amazon_test = ttsplit(amazon_data, 0.2)
imdb_train, imdb_test = ttsplit(imdb_data, 0.2)
yelp_train, yelp_test = ttsplit(yelp_data, 0.2)

# Amazon

In [187]:
ds=tf.data.Dataset.from_tensor_slices((
            tf.cast(amazon_train.text.values, tf.string),
            tf.cast(amazon_train.label.values, tf.int64)
        ))

In [188]:
v_data=tf.data.Dataset.from_tensor_slices((
            tf.cast(amazon_test.text.values, tf.string),
            tf.cast(amazon_test.label.values, tf.int64)
        ))

In [189]:
BUFFER_SIZE = 1000
BATCH_SIZE = 100


ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
v_data = v_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [190]:
for example, label in ds.take(1):
    print(f"Texts : {example.numpy()[:3]} \n")
    print(f"Labels: {label.numpy()[:3]} \n")
    print("----")

Texts : [b'We would recommend these to others.'
 b'The first thing that happened was that the tracking was off.'
 b"But when I check voice mail at night, the keypad backlight turns off a few seconds into the first message, and then I'm lost."] 

Labels: [1 0 0] 

----


In [192]:
VOCAB_SIZE = 2400
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())

In [193]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [194]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_9 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_11 (Embedding)    (None, None, 2)           3274      
                                                                 
 bidirectional_11 (Bidirect  (None, 16)                704       
 ional)                                                          
                                                                 
 dense_22 (Dense)            (None, 8)                 136       
                                                                 
 dense_23 (Dense)            (None, 1)                 9         
                                                                 
Total params: 4123 (16.11 KB)
Trainable params: 4123 

In [195]:
# The loss is configured with from_logits=True, so predictions are raw logits:
# accuracy has to threshold them at 0.0 (sigmoid(0) == 0.5). The 'accuracy'
# shortcut would compare the logits against 0.5 instead.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [196]:
# validation_steps is left unset so every epoch validates on the complete
# validation set; a fixed count of 10 batches can exceed what v_data provides.
history = model.fit(ds, epochs=300,
                    validation_data=v_data)

Epoch 1/300



Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 7

# IMDB

In [197]:
ds=tf.data.Dataset.from_tensor_slices((
            tf.cast(imdb_train.text.values, tf.string),
            tf.cast(imdb_train.label.values, tf.int64)
        ))

In [198]:
v_data=tf.data.Dataset.from_tensor_slices((
            tf.cast(imdb_test.text.values, tf.string),
            tf.cast(imdb_test.label.values, tf.int64)
        ))

In [199]:
BUFFER_SIZE = 1000
BATCH_SIZE = 100


ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
v_data = v_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [200]:
VOCAB_SIZE = 3300
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())

In [201]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [202]:
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_10 (Tex  (None, None)              0         
 tVectorization)                                                 
                                                                 
 embedding_12 (Embedding)    (None, None, 2)           5518      
                                                                 
 bidirectional_12 (Bidirect  (None, 16)                704       
 ional)                                                          
                                                                 
 dense_24 (Dense)            (None, 8)                 136       
                                                                 
 dense_25 (Dense)            (None, 1)                 9         
                                                                 
Total params: 6367 (24.87 KB)
Trainable params: 6367 

In [203]:
# The loss is configured with from_logits=True, so predictions are raw logits:
# accuracy has to threshold them at 0.0 (sigmoid(0) == 0.5). The 'accuracy'
# shortcut would compare the logits against 0.5 instead.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [204]:
# validation_steps is left unset so every epoch validates on the complete
# validation set; a fixed count of 10 batches can exceed what v_data provides.
history = model.fit(ds, epochs=300,
                    validation_data=v_data)

Epoch 1/300



Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 7

# YELP

In [205]:
ds=tf.data.Dataset.from_tensor_slices((
            tf.cast(yelp_train.text.values, tf.string),
            tf.cast(yelp_train.label.values, tf.int64)
        ))

In [206]:
v_data=tf.data.Dataset.from_tensor_slices((
            tf.cast(yelp_test.text.values, tf.string),
            tf.cast(yelp_test.label.values, tf.int64)
        ))

In [207]:
BUFFER_SIZE = 1000
BATCH_SIZE = 100


ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
v_data = v_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [208]:
VOCAB_SIZE = 2000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(ds.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())

In [209]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [210]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_11 (Tex  (None, None)              0         
 tVectorization)                                                 
                                                                 
 embedding_13 (Embedding)    (None, None, 2)           3630      
                                                                 
 bidirectional_13 (Bidirect  (None, 16)                704       
 ional)                                                          
                                                                 
 dense_26 (Dense)            (None, 8)                 136       
                                                                 
 dense_27 (Dense)            (None, 1)                 9         
                                                                 
Total params: 4479 (17.50 KB)
Trainable params: 4479 

In [211]:
# The loss is configured with from_logits=True, so predictions are raw logits:
# accuracy has to threshold them at 0.0 (sigmoid(0) == 0.5). The 'accuracy'
# shortcut would compare the logits against 0.5 instead.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [212]:
# validation_steps is left unset so every epoch validates on the complete
# validation set; a fixed count of 10 batches can exceed what v_data provides.
history = model.fit(ds, epochs=300,
                    validation_data=v_data)

Epoch 1/300



Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 7