In [88]:
# Load the Numpy binaries from file

import numpy as np

# Load the token ids, attention masks and labels saved as .npy files.
# Pickle support is deliberately disabled: np.load with allow_pickle=True can
# execute arbitrary code embedded in a file, and plain numeric arrays do not
# need it. np.load opens and closes the file itself when given a path.
Xids = np.load('tweets_xids.npy', allow_pickle=False)
Xmask = np.load('tweets_xmask.npy', allow_pickle=False)
labels = np.load('twitter-labels.npy', allow_pickle=False)

Combine the three arrays into a single TensorFlow `Dataset` object.

In [89]:
import tensorflow as tf

# Slice the three arrays together so that every dataset element is one
# (input ids, attention mask, label) triple.
tensors = (Xids, Xmask, labels)
dataset = tf.data.Dataset.from_tensor_slices(tensors)

# Show the dataset repr to confirm the per-element shapes and dtypes.
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

The model requires two input tensors (`input_ids` and `attention_mask`), so the **\<inputs\>** part of each element is passed as a dictionary:

```
{
    'input_ids': <input_id_tensor>,
    'attention_mask': <mask_tensor>
}
```

In [90]:
def map_func(input_ids, masks, labels):
    """Repack an (input_ids, masks, labels) triple as a (features, labels) pair.

    The two input tensors are grouped into a dictionary keyed by
    'input_ids' and 'attention_mask'; the labels are passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# Apply the repacking to every element with the dataset's map method.
# Note: `dataset` is rebound here, so re-running this cell would feed the
# already-converted two-item elements into the three-argument map_func.
dataset = dataset.map(map_func)

# Show the dataset repr to confirm the new (inputs dict, labels) structure.
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

Next, shuffle the data, and batch it. Drop any samples that don't fit evenly into chunks of 64.

In [91]:
batch_size = 64

# Shuffle once with a fixed order, then batch, dropping the final partial batch.
#
# reshuffle_each_iteration=False is essential: this dataset is later divided
# with take()/skip(), and by default shuffle() produces a new order every time
# the pipeline is iterated. With reshuffling enabled, the training and
# validation subsets would be drawn from different orders on each pass (and
# when each one is saved), so samples would leak between the two sets.
# The seed makes the order reproducible across kernel restarts.
#
# NOTE(review): the shuffle buffer holds 10000 samples; if the dataset has more
# rows than that, the shuffle is only approximate - confirm this is acceptable.
dataset = dataset.shuffle(
    10000, seed=42, reshuffle_each_iteration=False
).batch(batch_size, drop_remainder=True)

# Show the dataset repr to confirm the batched shapes.
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(64, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(64, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(64, 2), dtype=tf.float64, name=None))>

The final step is to split our data into training and validation sets.

In [92]:
# The dataset is split in a 90:10 ratio: the training set receives 90% of the
# batches and the validation set receives the remaining 10%.
split = 0.9

# Number of batches that go to the training set: the (fractional) batch count
# scaled by the split ratio and truncated to an integer.
batch_count = Xids.shape[0] / batch_size
size = int(batch_count * split)

size

449

In [93]:
# The first `size` batches form the training set; every batch after them
# forms the validation set.
train_ds, val_ds = dataset.take(size), dataset.skip(size)

# Drop the combined dataset's name now that only the two subsets are used.
del dataset

Save the prepared datasets to disk using [`tf.data.Dataset.save`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#save) (the successor to `tf.data.experimental.save`).

In [94]:
# Write each subset to its own directory ('train' and 'val').
train_ds.save('train')
val_ds.save('val')

Check the element specification of the two datasets, then load the saved training set back from disk.

In [95]:
# Display the element spec of the in-memory training dataset.
train_ds.element_spec

({'input_ids': TensorSpec(shape=(64, 512), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(64, 512), dtype=tf.int32, name=None)},
 TensorSpec(shape=(64, 2), dtype=tf.float64, name=None))

In [96]:
# Confirm that the validation and training datasets share the same element structure.
val_ds.element_spec == train_ds.element_spec

True

In [97]:
# Read the training set back from the 'train' directory, passing the in-memory
# dataset's element spec to describe the structure of the stored elements.
ds = tf.data.Dataset.load(
    'train',
    element_spec=train_ds.element_spec,
)