In [1]:
!pip install 'fhnw-nlp-utils>=0.1.3'
!pip install pyarrow fastparquet fasttext 
from fhnw.nlp.utils.storage import load_dataframe
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.colab import runs_on_colab

import numpy as np
import pandas as pd

import tensorflow as tf

print("Tensorflow version:", tf.__version__)

# Query the visible GPUs once and switch on memory growth for each of them.
# The same list is reused for the availability message below.
gpu_devices = tf.config.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting pyarrow
  Downloading pyarrow-5.0.0-cp36-cp36m-manylinux2014_x86_64.whl (23.6 MB)
[K     |████████████████████████████████| 23.6 MB 1.6 MB/s eta 0:00:01
[?25hCollecting fastparquet
  Downloading fastparquet-0.7.1-cp36-cp36m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.7 MB/s eta 0:00:01
Collecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 8.8 MB/s  eta 0:00:01
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 33.9 MB/s eta 0:00:01
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 28.0 MB/s eta 0:00:01
Building wheels for collected packages: thrift
  Building wheel for thr

In [2]:
# Load the TensorBoard notebook extension through IPython's `%load_ext` line magic.
# NOTE(review): none of the cells visible here uses TensorBoard - presumably it is
# needed further down the notebook; verify before keeping this cell.
%load_ext tensorboard

In [3]:
# Relative path of the tokenized German news articles (train and test in one
# file, judging by the file name); the `.parq` suffix is presumably Parquet.
# NOTE(review): the visible cells never download this file although a `download`
# helper is imported - it presumably has to exist locally already; confirm.
file = "data/german_news_articles_original_train_and_test_tokenized.parq"
# Read the file with the fhnw-nlp-utils helper. The following cell filters the
# result on a "split" column, so it is presumably a pandas DataFrame - verify.
data_all = load_dataframe(file)

In [None]:
# Partition the full frame into its training and test rows, using the value
# stored in the "split" column as the selector.
data_train_orig = data_all[data_all["split"].eq("train")]
data_test_orig = data_all[data_all["split"].eq("test")]

In [6]:
# Tiny in-memory dataset holding three strings, one element per string.
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
# Materialize the elements for display; as the output shows, the strings come
# back as Python `bytes` objects (b'foo', ...), not as `str`.
list(text_dataset.as_numpy_iterator())

[b'foo', b'bar', b'baz']

In [9]:
# Toy corpus that is only used to learn the vocabulary.
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.

# Create the layer: it turns each string into `max_len` integer token indices.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len,
)

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
vectorize_layer.adapt(text_dataset.batch(64))

# Create the model that uses the vectorize text layer.
model = tf.keras.models.Sequential()

# Start with an explicit input layer. Its shape is (1,) because every sample
# is exactly one string, and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices. An embedding layer could be appended afterwards to map these
# indices to learned embeddings; here the model stops at the indices.
model.add(vectorize_layer)

# Map raw strings to vocab indices. The nested Python list is converted to an
# explicit (3, 1) string tensor before it is handed to `predict`, so Keras
# cannot mistake the outer list for a collection of separate model inputs.
# As the output shows, the unseen word "qux" maps to index 1 and rows with
# fewer than `max_len` tokens are padded with 0.
input_data = [["foo qux foo bar"], ["qux baz"], ['foo']]
model.predict(tf.constant(input_data))



array([[2, 1, 2, 4],
       [1, 3, 0, 0],
       [2, 0, 0, 0]])

In [17]:
max_len = 4  # Sequence length to pad the outputs to.
vocab_data = ["earth", "wind", "and", "fire"]

# Build the vectorization layer from a fixed vocabulary instead of adapting it
# to a dataset. The `vocabulary` argument also accepts a path to a file that
# holds one vocabulary word per line.
# `max_features` is not assigned in this cell; the value set in an earlier
# cell is reused.
vectorize_layer = tf.keras.layers.TextVectorization(
    vocabulary=vocab_data,
    max_tokens=max_features,
    output_sequence_length=max_len,
    output_mode='int',
)

# Assemble the string-to-indices model in one go:
#   1. an explicit input of shape (1,) and dtype 'string', i.e. one string
#      per sample;
#   2. the vectorization layer, which yields a (batch_size, max_len) tensor
#      of vocab indices.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
])

# No `adapt` call is needed because the vocabulary was supplied up front.
# Besides the passed tokens, the vocabulary contains the padding token ('')
# and the OOV token ('[UNK]') at the first two positions.
vectorize_layer.get_vocabulary()


['', '[UNK]', 'earth', 'wind', 'and', 'fire']

In [20]:
# Map raw strings to vocab indices with the vectorization-only model. The
# nested Python list is converted to an explicit (3, 1) string tensor before it
# is handed to `predict`, so Keras cannot mistake the outer list for a
# collection of separate model inputs.
# As the output shows, words outside the vocabulary map to index 1 ('[UNK]'),
# inputs longer than `max_len` tokens are cut off and shorter ones are padded
# with 0. An embedding layer could be appended to the model afterwards to map
# these indices to learned embeddings.
input_data = [["earth wind among other things fire"], ["fire fighter"], ['foo']]
model.predict(tf.constant(input_data))

array([[2, 3, 1, 1],
       [5, 1, 0, 0],
       [1, 0, 0, 0]])