In [72]:
import os

os.environ["KERAS_BACKEND"] = "jax"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

In [73]:
df = pd.read_csv('data.csv', usecols=['type', 'metre', 'text'])
df = df[['metre', 'text']]

In [74]:
df['metre'].unique()

array([0, 1, 4, 2, 5, 3, 6, 7])

In [75]:
train, val, test = np.split(df.sample(frac=1), [int(.8 * len(df)), int(.9 * len(df))])

  return bound(*args, **kwds)


In [76]:
len(train), len(val), len(test)

(1670806, 208851, 208851)

In [77]:
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
  df = dataframe.copy()
  labels = df.pop('metre')
  df = df["text"]
  ds = tf.data.Dataset.from_tensor_slices((df, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)

  return ds

In [82]:
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [79]:
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(train_data.map(lambda text, label: text))

In [61]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]
len(vocab)

25722

In [62]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        mask_zero=True
    ),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(8, activation='sigmoid')
])

In [63]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [64]:
history = model.fit(train_data, epochs=20, validation_data=valid_data)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [66]:
model.evaluate(test_data)



[0.2925044894218445, 0.9175776243209839]

In [67]:
model.save('metre_detector.keras')

In [71]:
np.argmax(model.predict(test_data)[0])
list(test_data)[0]



(<tf.Tensor: shape=(1024,), dtype=string, numpy=
 array([b'"a zid vy pra vo val da vu"',
        b'"i kra lov ska krev hri chy zcer na la"',
        b'"a tak sni ve taj na pl ne"', ...,
        b'"a zi je od vu ne a ca sem od li ban ku"',
        b'"kde pak je ta hol ka nu ty jsi nam da la"',
        b'"ostre jsem vec sou de ruz nych hle disk"'], dtype=object)>,
 <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1, 0, 1, ..., 0, 1, 1])>)