In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric per epoch, plus its validation curve when recorded.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value sequences (presumably the ``History`` returned by
            ``model.fit`` -- confirm against the caller).
        metric: Name of the training metric to plot, e.g. ``'accuracy'``.

    Raises:
        KeyError: If ``metric`` is not a key of ``history.history``.
    """
    recorded = history.history
    val_metric = 'val_' + metric

    plt.plot(recorded[metric])
    legend_labels = [metric]

    # The 'val_*' series is only present when validation data was supplied to
    # training; skip it instead of failing with KeyError.
    if val_metric in recorded:
        plt.plot(recorded[val_metric])
        legend_labels.append(val_metric)

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(legend_labels)

In [3]:
# Load the description / genre table from the local CSV file.
data = pd.read_csv(filepath_or_buffer="./datos/datos_p1.csv")

In [4]:
# Render the loaded DataFrame as this cell's output.
data

Unnamed: 0,Description,Main Genre
0,winning will make you famous. losing means cer...,Young Adult
1,there is a door at the end of a silent corrido...,Fantasy
2,the unforgettable novel of a childhood in a sl...,Classics
3,about three things i was absolutely positive.f...,Young Adult
4,trying to make sense of the horrors of world w...,Historical
...,...,...
43791,avi steinberg is stumped. after defecting from...,Nonfiction
43792,"in this fearless and half-crazy story, howard ...",Sports
43793,from the icons of the game to the players who ...,Nonfiction
43794,"soon to be a major motion picture, from brad p...",Nonfiction


In [5]:
# Preview column 1 (the genre labels) as a NumPy array.
data.to_numpy()[:, 1]

array(['Young Adult', 'Fantasy', 'Classics', ..., 'Nonfiction',
       'Nonfiction', 'Criticism'], dtype=object)

In [6]:
# Independent NumPy copy of the table, so later in-place operations on
# `dataset` do not touch the DataFrame.
dataset = data.to_numpy(copy=True)

In [7]:
# Shuffle the rows in place with a fixed seed so that Restart & Run All
# reproduces the same row order, and therefore the same positional
# train/test split. Generator.shuffle permutes along the first axis, so each
# (description, genre) row stays intact.
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(dataset)

In [8]:
# Display the (rows, columns) shape of the NumPy copy.
dataset.shape

(43796, 2)

In [9]:
# Sorted array of the distinct genre labels found in column 1; its length is
# the number of output classes.
out_vector = np.unique(dataset[:, 1])
diff_values = out_vector.shape[0]

In [10]:
# Show the first five distinct genre labels.
out_vector[:5]

array(['Abandoned', 'Academic', 'Action', 'Adult', 'Adult Fiction'],
      dtype=object)

In [11]:
# One-hot encode each row's genre against the distinct genres in `out_vector`.
# Broadcasting the (n_rows, 1) label column against the (n_classes,) vector
# compares every pair in a single vectorized step instead of nested Python
# loops. The result is an (n_rows, n_classes) int64 array of 0/1 values with
# the same contents as the former list of lists.
label_vector = (dataset[:, 1][:, np.newaxis] == out_vector).astype(np.int64)

In [12]:
# Make sure the one-hot labels are held as a NumPy array.
label_vector = np.asarray(label_vector)

In [13]:
# Display the shape of the one-hot label matrix (rows x distinct genres).
label_vector.shape

(43796, 194)

In [14]:
# Column 0 of the shuffled table: the description strings fed to the model.
texts = dataset[..., 0]

In [15]:
# Size of the training split: 70% of the rows, truncated to an integer.
size_train = int(0.7 * len(dataset))

In [16]:
# Display the computed training-split size.
size_train

30657

In [17]:
# Positional split of the texts: the first `size_train` entries are used for
# training, the remainder is held out for testing.
train_dataset_X, test_dataset_X = texts[:size_train], texts[size_train:]

In [18]:
# Build the training pipeline from the first `size_train` rows only, so the
# rows held out in `test_dataset_X` are never seen during training. Batching
# the whole corpus here would leak the held-out rows into the model.
docs = tf.data.Dataset.from_tensor_slices(
    (train_dataset_X, label_vector[:size_train])
).batch(10)

In [19]:
# Show the batched dataset's repr (element shapes and dtypes).
docs

<BatchDataset shapes: ((None,), (None, 194)), types: (tf.string, tf.int64)>

In [20]:
# Peek at the one-hot label tensor of the first batch only.
for _descriptions, one_hot_labels in docs.take(1):
    print(one_hot_labels)

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(10, 194), dtype=int64)


In [21]:
# Vocabulary cap and fixed token-sequence length used by the text vectorizer
# and the embedding layer.
vocab_size, sequence_length = 10_000, 100

In [22]:
# Layer mapping raw strings to integer token ids: at most `vocab_size`
# distinct tokens, each output sequence fixed to `sequence_length` ids.
vectorize_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
)

In [24]:
# Learn the vocabulary from the training texts only, so tokens that appear
# exclusively in the held-out rows do not influence the fitted vocabulary.
vectorize_layer.adapt(train_dataset_X)

In [25]:
# TensorBoard callback configured to log under the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="logs",
)

In [26]:
# Size of each learned token embedding vector.
embedding_dim = 16

# Assemble the classifier one layer at a time: raw strings -> token ids ->
# embeddings -> average over the sequence -> hidden layer -> one raw output
# per distinct genre (no activation on the last layer).
model = Sequential()
model.add(vectorize_layer)
model.add(Embedding(vocab_size, embedding_dim, name="embedding"))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(diff_values))

In [27]:
# The labels are one-hot vectors and the final Dense layer has no activation,
# hence categorical cross-entropy computed from logits.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [28]:
# Keep the returned History so the per-epoch metrics can be plotted afterwards
# (e.g. with plot_graphs), and pass the TensorBoard callback so that it is
# actually invoked during training instead of sitting unused.
history = model.fit(docs, epochs=15, callbacks=[tensorboard_callback])
history


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f37019a55e0>