In [1]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Bidirectional, \
    Conv1D, MaxPool1D, Dense, Attention
from tensorflow.keras.models import Sequential
from tensorflow.keras.activations import softmax
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

import mlflow

In [None]:
# List the files of one class directory to check how the training data is laid out on disk.
os.listdir("./data/train/avenged_sevenfold/")

In [None]:
# Dataset locations, relative to the notebook's working directory.
TRAIN_BASE_DIR = "./data/train/"
TEST_BASE_DIR = "./data/test/"

# Number of units of the softmax output layer in the models below.
N_CLASSES = 5

BATCH_SIZE = 16
MAX_VOCABULARY_SIZE = 10000 # maximum number of unique words kept by the vectorizer
MAX_OUTPUT_LENGTH = 500 # length of each vectorized sequence (number of tokens)
EMBEDDING_DIMENSIONS = 128
LEARNING_RATE = 1e-2

In [None]:
# Load the labelled training texts from TRAIN_BASE_DIR.
# `batch_size` has to be passed by keyword: the second positional parameter of
# `text_dataset_from_directory` is `labels`, not the batch size.
raw_train_dataset = tf.keras.utils.text_dataset_from_directory(
    TRAIN_BASE_DIR,
    batch_size = BATCH_SIZE)

train_dataset = raw_train_dataset \
    .cache() \
    .prefetch(buffer_size = tf.data.AUTOTUNE)

# `.cache()` / `.prefetch()` return new dataset objects that do not carry the
# `class_names` attribute, so re-attach it for the cells that read it later.
train_dataset.class_names = raw_train_dataset.class_names

In [None]:
# Load the held-out texts; they are used as validation data by the training cells.
# The directory constant is TEST_BASE_DIR, and `batch_size` must be a keyword argument
# because the second positional parameter of `text_dataset_from_directory` is `labels`.
test_dataset = tf.keras.utils.text_dataset_from_directory(
    TEST_BASE_DIR,
    batch_size = BATCH_SIZE)

In [None]:
# Peek at the label tensor (second element) of the first training batch.
for first_batch in train_dataset.take(1):
    print(first_batch[1])

In [None]:
# Take the first batch and show its second example together with its label.
# `texts` and `labels` intentionally stay bound at notebook scope.
texts, labels = next(iter(train_dataset))
print(texts[1])
print(labels[1])

In [None]:
# Show the (index, class name) pairs of the training dataset.
# NOTE(review): `class_names` is attached by `text_dataset_from_directory`; confirm it is
# still present on `train_dataset` after the `.cache().prefetch()` chain.
list(enumerate(train_dataset.class_names))

In [None]:
# Text -> integer-sequence layer: the vocabulary is capped at MAX_VOCABULARY_SIZE tokens
# and every output sequence has exactly MAX_OUTPUT_LENGTH entries.
vectorizer = TextVectorization(
    max_tokens = MAX_VOCABULARY_SIZE, 
    output_sequence_length = MAX_OUTPUT_LENGTH)

In [None]:
def get_texts(texts, labels):
    """Return only the texts of a `(texts, labels)` pair; the labels are ignored."""
    return texts

# Print the first batch after the labels have been stripped by `get_texts`.
for text_batch in train_dataset.map(get_texts).take(1):
    print(text_batch)

In [None]:
# Vocabulary size at this point; `vectorizer.adapt(...)` is only called in a later cell.
vectorizer.vocabulary_size()

In [None]:
# Vocabulary contents at this point; `vectorizer.adapt(...)` is only called in a later cell.
vectorizer.get_vocabulary()

In [None]:
# Build the vocabulary from the training texts only (labels are dropped by `get_texts`).
vectorizer.adapt(train_dataset.map(get_texts))

In [None]:
# Sanity check: vectorize two hand-written sentences.
vectorizer(["before the story begins is it such a sin", "this is a text"])

In [None]:
# Vectorize the first text of the batch left in `texts` by an earlier cell.
vectorizer(texts[0].numpy())

In [None]:
# Construct a stand-alone Embedding layer; the result is only displayed, not stored.
Embedding(input_dim = MAX_VOCABULARY_SIZE, output_dim = EMBEDDING_DIMENSIONS)

In [None]:
# Reuse the experiment when it already exists: `mlflow.create_experiment` raises when the
# name is taken, which would break re-running the notebook from a fresh kernel.
experiment_name = "simplest possible LSTM"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment(experiment_name))

In [None]:
# Reset the global Keras state before building a new model.
tf.keras.backend.clear_session()

In [None]:
# Baseline classifier: vectorize -> embed -> single LSTM -> softmax over the classes.
# A leftover `tf.summary.histogram(...)` debug call with an unbalanced parenthesis
# (a SyntaxError) used to sit at the top of this cell and has been removed.
texts_model = Sequential([
    vectorizer,
    Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS),
    # Keep only the final LSTM output so the model emits one prediction per text;
    # with `return_sequences = True` the Dense layer would produce one prediction per
    # time step, which does not match the one-integer-per-text labels.
    LSTM(64),
    # N_CLASSES is used (as in the later models) instead of reading `class_names`
    # from the dataset object.
    Dense(N_CLASSES, activation = softmax)
])

In [None]:
# Print the layer-by-layer summary of the current model.
texts_model.summary()

In [None]:
# Check the shape of the model output for a single input text.
texts_model.predict(["this is a test"]).shape

In [None]:
# Show the raw prediction of the (still untrained) model for a single input text.
texts_model.predict(["this is a test"])

In [None]:
# Same check with a batch of two input texts.
texts_model.predict(["this is a test", "this is another test"])

In [None]:
# Compile variant 1: the loss is given as a loss object.
# The class is `SparseCategoricalCrossentropy` (one word) and has to be instantiated.
texts_model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    )

In [None]:
# Compile variant 2: loss and optimizer are selected by their string identifiers.
texts_model.compile(
    loss = "sparse_categorical_crossentropy",
    optimizer = "adam"
    )

In [None]:
# Compile variant 3 (the one used for training): explicit loss and optimizer objects so
# the learning rate can be configured, plus accuracy as a reported metric.
texts_model.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(learning_rate = LEARNING_RATE),
    metrics = ["acc"]
    )

In [None]:
# Train inside an MLflow run so parameters and metrics are tracked together.
with mlflow.start_run(experiment_id = experiment_id) as run:
    mlflow.log_param("learning_rate", LEARNING_RATE)

    run_id = run.info.run_id
    history = texts_model.fit(
        train_dataset,
        epochs = 10,
        validation_data = test_dataset,
        callbacks = [
            TensorBoard(
                # Curly braces are required for f-string interpolation; with
                # parentheses every run wrote to the same literal directory.
                log_dir = f"./logs-{experiment_id}-{run_id}",
                write_graph = True),
            ModelCheckpoint(
                f"./checkpoints-{experiment_id}-{run_id}",
                save_weights_only = False,
                save_best_only = False)])

    # A History object is not a meaningful parameter value; log every per-epoch
    # value it collected as an MLflow metric instead.
    for metric_name, epoch_values in history.history.items():
        for epoch, value in enumerate(epoch_values):
            mlflow.log_metric(metric_name, float(value), step = epoch)

In [None]:
# `load_weights` requires the location to read from; calling it without arguments raises
# a TypeError. The path is the checkpoint directory named after the experiment and run.
# NOTE(review): confirm this matches the path actually written by ModelCheckpoint.
texts_model.load_weights(f"./checkpoints-{experiment_id}-{run_id}")

In [None]:
# `history.history` is a dict of per-epoch values, so it is indexed, not called.
history.history["loss"]

In [None]:
# Reuse the experiment when it already exists: `mlflow.create_experiment` raises when the
# name is taken, which would break re-running the notebook from a fresh kernel.
experiment_name = "BiDirectional LSTM"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment(experiment_name))

In [None]:
# Bidirectional LSTM classifier: vectorize -> embed -> BiLSTM(128) -> softmax.
tf.keras.backend.clear_session()
texts_model = Sequential([
    vectorizer,
    Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS),
    Bidirectional(LSTM(128)),
    # N_CLASSES is used (as in the later models) instead of reading `class_names`
    # from the dataset object.
    Dense(N_CLASSES, activation = softmax)
])

texts_model.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(learning_rate = LEARNING_RATE),
    metrics = ["acc"]
)

In [None]:
# Inspect the weights of the layer at index 2 of the current model.
texts_model.layers[2].weights

In [None]:
# Train inside an MLflow run so parameters and metrics are tracked together.
with mlflow.start_run(experiment_id = experiment_id) as run:
    mlflow.log_param("learning_rate", LEARNING_RATE)
    # Number of units of the LSTM wrapped by the Bidirectional layer of this model
    # (the learning rate used to be logged under this key by mistake).
    mlflow.log_param("lstm_units", 128)

    run_id = run.info.run_id
    history = texts_model.fit(
        train_dataset,
        epochs = 10,
        validation_data = test_dataset,
        callbacks = [
            TensorBoard(
                # Curly braces are required for f-string interpolation.
                log_dir = f"./logs-{experiment_id}-{run_id}",
                write_graph = True)])

In [None]:
# Reuse the experiment when it already exists: `mlflow.create_experiment` raises when the
# name is taken, which would break re-running the notebook from a fresh kernel.
experiment_name = "Multi_layer BiDirectional LSTM"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment(experiment_name))

In [None]:
# Two stacked bidirectional LSTMs: the first returns the whole sequence so the second
# has a sequence to consume; the second returns only its final output.
tf.keras.backend.clear_session()
texts_model = Sequential([
    vectorizer,
    Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS),
    Bidirectional(LSTM(64, return_sequences = True)),
    Bidirectional(LSTM(64)),
    Dense(N_CLASSES, activation = softmax)
])

texts_model.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(learning_rate = LEARNING_RATE),
    metrics = ["acc"]
)

In [None]:
# Print the layer-by-layer summary of the current model.
texts_model.summary()

In [None]:
# Train inside an MLflow run so parameters and metrics are tracked together.
with mlflow.start_run(experiment_id = experiment_id) as run:
    mlflow.log_param("learning_rate", LEARNING_RATE)

    run_id = run.info.run_id
    history = texts_model.fit(
        train_dataset,
        epochs = 10,
        validation_data = test_dataset,
        callbacks = [
            TensorBoard(
                # Curly braces are required for f-string interpolation.
                log_dir = f"./logs-{experiment_id}-{run_id}",
                write_graph = True)])

In [None]:
# Reuse the experiment when it already exists: `mlflow.create_experiment` raises when the
# name is taken, which would break re-running the notebook from a fresh kernel.
experiment_name = "1D CNN"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment(experiment_name))

In [None]:
# 1D convolutional classifier: vectorize -> embed -> conv stack -> global pooling -> softmax.
tf.keras.backend.clear_session()
texts_model = Sequential([
    vectorizer,
    Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS),
    # The imported layer class is `Conv1D` (capital D).
    Conv1D(64, 3, padding = "same", activation = "relu"),
    Conv1D(64, 3, padding = "same", activation = "relu"),
    MaxPool1D(),
    Conv1D(128, 3, padding = "same", activation = "relu"),
    # Collapse the time axis so the Dense layer emits one prediction per text instead
    # of one per time step, matching the one-integer-per-text labels.
    tf.keras.layers.GlobalMaxPool1D(),
    Dense(N_CLASSES, activation = softmax)
])

texts_model.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(learning_rate = LEARNING_RATE),
    metrics = ["acc"]
)

In [None]:
# Print the layer-by-layer summary of the current model.
texts_model.summary()

In [None]:
def block(input_1):
    """Apply one convolution followed by three parallel convolutions and concatenate them.

    Args:
        input_1: tensor the first convolution is applied to.

    Returns:
        The concatenation of the three parallel convolution outputs.
    """
    conv1 = Conv1D(128, 3, padding = "same", activation = "relu")(input_1)

    conv21 = Conv1D(128, 3, padding = "same", activation = "relu")(conv1)
    conv22 = Conv1D(128, 3, padding = "same", activation = "relu")(conv1)
    conv23 = Conv1D(128, 3, padding = "same", activation = "relu")(conv1)

    # Concatenate takes a single list of tensors, not separate positional arguments.
    return tf.keras.layers.Concatenate()([conv21, conv22, conv23])


# `block` has to be defined before it is called, so the graph is built after the function.
# NOTE(review): the input keeps the original shape (1, ), i.e. one token id per example;
# confirm whether it should be MAX_OUTPUT_LENGTH ids (or raw text through `vectorizer`).
input_layer = tf.keras.layers.Input((1, ))

# The embedding is applied once to the input; it used to be re-created inside every
# block, which would have embedded the convolution outputs of the previous block.
embedding = Embedding(input_dim = MAX_VOCABULARY_SIZE, output_dim = EMBEDDING_DIMENSIONS)(input_layer)

b1 = block(embedding)
b2 = block(b1)
b3 = block(b2)

In [None]:
# Wrap the stacked blocks in a functional model: the model input is the Input tensor and
# the output is the result of the last block (`embedding` / `concat` are not valid
# model endpoints at notebook scope).
test_model = tf.keras.models.Model(inputs = input_layer, outputs = b3)

In [None]:
# Print the layer-by-layer summary of the functional test model.
test_model.summary()

In [None]:
# Train inside an MLflow run so parameters and metrics are tracked together.
with mlflow.start_run(experiment_id = experiment_id) as run:
    mlflow.log_param("learning_rate", LEARNING_RATE)

    run_id = run.info.run_id
    history = texts_model.fit(
        train_dataset,
        epochs = 10,
        validation_data = test_dataset,
        callbacks = [
            TensorBoard(
                # Curly braces are required for f-string interpolation.
                log_dir = f"./logs-{experiment_id}-{run_id}",
                write_graph = True)])

In [None]:
# Reuse the experiment when it already exists: `mlflow.create_experiment` raises when the
# name is taken, which would break re-running the notebook from a fresh kernel.
experiment_name = "Attention"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment(experiment_name))

In [None]:
tf.keras.backend.clear_session()

# The Attention layer is called on a list of tensors ([query, value]), so it cannot be a
# member of a Sequential stack; the model is built with the functional API instead.
text_input = tf.keras.layers.Input(shape = (1, ), dtype = tf.string)
token_ids = vectorizer(text_input)
embedded = Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS)(token_ids)

# Both recurrent layers return full sequences: attention needs a time axis to attend over.
sequence = Bidirectional(LSTM(64, return_sequences = True))(embedded)
sequence = Bidirectional(LSTM(64, return_sequences = True))(sequence)

# Self-attention: the sequence is used as both query and value.
attended = Attention()([sequence, sequence])

# Collapse the time axis so the classifier emits one prediction per text.
pooled = tf.keras.layers.GlobalAveragePooling1D()(attended)
class_probabilities = Dense(N_CLASSES, activation = softmax)(pooled)

texts_model = tf.keras.models.Model(inputs = text_input, outputs = class_probabilities)

texts_model.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(learning_rate = LEARNING_RATE),
    metrics = ["acc"]
)

In [None]:
# Encoder that turns raw text into a sequence of bidirectional LSTM outputs.
encoder = Sequential([
    # An explicit input makes the Sequential model built, so `encoder.output` exists.
    tf.keras.layers.Input(shape = (1, ), dtype = tf.string),
    vectorizer,
    Embedding(input_dim = vectorizer.vocabulary_size(), output_dim = EMBEDDING_DIMENSIONS),
    Bidirectional(LSTM(64, return_sequences = True)),
    # The last layer also returns the full sequence: attention needs a time axis.
    Bidirectional(LSTM(64, return_sequences = True))
])

# Self-attention over the encoder output (used as both query and value).
attention = Attention()([encoder.output, encoder.output])

# Collapse the time axis so the classifier emits one prediction per text.
pooled_attention = tf.keras.layers.GlobalAveragePooling1D()(attention)
dense = Dense(N_CLASSES, activation = softmax)(pooled_attention)

In [None]:
# Build a per-line dataset for a single band: every non-empty line of the band's files
# becomes one example labelled with `band_id`.
band_id = 0

# Shorter sequences for single lines, sharing the vocabulary of the adapted `vectorizer`.
line_vectorizer = TextVectorization(output_sequence_length = 20, vocabulary = vectorizer.get_vocabulary())

# NOTE(review): the original file list was a placeholder string; this assumes label 0
# corresponds to the `avenged_sevenfold` training directory — confirm.
band_dir = os.path.join(TRAIN_BASE_DIR, "avenged_sevenfold")
band_files = [os.path.join(band_dir, file_name) for file_name in sorted(os.listdir(band_dir))]

# NOTE(review): lines are vectorized with `line_vectorizer`; the original called the
# text-level `vectorizer` with `band_id` as a second argument — confirm the intent.
line_dataset_train = tf.data.TextLineDataset(band_files) \
    .filter(lambda text: text != "") \
    .map(lambda text: (line_vectorizer(text), band_id))

In [None]:
# Print the first five elements of the per-line dataset as a sanity check.
for line in line_dataset_train.take(5):
    print(line)