<h1>Overview<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Load-the-Data" data-toc-modified-id="Load-the-Data-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Load the Data</a></span></li><li><span><a href="#Create-Model" data-toc-modified-id="Create-Model-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Create Model</a></span></li><li><span><a href="#Compile-Model" data-toc-modified-id="Compile-Model-0.3"><span class="toc-item-num">0.3&nbsp;&nbsp;</span>Compile Model</a></span></li><li><span><a href="#Fit-Model" data-toc-modified-id="Fit-Model-0.4"><span class="toc-item-num">0.4&nbsp;&nbsp;</span>Fit Model</a></span></li></ul></li><li><span><a href="#Terminal" data-toc-modified-id="Terminal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Terminal</a></span></li><li><span><a href="#Test-on-English-next-char-prediction" data-toc-modified-id="Test-on-English-next-char-prediction-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Test on English next-char prediction</a></span></li><li><span><a href="#Detour-Get-everything-underneath-to-work-again" data-toc-modified-id="Detour-Get-everything-underneath-to-work-again-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Detour Get everything underneath to work again</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#SDPA-attention-model" data-toc-modified-id="SDPA-attention-model-3.0.1"><span class="toc-item-num">3.0.1&nbsp;&nbsp;</span>SDPA attention model</a></span></li><li><span><a href="#MHA-model" data-toc-modified-id="MHA-model-3.0.2"><span class="toc-item-num">3.0.2&nbsp;&nbsp;</span>MHA model</a></span></li><li><span><a href="#Compression-model" data-toc-modified-id="Compression-model-3.0.3"><span class="toc-item-num">3.0.3&nbsp;&nbsp;</span>Compression model</a></span></li></ul></li></ul></li><li><span><a href="#CT-model,-v.2" data-toc-modified-id="CT-model,-v.2-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>CT model, v.2</a></span></li><li><span><a 
href="#garbage" data-toc-modified-id="garbage-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>garbage</a></span></li></ul></div>

In [None]:
import os
import re
import sys
import warnings
from time import sleep

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import Callback
from keras.layers import Dense, Conv1D, LSTM, Dropout, Embedding, Layer, Input, Flatten, concatenate as Concatenate, Lambda, Add
from keras.models import Model
from keras.models import Sequential as SequentialModel
from keras.preprocessing.text import Tokenizer as KerasTokenizer
from keras.utils import to_categorical
from sometimer import timer, time_this_method
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

sys.path.insert(0, '../ct')

import load
from preprocess import preprocess
from preprocess import Tokenizer
from preprocess.preprocess import separator_samples

from model.layers import LayerNormalization
from model.layers import ContentBasedAttention_CT
from model.layers import ScaledDotProductAttention
from model.layers import MultiHeadAttention
from model.layers import content_based_attention

from model import CompressiveTransformer
from model import AttentionReconstruction

## Load the Data

In [None]:
# Read the pre-processed Spooky-Author training frame; its `x` and `y`
# columns hold one list per row.
train_data = pd.read_pickle('../data/processed/spooky-author/train.pkl')

x_train = np.array(train_data['x'].tolist())
y_train = np.array(train_data['y'].tolist())

# Right-pad every row of x_train with zeros up to a fixed width of 128 columns.
_x_train = np.zeros(shape=(len(x_train), 128))
_x_train[:, :x_train.shape[1]] = x_train
x_train = _x_train

## Create Model

In [None]:
# Hyper-parameters handed to the CompressiveTransformer constructor below.
d_model = 256
sequence_length = 128

# Build the model for single-sample batches (batch_size=1).
model = CompressiveTransformer(d_model=d_model, sequence_length=sequence_length, batch_size=1)

In [None]:
# Show the layer summary of the model built above.
model.summary()

## Compile Model

In [None]:
# Compile with the Adam optimizer and categorical cross-entropy; no metrics are tracked.
model.compile(optimizer='Adam',
              loss='categorical_crossentropy')

## Fit Model

In [None]:
# Inspect the input shape; the second axis is 128 after the zero-padding in the load cell.
x_train.shape

In [None]:
# Redo everything: rebuild and recompile the model from scratch, then train
# it on one sample at a time.
model = CompressiveTransformer(d_model=d_model, sequence_length=sequence_length, batch_size=1)
model.compile(optimizer='Adam', loss='categorical_crossentropy')

for batch in range(len(x_train)):
    # Slicing (instead of indexing) keeps the leading batch axis of size 1.
    x_batch, y_batch = x_train[batch:batch + 1, :], y_train[batch:batch + 1]
    model.train_on_batch(x_batch, y_batch)

# Terminal

In [None]:
# Toy tensor with 2 samples of shape (5, 20) holding the values 0..199.
a = K.variable(np.arange(200).reshape((2, 5, 20)))


def call(x, units=None, gain=None, bias=None):
    """Normalise every sample of `x` to zero mean and unit variance.

    Statistics are computed per sample, i.e. over every axis except the
    leading batch axis. The previous version summed over the whole batch but
    divided by the per-sample element count, which gave a wrong mean as soon
    as the batch held more than one sample.

    Args:
        x: backend tensor whose first axis is the batch axis.
        units: number of elements per sample; derived from the static shape
            of `x` (and printed) when omitted.
        gain: optional multiplicative factor applied after normalisation.
        bias: optional additive offset applied after `gain`.

    Returns:
        Tensor with the same shape as `x`.
    """
    if units is None:
        units = int(np.prod(K.int_shape(x)[1:]))
        print(f'units={units}')

    # Reduce over all non-batch axes; keepdims lets the result broadcast back.
    feature_axes = list(range(1, K.ndim(x)))
    mean = K.sum(x, axis=feature_axes, keepdims=True) / units
    variance = K.sum(K.square(x - mean), axis=feature_axes, keepdims=True) / units

    # The epsilon keeps the division finite for constant inputs (variance == 0).
    y = (x - mean) / K.sqrt(variance + K.epsilon())

    # Compare against None: a gain/bias of 0 (or an array) must not be
    # skipped or trigger an ambiguous truth-value error.
    if gain is not None:
        y = y * gain
    if bias is not None:
        y = y + bias
    return y


K.eval(call(a))

# Test on English next-char prediction

In [None]:
# Text files used to fit the tokenizer and to build the training frame.
# Forward slashes work on every platform (Windows included) and match the
# other data paths used in this notebook.
input_paths_tokenizer = ['../data/wma-en-de/input/train-en-ascii.txt',
                        # '../data/wma-en-de/input/train-de-ascii.txt'
                        ]

In [None]:
tokenizer = Tokenizer(input_paths=input_paths_tokenizer)

# Collect every line of every input file into one list.
data = []
for path in input_paths_tokenizer:
    with open(path) as file:
        data.extend(file.readlines())
df_ = pd.DataFrame(data={'text': data})
# Work on an explicit copy of the first 10000 rows: assigning new columns to
# a plain slice of df_ triggers pandas' SettingWithCopyWarning.
df = df_[:10000].copy()

# Each encoding object exposes the token strings (.tokens) and their ids (.ids).
df['encoding'] = tokenizer.encode_batch(df.text.tolist())
df['tokens'] = df.encoding.apply(lambda e: e.tokens)
df['token_ids'] = df.encoding.apply(lambda e: e.ids)

# Flat stream of all token ids, in row order.
token_ids = [t for tokens in df.token_ids for t in tokens]

In [None]:
# Preview the first rows, including the encoding / tokens / token_ids columns added above.
df.head()

In [None]:
# Fresh model for the English next-token experiment (single-sample batches).
d_model = 256
sequence_length = 128

model = CompressiveTransformer(d_model=d_model, sequence_length=sequence_length, batch_size=1)
model.compile(optimizer='Adam',
              loss='categorical_crossentropy')

In [None]:
def batch_generator(input_data, epochs=1, batch_size=1, d_model=128, num_classes=20000):
    """Train the notebook-global `model` on sliding next-token windows.

    `input_data` (a list of token-id sequences) is split into `batch_size`
    contiguous chunks and every chunk is flattened into one token stream.
    At step `i` each stream contributes the window `[i, i + d_model)` as
    input and the token at position `i + d_model` as target; targets are
    one-hot encoded over `num_classes` classes.

    Despite its name the function yields nothing: it calls
    `model.train_on_batch` directly and returns None.

    Args:
        input_data: list of token-id lists.
        epochs: number of passes over the windows.
        batch_size: number of parallel token streams.
        d_model: window length fed to the model.
        num_classes: width of the one-hot target encoding.

    Raises:
        ValueError: if `input_data` holds fewer sequences than `batch_size`.
    """
    chunk_size = len(input_data) // batch_size
    if chunk_size == 0:
        raise ValueError(f'batch_size={batch_size} exceeds the number of '
                         f'input sequences ({len(input_data)})')

    # Keep exactly `batch_size` chunks; a trailing remainder chunk would
    # silently enlarge the batch.
    data = [input_data[i:i+chunk_size] for i in range(0, chunk_size * batch_size, chunk_size)]
    data = [[token for s in chunk for token in s] for chunk in data]
    print(len(data))

    # The target index i + d_model must exist in every stream, so never step
    # past the shortest one.
    shortest = min(len(stream) for stream in data)
    steps = max(0, min(chunk_size, shortest - d_model))

    for e in range(epochs):
        print(f'epoch: {e:5}')
        for i in tqdm(range(steps)):
            print(f'       {e:5}-{i}')
            x = np.array([t[i:i+d_model] for t in data])
            y = np.array([t[i+d_model] for t in data])

            # `to_categorical` is imported directly; the bare name `keras`
            # is never bound in this notebook.
            y = to_categorical(y, num_classes=num_classes)

            print(y)

            model.train_on_batch(x=x,
                                 y=y)
            
# Run with the defaults (epochs=1, batch_size=1, d_model=128) on the per-row token-id lists.
batch_generator(df.token_ids.tolist())

# Detour Get everything underneath to work again

### SDPA attention model

In [None]:
# Reload the Spooky-Author training data (same steps as the "Load the Data" cell),
# because x_train / y_train were not set up for this section.
train_data = pd.read_pickle('../data/processed/spooky-author/train.pkl')

x_train = np.array(train_data.x.tolist())
y_train = np.array(train_data.y.tolist())

# Zero-pad x_train on the right to 128 columns (the input width of the models below).
_x_train = np.zeros((x_train.shape[0], 128))  # samples, d_model
_x_train[:,:x_train.shape[1]] = x_train
x_train = _x_train

In [None]:
def create_sdpa_model(d_model=128, compile=True, sequence_length=128, vocab_size=20000):
    """Build a 3-class classifier around one scaled-dot-product attention layer.

    Pipeline: token ids -> Embedding -> ScaledDotProductAttention -> Flatten
    -> Dense(10) -> Dense(3, softmax).

    Args:
        d_model: embedding width; also passed to the attention layer. It was
            previously ignored by the embedding, which was fixed at 128.
        compile: when true, compile with Adam, categorical cross-entropy and
            the accuracy metric.
        sequence_length: number of token ids per sample.
        vocab_size: size of the embedding vocabulary.

    Returns:
        Tuple `(model, sdpa)`, where `sdpa` is the output tensor of the
        attention layer (not the layer object).
    """
    x = Input(shape=(sequence_length,))
    embed = Embedding(input_dim=vocab_size, output_dim=d_model)(x)
    sdpa = ScaledDotProductAttention(d_model=d_model, d_k=16, d_v=16, verbose=True)(embed)
    flat = Flatten()(sdpa)
    dense = Dense(units=10)(flat)
    y = Dense(units=3, activation='softmax')(dense)

    model = Model(inputs=[x],
                  outputs=[y])
    if compile:
        model.compile(optimizer='Adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return model, sdpa


model, _ = create_sdpa_model()

In [None]:
# Train the SDPA classifier, holding out 30% of the samples for validation
# (the number of epochs is left at fit()'s default).
model.fit(x=x_train,
          y=y_train,
          validation_split=0.3)

#### Detour sample submission on Spooky-author

In [None]:
# Raw Spooky-Author CSVs; the code below reads their `text` column and, for
# the training set, the `author` column.
spooky_train = pd.read_csv('../data/input/spooky-author/train.csv')
spooky_test = pd.read_csv('../data/input/spooky-author/test.csv')

# Every id sequence is padded / truncated to max_length.
max_length = 128
# Column that is fed to the model: 'keras_token_ids' (Keras tokenizer) or
# 'token_ids' (project Tokenizer); both are created below.
tokens = 'keras_token_ids'


# --- Variant 1: Keras word-level tokenizer, fitted on the training text only.
tokenizer = KerasTokenizer(num_words=15000,
                          lower=True,
                          char_level=False)
tokenizer.fit_on_texts(spooky_train.text)

# Convert texts to id sequences and cap every id at num_words.
spooky_train['keras_token_ids'] = tokenizer.texts_to_sequences(spooky_train.text)
spooky_train.keras_token_ids = [[min(s, tokenizer.num_words) for s in seq] for seq in spooky_train.keras_token_ids]

spooky_test['keras_token_ids'] = tokenizer.texts_to_sequences(spooky_test.text)
spooky_test.keras_token_ids = [[min(s, tokenizer.num_words) for s in seq] for seq in spooky_test.keras_token_ids]

# --- Variant 2: project Tokenizer. NOTE: this rebinds `tokenizer`, so the
# Keras tokenizer above is no longer reachable under that name.
tokenizer = Tokenizer(input_paths=['../data/input/spooky-author/train.txt'],
                      lowercase=True,
                      vocab_size=15000)

spooky_train['encoding'] = tokenizer.encode_batch(spooky_train.text.tolist())
spooky_test['encoding'] = tokenizer.encode_batch(spooky_test.text.tolist())

# Unpack the encoding objects into token strings and token ids.
spooky_train['tokens'] = spooky_train.encoding.apply(lambda e: e.tokens)
spooky_test['tokens'] = spooky_test.encoding.apply(lambda e: e.tokens)

spooky_train['token_ids'] = spooky_train.encoding.apply(lambda e: e.ids)
spooky_test['token_ids'] = spooky_test.encoding.apply(lambda e: e.ids)

# Integer class label per author.
author_to_id = {'EAP': 0, 'HPL': 1, 'MWS': 2}
spooky_train['author_id'] = spooky_train.author.apply(lambda a: author_to_id[a])

# Right-pad each id list with zeros to max_length, then truncate to max_length.
spooky_train.token_ids = spooky_train.token_ids.apply(lambda a: (a + [0]*(max_length - len(a)))[:max_length])
spooky_test.token_ids = spooky_test.token_ids.apply(lambda a: (a + [0]*(max_length - len(a)))[:max_length])
spooky_train.keras_token_ids = spooky_train.keras_token_ids.apply(lambda a: (a + [0]*(max_length - len(a)))[:max_length])
spooky_test.keras_token_ids = spooky_test.keras_token_ids.apply(lambda a: (a + [0]*(max_length - len(a)))[:max_length])

# Model inputs come from the column selected by `tokens`; labels are one-hot encoded.
x_train = np.array(spooky_train[tokens].tolist())
y_train = to_categorical(np.array(spooky_train.author_id.tolist()))

x_test = np.array(spooky_test[tokens].tolist())

In [None]:
# create_sdpa_model() returns a (model, attention) pair; unpack it so that
# `model` is the Keras model and `.fit` is available.
model, _ = create_sdpa_model()

model.fit(x=x_train,
          y=y_train,
          epochs=3,
          validation_split=0.3)

### MHA model

In [None]:
def create_mha_model(d_heads=4, d_model=128, sequence_length=128, compile=True):
    """Build a 3-class classifier on top of multi-head attention.

    The embedded token ids are passed through `d_heads` independent
    ScaledDotProductAttention layers whose outputs are combined by a
    MultiHeadAttention layer, followed by Flatten -> Dense(100) ->
    Dropout(0.2) -> Dense(3, softmax).

    When `compile` is true the model is compiled with Adam, categorical
    cross-entropy and the accuracy metric. Returns the Keras model.
    """
    token_ids = Input(shape=(sequence_length,))
    embedded = Embedding(input_dim=15000, output_dim=d_model)(token_ids)

    # One attention head per iteration, all reading the same embedding.
    head_outputs = []
    for _ in range(d_heads):
        head_outputs.append(ScaledDotProductAttention(d_model=d_model, d_k=16, d_v=16)(embedded))

    combined = MultiHeadAttention(d_heads=d_heads,
                                  d_model=d_model,
                                  d_k=16,
                                  d_v=16,
                                  sequence_length=sequence_length,
                                  verbose=True)(head_outputs)

    flattened = Flatten()(combined)
    hidden = Dense(units=100)(flattened)
    regularised = Dropout(rate=0.2)(hidden)
    probabilities = Dense(units=3, activation='softmax')(regularised)

    model = Model(inputs=[token_ids], outputs=[probabilities])
    if not compile:
        return model

    model.compile(optimizer='Adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


model = create_mha_model(d_heads=2)

In [None]:
# Train the two-head MHA classifier for one epoch with a 30% validation hold-out.
model.fit(x=x_train,
          y=y_train,
          epochs=1,
          validation_split=0.3)

In [None]:
# Show the layer summary of the MHA model.
model.summary()

### Compression model

**Failed.**

In [None]:
batch_size = 32
sequence_length = 128
d_model = 128
# NOTE(review): the original comment read "n_s/c=32", but with these values
# sequence_length // compression_rate is 128 // 3 = 42.
compression_rate = 3

# Full batch shapes: (batch, sequence positions, features).
h_shape = (batch_size, sequence_length, d_model)
compressed_memory_shape = (batch_size, sequence_length // compression_rate, d_model)

# All-zero placeholder tensors used as dummy inputs / targets in the cells below.
h = K.zeros(shape=h_shape)
old_mem = K.zeros(shape=h_shape)
new_cm = K.zeros(shape=compressed_memory_shape)

In [None]:
%reload_ext autoreload
# Build the repository's AttentionReconstruction with a placeholder head list
# ([None]) and train it with plain mean-squared error.
reconstruction_model = AttentionReconstruction(input_shape=[h_shape, h_shape], heads=[None], verbose=True)

reconstruction_model.compile(optimizer='Adam', loss='mean_squared_error')

In [None]:
# forward pass on the symbolic zero tensors
output = reconstruction_model([h, old_mem])
print(output.shape)

# train on batch: K.eval turns the backend tensors into concrete arrays;
# the all-zero new_cm serves as the target.
loss = reconstruction_model.train_on_batch(x=[K.eval(h), K.eval(old_mem)],
                                           y=K.eval(new_cm))
print(loss)

#### Detour really simple compression Model

In [None]:
class DoubleInput(Model):
    """Minimal two-input model used to debug the compression set-up.

    Both inputs (`h` and `ar_old_mem`) are registered on the model, but only
    the second one is connected to the output, a strided Conv1D with 128
    filters, kernel size 3 and stride 3. `compile` and `train_on_batch`
    merely announce themselves on stdout before delegating to Keras.
    """

    def __init__(self, input_shape, heads=None):
        shape_h, shape_old_mem = input_shape

        hidden_in = Input(batch_shape=shape_h, name='h')
        memory_in = Input(batch_shape=shape_old_mem, name='ar_old_mem')

        # A disabled earlier attempt tied `h` into the graph:
        #   zeros = Lambda(lambda _h: _h*0.00001, name='ar_pseudo_use_h')(h)
        #   pseudo_old_mem = Add(name='ar_add_zeros')([old_mem, zeros])

        compress = Conv1D(filters=128,
                          kernel_size=3,
                          strides=3,
                          activation='relu',
                          name='ar_conv1D')
        compressed = compress(memory_in)

        super().__init__(inputs=[hidden_in, memory_in], outputs=compressed)
        self.heads = heads

    def compile(self, *args, **kwargs):
        """Print a marker, then defer to `Model.compile`."""
        print('compiling...')
        result = super().compile(*args, **kwargs)
        return result

    def train_on_batch(self, *args, **kwargs):
        """Print a marker, then defer to `Model.train_on_batch`."""
        print('training on batch...')
        batch_loss = super().train_on_batch(*args, **kwargs)
        return batch_loss

In [None]:
# Instantiate the debugging model with identical shapes for both inputs.
di_model = DoubleInput(input_shape=[h_shape, h_shape], heads=[None])

di_model.compile(optimizer='Adam',
                 loss='mse')
di_model.summary()

In [None]:
# Forward pass on the zero tensors, then one training step against the
# all-zero compressed-memory target.
output = di_model([h, old_mem])
print(output)

loss = di_model.train_on_batch(x=[K.eval(h), K.eval(old_mem)],
                               y=K.eval(new_cm))
print(loss)

#### Detour AR in notebook ...

In [None]:
# get the head
def create_sdpa_model(d_model=128, compile=True):
    x = Input(shape=(128,))
    embed = Embedding(input_dim=20000, output_dim=128)(x)
    head = ScaledDotProductAttention(d_model=d_model, d_k=16, d_v=16, verbose=True)
    sdpa = head(embed)
    flat = Flatten()(sdpa)
    dense = Dense(units=10)(flat)
    y = Dense(units=3, activation='softmax')(dense)
    
    model = Model(inputs=[x],
                  outputs=[y])
    if compile:
        model.compile(optimizer='Adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return model, head


model, head = create_sdpa_model()

In [None]:
# Accepted spellings for each compression type; the first entry of each list
# is the canonical name reported in error messages via _all_compressions.
_max_pool = ['max-pool', 'max_pool', 'max pool', 'max']
_1d_conv = ['1d-conv', '1d_conv', '1d conv', 'conv']
_all_compressions = _max_pool[:1] + _1d_conv[:1]


class AttentionReconstruction_notebook(Model):
    """Notebook prototype of the memory-compression model.

    Takes two inputs of identical batch shape, `h` and `old_mem`, and outputs
    a compressed version of `old_mem` produced by a strided Conv1D whose
    kernel size and stride both equal `compression_rate`. `h` is registered
    as an input but is not connected to the output.

    Args:
        input_shape: pair `(h_shape, old_mem_shape)` of full batch shapes;
            both must be equal.
        heads: list holding at most one attention head; it is only stored.
        compression: one of the aliases in `_1d_conv`; the `_max_pool`
            aliases are recognised but not implemented.
        compression_rate: kernel size and stride of the convolution.
        name: model name.
        verbose: print the model summary after construction.
        **kwargs: `conv_filters` (default 128) and `conv_activation`
            (default 'relu') configure the convolution; everything else is
            forwarded to `Model.__init__`.

    Raises:
        NotImplementedError: for more than one head or max-pool compression.
        ValueError: for an unknown `compression` value.
    """

    def __init__(self,
                 input_shape,
                 heads,
                 *args,
                 compression='1d-conv',
                 compression_rate=3,
                 name='AttentionReconstruction',
                 verbose=False,
                 **kwargs):
        assert isinstance(heads, list)
        if len(heads) > 1:
            raise NotImplementedError()

        h_shape, old_mem_shape = input_shape
        assert h_shape == old_mem_shape

        h = Input(batch_shape=h_shape, name='ar_h')
        old_mem = Input(batch_shape=old_mem_shape, name='ar_old_mem')

        # A disabled earlier attempt tied `h` into the graph:
        #   zeros = Lambda(lambda _h: _h*0.00001, name='ar_pseudo_use_h')(h)
        #   pseudo_old_mem = Add(name='ar_add_zeros')([old_mem, zeros])

        # pop (not get): these options belong to the convolution and must not
        # be forwarded to Model.__init__ through **kwargs below.
        filters = kwargs.pop('conv_filters', 128)
        activation = kwargs.pop('conv_activation', 'relu')

        if compression in _max_pool:
            raise NotImplementedError()
        elif compression in _1d_conv:
            output_layer = Conv1D(filters=filters,
                                  kernel_size=compression_rate,
                                  strides=compression_rate,
                                  activation=activation,
                                  name='ar_conv1D')
            output = output_layer(old_mem)
        else:
            raise ValueError(f'unsupported compression: {compression}. '
                             f'Select one from {_all_compressions}')

        super().__init__(*args, inputs=[h, old_mem], outputs=output, name=name, **kwargs)
        self.heads = heads
        self.compression = compression
        self.compression_rate = compression_rate
        # Use the model's own output as the initial "new compressed memory"
        # instead of the notebook-global `new_cm`, so construction does not
        # depend on another cell having been run. train_on_batch overwrites
        # all three entries before every step.
        self._current_batch = dict(h=[h],
                                   old_mem=[old_mem],
                                   new_cm=[output])
        self.verbose = verbose
        self._custom_layers = dict(output=output_layer)

        if verbose:
            # summary() prints by itself and returns None.
            self.summary()

    def compile(self,
                optimizer,
                loss='attention_reconstruction',
                metrics=None,
                loss_weights=None,
                **kwargs):
        """Compile the model, defaulting to the attention-reconstruction loss.

        Passing the string 'attention_reconstruction' selects the loss built
        by `attention_reconstruction_loss`; any other value is used as given
        and a RuntimeWarning is emitted.
        """
        if loss == 'attention_reconstruction':
            loss = self.attention_reconstruction_loss()
            print(loss)
        else:
            warnings.warn('using non-standard loss for AttentionReconstruction', RuntimeWarning)

        # self.add_loss(lambda: K.reduce_mean(self._current_batch['h']))
        super().compile(optimizer=optimizer,
                        loss=loss,
                        metrics=metrics,
                        loss_weights=loss_weights,
                        **kwargs)

    def train_on_batch(self, x, y, sample_weight=None, class_weight=None, reset_metrics=True):
        """Remember the current batch, then run one Keras training step.

        `x` is the pair `[h, old_mem]` and `y` the target compressed memory.
        Returns whatever `Model.train_on_batch` returns.
        """
        self._current_batch['h'] = [x[0]]
        self._current_batch['old_mem'] = [x[1]]
        self._current_batch['new_cm'] = [y]

        loss = super().train_on_batch(x=x,
                                      y=y,
                                      sample_weight=sample_weight,
                                      class_weight=class_weight,
                                      reset_metrics=reset_metrics)
        return loss

    def attention_reconstruction_loss(self):
        """Return the loss function handed to Keras.

        The returned function currently computes the element-wise absolute
        error between target and prediction.

        TODO: the intended loss compares, per head,
        `content_based_attention(h, old_mem, ...)` with
        `content_based_attention(h, new_cm, ...)` using the head's
        `w_q` / `w_k` / `w_v` and the tensors kept in `self._current_batch`.
        Earlier experiments noted that returning `y_true - y_pred` works,
        while returning `K.zeros(shape=y_pred.shape)` does not.
        """

        def _attention_reconstruction_loss(y_true, y_pred):
            print('   calculating loss...')
            # Same value as sqrt((y_true - y_pred) ** 2), but with a finite
            # gradient where the difference is exactly zero (the sqrt form
            # yields NaN gradients there).
            return K.abs(y_true - y_pred)

        return _attention_reconstruction_loss

# Build the notebook version with the attention head taken from create_sdpa_model().
reconstruction_model = AttentionReconstruction_notebook(input_shape=[h_shape, h_shape], heads=[head], verbose=True)

# The loss is passed as a callable rather than the string
# 'attention_reconstruction', so compile() takes its "non-standard loss" branch.
reconstruction_model.compile(optimizer='Adam', loss=reconstruction_model.attention_reconstruction_loss())


In [None]:
# FROM REPO
reconstruction_model = AttentionReconstruction(input_shape=[h_shape, h_shape], heads=[head], verbose=True)

reconstruction_model.compile(optimizer='Adam', loss='attention_reconstruction')

In [None]:
# forward pass
output = reconstruction_model([h, old_mem])
print(output.shape)

# train on batch
loss = reconstruction_model.train_on_batch(x=[K.eval(h), K.eval(old_mem)],
                                           y=K.eval(new_cm))
print(loss)

#### Potential solution to input-tracked-for-topology losses

https://keras.io/api/losses/#creating-custom-losses

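Quoted from the Keras documentation for `add_loss` ("this" refers to the preceding requirement that a loss tensor added to a functional model is symbolic and can be traced back to the model's `Input`s):

> If this is not the case for your loss (if, for example, your loss references a Variable of one of the model's layers), you can wrap your loss in a zero-argument lambda. These losses are not tracked as part of the model's topology since they can't be serialized.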

Example
```python
inputs = tf.keras.Input(shape=(10,))
d = tf.keras.layers.Dense(10)
x = d(inputs)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)
# Weight regularization.
model.add_loss(lambda: tf.reduce_mean(d.kernel))
```

In [None]:
# One more training step on the zero tensors; the returned loss is the cell output.
reconstruction_model.train_on_batch(x=[K.eval(h), K.eval(old_mem)],
                                    y=K.eval(new_cm))

# CT model, v.2

In [None]:
# Fraction of the samples held out for validation.
validation_split = 0.3

train_data = pd.read_pickle('../data/processed/spooky-author/train.pkl')
val_index = int(validation_split*len(train_data))

x_train = np.array(train_data.x.tolist())
y_train = np.array(train_data.y.tolist())

# Zero-pad x_train on the right to 128 columns.
_x_train = np.zeros((x_train.shape[0], 128))
_x_train[:,:x_train.shape[1]] = x_train
x_train = _x_train

# Split at an absolute position. With negative indices a val_index of 0
# would put *all* samples into the validation set ([-0:]) and none into the
# training set ([:-0]).
split_index = len(x_train) - val_index

x_val, y_val = x_train[split_index:], y_train[split_index:]
x_train, y_train = x_train[:split_index], y_train[:split_index]

In [None]:
# Training-loop settings; the sample counts are taken from the split above.
epochs=10
batch_size=32
samples=len(x_train)
validation_samples=len(x_val)

In [None]:
# Single-layer Compressive Transformer with a 3-way output (output_size=3).
# NOTE(review): batch_size is hard-coded to 32 here and must stay in sync
# with the `batch_size` variable used by the training loop.
ct = CompressiveTransformer(d_layers=1,
                            sequence_length=128, 
                            d_model=128,
                            memory_size=256,
                            compressed_memory_size=256,
                            d_k=16, 
                            d_heads=2, 
                            output_size=3,
                            batch_size=32,
                            vocab_size=30000)

In [None]:
# Compile with Adam and categorical cross-entropy, tracking accuracy.
ct.compile(optimizer='Adam',
           loss='categorical_crossentropy',
           metrics=['accuracy'])

In [None]:
# Show the layer summary of the Compressive Transformer.
ct.summary()

#### Train CT:

In [None]:
def evaluate(ct, x_val, y_true=None):
    """Return the classification accuracy of `ct` on `x_val`.

    Predictions are made in full batches of `ct.batch_size`; trailing samples
    that do not fill a batch are ignored. Every batch is predicted together
    with the model's current `memory` and `compressed_memory`.

    Args:
        ct: model exposing `predict`, `batch_size`, `memory` and
            `compressed_memory`.
        x_val: validation inputs.
        y_true: one-hot targets aligned with `x_val`. Defaults to the
            notebook-global `y_val`, so existing `evaluate(ct, x_val)` calls
            keep working.

    Returns:
        Fraction of correctly classified samples.

    Raises:
        ValueError: if `x_val` holds fewer samples than one batch.
    """
    if y_true is None:
        y_true = y_val  # fall back to the notebook-global validation labels

    usable = len(x_val) - len(x_val) % ct.batch_size
    if usable == 0:
        raise ValueError(f'need at least ct.batch_size={ct.batch_size} '
                         f'validation samples, got {len(x_val)}')

    y_pred = np.concatenate([ct.predict(x=[x_val[i:i+ct.batch_size],
                                           ct.memory,
                                           ct.compressed_memory])
                             for i in range(0, usable, ct.batch_size)])

    accuracy = (y_pred.argmax(axis=1) == y_true[:len(y_pred)].argmax(axis=1)).sum() / len(y_pred)
    return accuracy

In [None]:
# Manual training loop: full batches only (the trailing partial batch is skipped).
for epoch in range(epochs):
    print(f'epoch: {epoch}')
    epoch_loss = []
    epoch_acc = []
    
    for i in range(0, samples - samples % batch_size, batch_size):
            
        # train_on_batch is unpacked as ((loss, accuracy), loss_ar); loss_ar is not used below.
        (loss, acc), loss_ar = ct.train_on_batch(x=[x_train[i:i+batch_size], ct.memory, ct.compressed_memory],
                                                 y=y_train[i:i+batch_size])
        # Zero both memories after every step, so each batch starts from empty memory.
        ct.memory *= 0
        ct.compressed_memory *= 0
        
        epoch_loss.append(loss)
        epoch_acc.append(acc)
        
        # Report the running epoch means every 20 batches.
        if (i // batch_size) % 20 == 0:
            print(f'    i: {i:4d}    loss={np.mean(epoch_loss):.3f}, accuracy={np.mean(epoch_acc):.3f}')
        # print(ct.memory[0])
        # print('\n\n\n')
    # Validation accuracy at the end of the epoch.
    val_acc = evaluate(ct, x_val)
    print(f'val_acc={val_acc:.3f}        loss={np.mean(epoch_loss):.3f}, accuracy={np.mean(epoch_acc):.3f}\n')

#### Eval CT:

In [None]:
# `y_pred` only ever existed as a local variable inside evaluate(), so it is
# recomputed here (full batches only, as in evaluate()) to keep this cell
# runnable on a fresh kernel.
n_eval = validation_samples - validation_samples % batch_size
y_pred = np.concatenate([ct.predict(x=[x_val[i:i+batch_size],
                                       ct.memory,
                                       ct.compressed_memory])
                         for i in range(0, n_eval, batch_size)])

# One row per evaluated validation sample: predicted class, true class, hit flag.
results = pd.DataFrame(data=dict(y_pred=y_pred.argmax(axis=1),
                                 y_true=y_val[:n_eval].argmax(axis=1)))
results['correct'] = results.y_pred == results.y_true

In [None]:
# Per-class sample count, number of correct predictions and resulting accuracy.
r = (
    results
    .groupby('y_true')['correct']
    .agg(['count', 'sum'])
    .assign(accuracy=lambda per_class: per_class['sum'] / per_class['count'])
)
r

In [None]:
# Number of evaluated validation samples.
results.y_true.size

In [None]:
# 1: 40.033
# 5: 55.682
# 9: 89.1

In [None]:
# First weight array of the layer named 'h_L0'.
e = ct.get_layer(name='h_L0').get_weights()[0]

In [None]:
# Shape and total number of entries of the weight array.
print(e.shape)
print(np.prod(e.shape))

In [None]:
# Count NaN entries in the weights.
nan_count = np.isnan(e).sum()
nan_count

In [None]:
# First weight array of the layer named 'output' (rebinds `e`).
e = ct.get_layer(name='output').get_weights()[0]

In [None]:
# Display the raw weights of the 'output' layer.
e

In [None]:
# Count NaN entries in the 'output' layer weights.
nan_count = np.isnan(np.array(e)).sum()
nan_count

In [None]:
# Where does the output become really large?
# NaN detector?

In [None]:
# Second training batch combined with all-zero memories shaped like the
# model's own (a stray quote after `compressed_memory` made this cell a
# syntax error).
x_batch = [x_train[32:32+batch_size],
           np.zeros_like(ct.memory),
           np.zeros_like(ct.compressed_memory)]

In [None]:
# Same batch, but with the model's current memories (overrides the
# zero-memory x_batch from the previous cell).
x_batch = [x_train[32:32+batch_size], 
           ct.memory, 
           ct.compressed_memory]

In [None]:
print(ct.memory)
print(ct.compressed_memory)

# Evaluate selected layers' outputs on x_batch and look for NaNs.
# Layers 0, 2 and 3 are skipped — presumably the other Input layers; TODO confirm.
for layer in [ct.layers[1]] + ct.layers[4:]:
    print(layer.name)
    # Backend function mapping the model inputs to this layer's output.
    lv = K.function(ct.input, layer.output)(x_batch)
    print(lv[0])
    print()
    # min / mean / max of the activations, followed by the NaN count
    print(f'{lv.min():.3f},   {lv.mean():.3f},   {lv.max():.3f}')
    print(np.isnan(lv).sum())
    print('\n\n\n')

In [None]:
# COMPRESSED MEMORY GETS NANS FOR SOME REASON.
#     ->  fix: 

In [None]:
# Names of all layers, in model order (useful for get_layer(name=...)).
[layer.name for layer in ct.layers]

# garbage