# Преамбула

In [None]:
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Turn off tensorflow_datasets progress bars and enable TensorFlow v2 behaviour.
tfds.disable_progress_bar()
tf.enable_v2_behavior()

# Print the TensorFlow version and the number of GPUs TensorFlow can see.
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Bare expression: list all physical devices (presumably shown as the notebook cell output).
tf.config.experimental.list_physical_devices()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import copy

In [None]:
from sklearn.neighbors import KernelDensity
from sklearn.neighbors import BallTree
from sklearn.neighbors import KDTree

from sklearn.metrics import pairwise_distances_argmin_min

from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV

In [None]:
from joblib import Parallel, delayed

In [None]:
def normalize_img(image, label):
    """Cast an image to `float32` and divide it by 255; the label is returned unchanged."""
    scaled_image = tf.cast(image, tf.float32) / 255.0
    return scaled_image, label

In [None]:
def crop_pixels(x):
    """Clamp a normalized pixel value to the closed interval [0.0, 1.0]."""
    # Raise values that are not above 0.0 up to 0.0 ...
    lower_bounded = x if x > 0.0 else 0.0
    # ... then cap values that are not below 1.0 at 1.0.
    return lower_bounded if lower_bounded < 1.0 else 1.0

In [None]:
def imshow_array(array):
    """Show an array of normalized pixels as a grayscale image with the axes hidden."""
    plt.axis('off')
    # Rescale to the 0..255 range and truncate to 8-bit integers before drawing.
    pixels = (255.0 * array).astype(np.uint8)
    gray_map = plt.get_cmap("gray")
    plt.imshow(pixels, cmap=gray_map, vmin=0, vmax=255)

In [None]:
def dataset_Y_to_X(X, Y):
    """Replace a dataset pair (X, Y) with (X, X), e.g. for training an autoencoder."""
    # Y is deliberately ignored: the input doubles as the target.
    duplicated = (X, X)
    return duplicated

In [None]:
def similarity_loss(y_true, y_pred):
    """Loss reported by the author to give better results than MAE.

    Returns the mean over the last axis of ``|d| - 0.5 * d**2``,
    where ``d = y_true - y_pred``.
    """
    backend = tf.keras.backend
    difference = y_true - y_pred
    penalty = backend.abs(difference) - 0.5 * backend.square(difference)
    return backend.mean(penalty, axis=-1)

In [None]:
import json
import csv

# Experiment parameters and results collected in this file and dumped to info.json.
info = {}

## Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root directory on the mounted Drive under which all artifacts are stored.
path = "/content/drive/My Drive/MNIST-Information/"

# Эксперимент для синтетических данных

In [None]:
import scipy.stats as sps

In [None]:
def norm_generator(size=1, loc=0, scale=1):
    """Draw `size` samples from a normal distribution with mean `loc` and standard deviation `scale`."""
    distribution = sps.norm(loc=loc, scale=scale)
    return distribution.rvs(size=size)

In [None]:
dataset_dim = 32 # Dimensionality of the generated data.
latent_dim  = 8  # True (latent) dimensionality of the data.
samples_number = 60000 # Size of the training sample.
tests_number   = 10000 # Size of the test sample.

### Функции, задающие многообразия.

$$
y = f_2(x): \quad y_i = \sin \left ( \sin (x_{i \; \text{mod} \; d}) + \sin(x_{[i / d]}) \right )
$$

In [None]:
def function_2(X, dataset_dim, latent_dim):
    """
    Function 2, defining a low-dimensional manifold.

    Computes, for every ``i`` in ``range(dataset_dim)``::

        Y[..., i] = sin(sin(X[..., i % latent_dim]) + sin(X[..., i // latent_dim]))

    Parameters
    ----------
    X : array_like
        Latent coordinates along the last axis. A 1-D input yields a 1-D
        output; any leading axes are treated as batch axes and preserved.
    dataset_dim : int
        Number of output coordinates.
    latent_dim : int
        Modulus/divisor used to pick the two latent coordinates per output.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``X.shape[:-1] + (dataset_dim,)``.

    Raises
    ------
    IndexError
        If a required index (``i % latent_dim`` or ``i // latent_dim``) lies
        outside the last axis of ``X``.
    """
    X = np.asarray(X, dtype=float)
    indices = np.arange(dataset_dim)

    # Take the sine of the latent coordinates once, then gather both terms
    # with integer indexing instead of looping over output coordinates.
    sines = np.sin(X)
    return np.sin(sines[..., indices % latent_dim] + sines[..., indices // latent_dim])

In [None]:
def gen_samples(samples_number, dataset_dim, latent_dim, ln_scale = 2.0, fn_scale = 0.05):
    """
    Generate a synthetic data set lying near a low-dimensional manifold.

    Parameters
    ----------
    samples_number : int
        Number of rows to generate.
    dataset_dim : int
        Dimensionality of each generated row.
    latent_dim : int
        Dimensionality of the latent Gaussian noise.
    ln_scale : float
        Standard deviation of the latent noise.
    fn_scale : float
        Standard deviation of the noise added to the mapped points.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(samples_number, dataset_dim)`` with values in [-1, 1].
    """

    # Noise in the latent representation.
    W = norm_generator(loc=0.0, scale=ln_scale, size=(samples_number, latent_dim))

    # Map the latent noise into the higher-dimensional space, one row at a time.
    base_sample = np.zeros((samples_number, dataset_dim))
    for i, latent_vector in enumerate(W):
        base_sample[i] = function_2(latent_vector, dataset_dim, latent_dim)

    # Additional noise laid over the resulting manifold.
    noises_sample = norm_generator(loc=0, scale=fn_scale, size=(samples_number, dataset_dim))
    sample = base_sample + noises_sample

    # Clip to [-1, 1] in one vectorized call instead of a Python double loop.
    return np.clip(sample, -1.0, 1.0)

In [None]:
# Generate the training and test sets independently with the same settings.
samples = gen_samples(samples_number, dataset_dim, latent_dim)
test = gen_samples(tests_number, dataset_dim, latent_dim)

## Автоэнкодер

### Размерность кода и число эпох.

In [None]:
# CODE DIMENSION.
# #
# #

# Size of the autoencoder bottleneck.
codes_dim = 8

# #
# #

# Number of training epochs.
epochs = 200

In [None]:
# Record the run parameters so they are saved with the results.
info['codes_dim'] = codes_dim
info['epochs'] = epochs

In [None]:
import os
# Directory for this run's artifacts, keyed by code dimension and epoch count.
rel_path = path + "Synthetic/Models/Autoencoders/" + str(codes_dim) + "_" + str(epochs) + "/"
# Create the directory (with missing parents); no error if it already exists.
os.makedirs(rel_path, exist_ok=True)

In [None]:
def normal_autoencoder(shape_input, dimension):
    """
    Build a fully connected autoencoder together with its encoder and decoder.

    Encoder: GaussianNoise(0.03) -> Dense(64) -> LeakyReLU(0.2) -> Dense(32)
    -> LeakyReLU(0.2) -> Dense(dimension) -> sigmoid.
    Decoder: Dense(32) -> LeakyReLU(0.2) -> Dense(64) -> LeakyReLU(0.2)
    -> Dense(shape_input[0]) -> tanh.

    Parameters
    ----------
    shape_input : tuple
        Shape of one input sample; the input is assumed to be a vector, so
        only ``shape_input[0]`` is used for the decoder output size.
    dimension : int
        Size of the bottleneck code.

    Returns
    -------
    (encoder, decoder, autoencoder)
        ``autoencoder`` is ``Sequential([encoder, decoder])`` compiled with
        MSE loss and Adam (learning rate 1e-3).
    """
    # Weight initialisation for the hidden Dense layers.
    init = tf.keras.initializers.RandomNormal(stddev = 0.02)

    # Encoder input, followed by additive Gaussian noise.
    input_layer = tf.keras.layers.Input(shape_input)
    next_layer = input_layer
    next_layer = tf.keras.layers.GaussianNoise(0.03)(next_layer)

    # Layer block 1.
    next_layer = tf.keras.layers.Dense(64, kernel_initializer = init)(next_layer)
    next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Layer block 2.
    next_layer = tf.keras.layers.Dense(32, kernel_initializer = init)(next_layer)
    next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Layer block 3 (disabled).
    #next_layer = tf.keras.layers.Dense(16, kernel_initializer = init)(next_layer)
    #next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Bottleneck (uses the default kernel initializer).
    next_layer = tf.keras.layers.Dense(dimension)(next_layer)
    bottleneck = tf.keras.layers.Activation('sigmoid')(next_layer)

    # Encoder model.
    encoder = tf.keras.Model(input_layer, bottleneck)

    # Start of the decoder model. The shape must be a tuple: `(dimension)`
    # without the trailing comma is just an int.
    input_code_layer = tf.keras.layers.Input((dimension,))
    next_layer = input_code_layer

    # Layer block 3 (disabled).
    #next_layer = tf.keras.layers.Dense(16, kernel_initializer = init)(next_layer)
    #next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Layer block 2.
    next_layer = tf.keras.layers.Dense(32, kernel_initializer = init)(next_layer)
    next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Layer block 1.
    next_layer = tf.keras.layers.Dense(64, kernel_initializer = init)(next_layer)
    next_layer = tf.keras.layers.LeakyReLU(alpha=0.2)(next_layer)

    # Layer block 0: the input is assumed to be a vector.
    next_layer = tf.keras.layers.Dense(shape_input[0])(next_layer)
    next_layer = tf.keras.layers.Activation('tanh')(next_layer)

    output_layer = next_layer

    # Models.
    decoder = tf.keras.models.Model(input_code_layer, output_layer)
    autoencoder = tf.keras.Sequential([encoder, decoder])

    # Compile. `learning_rate` is the current argument name; `lr` is a
    # deprecated alias.
    opt = tf.keras.optimizers.Adam(learning_rate = 1e-3)
    autoencoder.compile(loss = 'mse', optimizer = opt, loss_weights = [1.0])

    return encoder, decoder, autoencoder

In [None]:
# Загрузка моделей.

#encoder = tf.keras.models.load_model(rel_path + "encoder.h5")
#decoder = tf.keras.models.load_model(rel_path + "decoder.h5")
#autoencoder = tf.keras.Sequential([encoder, decoder])

In [None]:
# Build the models for vector inputs of length dataset_dim and codes of length codes_dim.
encoder, decoder, autoencoder = normal_autoencoder((dataset_dim,), codes_dim)

In [None]:
# Train the autoencoder to reproduce its input; the test set is used for validation.
history_callback = autoencoder.fit(samples, samples, epochs=epochs, validation_data=(test, test), batch_size=128)

In [None]:
# Save the loss-function history (training and validation).
loss_history = np.array(history_callback.history["loss"])
val_loss_history = np.array(history_callback.history["val_loss"])

np.savetxt(rel_path + "loss.csv", loss_history, delimiter="\n")
np.savetxt(rel_path + "val_loss.csv", val_loss_history, delimiter="\n")

# Save the models.
autoencoder.save(rel_path + "autoencoder.h5")
encoder.save(rel_path + "encoder.h5")
decoder.save(rel_path + "decoder.h5")

# Save the run information.
with open(rel_path + 'info.json', 'w') as fp:
    json.dump(info, fp)

### Получение кодов всех элементов датасета

In [None]:
# Encode every training sample.
codes = np.array(encoder.predict(samples))

In [None]:
# PCA with whitening, keeping as many components as the code has dimensions.
pca_codes_dim = codes_dim
pca_codes = PCA(n_components=pca_codes_dim, whiten=True)
codes_pca = np.array(pca_codes.fit_transform(codes))

### KDE для кодов

In [None]:
# Загрузка параметров KDE.

#with open(rel_path + 'info.json', 'r') as fp:
#    info = json.load(fp)

#kde_codes = KernelDensity(bandwidth=info['bandwidth'], kernel='gaussian')
#kde_codes.fit(codes_pca)

In [None]:
def smart_gridsearch(begin, end, resolution = 7, rel_x_epsilon = 0.01, rtol = 0.001, n_jobs = 2, cv = 5, samples = None):
    """
    Iteratively refine a log-spaced grid search for the KDE bandwidth.

    Each round cross-validates ``KernelDensity`` on a grid of ``resolution``
    bandwidths between ``begin`` and ``end``. If the best bandwidth is at an
    edge of the grid, the bracket is extended past that edge by the current
    ratio ``end / begin``; otherwise it is narrowed to the two neighbours of
    the best bandwidth.

    Parameters
    ----------
    begin, end : float
        Initial bandwidth bracket; both must be positive because the grid is
        built in log space.
    resolution : int
        Number of grid points per round.
    rel_x_epsilon : float
        Stop once ``end - begin`` is below this fraction of the best bandwidth.
    rtol : float
        Relative tolerance passed to ``KernelDensity`` during the search.
    n_jobs, cv
        Passed through to ``GridSearchCV``.
    samples : array-like, optional
        Data to fit. Defaults to the file-level ``codes_pca``, as before.

    Returns
    -------
    GridSearchCV
        The fitted search object from the final round.
    """
    if samples is None:
        # Backward-compatible default: the file-level PCA-whitened codes.
        samples = codes_pca

    while True:
        grid = np.logspace(np.log10(begin), np.log10(end), resolution)
        print("Поиск по сетке: ", grid)
        params = {'bandwidth': grid}

        grid_search = GridSearchCV(KernelDensity(rtol = rtol), params, n_jobs = n_jobs, verbose = 10, cv = cv)
        grid_search.fit(samples)

        best_index = grid_search.best_index_
        if best_index == 0:
            # Best point on the lower edge: extend downwards, keep its upper neighbour.
            begin *= begin / end
            end = grid[1]
        elif best_index == resolution - 1:
            # Best point on the upper edge: extend upwards and keep its lower
            # neighbour (grid[-2]), mirroring the lower-edge branch, so the
            # best point stays strictly inside the new bracket.
            end *= end / begin
            begin = grid[-2]
        else:
            # Interior optimum: shrink the bracket to its two neighbours.
            begin = grid[best_index - 1]
            end = grid[best_index + 1]

        if end - begin < rel_x_epsilon * grid[best_index]:
            return grid_search

In [None]:
# Search for the KDE bandwidth starting from the bracket [0.01, 0.45]; keep the best estimator.
kde_codes = smart_gridsearch(0.01, 0.45).best_estimator_
# The search ran with its default rtol = 0.001; set rtol to 0.0 on the selected estimator.
kde_codes.set_params(rtol = 0.0)
print(kde_codes.get_params())

In [None]:
# Record the selected bandwidth.
info['bandwidth'] = kde_codes.get_params()['bandwidth']

# Save the run information.
with open(rel_path + 'info.json', 'w') as fp:
    json.dump(info, fp)

## Подсчёт энтропии

In [None]:
import math

In [None]:
def entropy_monte_carlo(kde, N, random_state = 42):
    """
    Monte Carlo estimate of the entropy of a fitted density estimator.

    Draws ``N`` points from ``kde`` and averages ``-log p(x)`` over them.

    Parameters
    ----------
    kde
        Object providing ``sample(N, random_state)`` and
        ``score_samples(samples)`` (log-density per sample).
    N : int
        Number of points to draw; must be at least 2, otherwise the
        standard-error formula divides by zero.
    random_state
        Passed through to ``kde.sample``.

    Returns
    -------
    (average, standard_deviation)
        The entropy estimate and the standard error of that mean,
        ``sqrt(sum((h_i - average)**2) / (N * (N - 1)))`` with
        ``h_i = -log p(x_i)``.
    """
    samples = kde.sample(N, random_state)
    neg_log_prob = -np.asarray(kde.score_samples(samples))

    average = math.fsum(neg_log_prob) / N

    # Deviations must be taken between -log p and its own mean. Comparing
    # +log p against the mean of -log p would add a spurious (2 * average)**2
    # to every term and inflate the error.
    squared_deviations = (neg_log_prob - average) ** 2
    standard_deviation = np.sqrt(math.fsum(squared_deviations) / (N * (N - 1)))

    return average, standard_deviation

In [None]:
# Monte Carlo entropy estimate using as many draws as there are codes.
entropy, entropy_error = entropy_monte_carlo(kde_codes, len(codes_pca))
entropy_error *= 3.3 # Student's t coefficient.
print("H: %f, errH: %f" % (entropy, entropy_error))

In [None]:
# Record the Monte Carlo entropy and its (already scaled) error.
info['MC entropy'] = entropy
info['MC entropy error'] = entropy_error

# Save the run information.
with open(rel_path + 'info.json', 'w') as fp:
    json.dump(info, fp)

In [None]:
def _lvo_step(bandwidth, samples, i):
    """
    Log-density of ``samples[i]`` under a Gaussian KDE fitted to all other rows.

    Parameters
    ----------
    bandwidth : float
        KDE bandwidth.
    samples : numpy.ndarray
        2-D array with one sample per row; it is not modified.
    i : int
        Index of the row to leave out and then score.

    Returns
    -------
    float
        ``log p_{-i}(samples[i])``.
    """
    # np.delete returns a new array and flattens the input unless an axis is
    # given, so the result must be kept and the row axis named explicitly.
    lvo_samples = np.delete(samples, i, axis=0)

    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(lvo_samples)
    return kde.score_samples([samples[i]])[0]

In [None]:
def entropy_leave_one_out_parallel(bandwidth, samples, random_state = 42):
    """
    Leave-one-out entropy estimate, with the per-sample steps run via joblib.

    For every row ``i`` the log-density of ``samples[i]`` is obtained from
    ``_lvo_step``; the entropy estimate is the mean of the negated values.

    Parameters
    ----------
    bandwidth : float
        KDE bandwidth passed to every leave-one-out step.
    samples : array-like
        Samples, one per row; at least 2 are required, otherwise the
        standard-error formula divides by zero.
    random_state
        Unused; kept for interface compatibility.

    Returns
    -------
    (average, standard_deviation)
        The entropy estimate and the standard error of that mean.
    """
    N = len(samples)
    log_prob = np.array(Parallel(n_jobs=2, verbose=10, batch_size=2)(delayed(_lvo_step)(bandwidth, samples, i) for i in range(N)))
    neg_log_prob = -log_prob

    average = math.fsum(neg_log_prob) / N

    # Deviations are taken between -log p and its own mean. Comparing +log p
    # against the mean of -log p would inflate the error by a spurious
    # (2 * average)**2 per term.
    squared_deviations = (neg_log_prob - average) ** 2
    standard_deviation = np.sqrt(math.fsum(squared_deviations) / (N * (N - 1)))

    return average, standard_deviation

In [None]:
# Leave-one-out entropy estimate with the bandwidth selected above.
entropy, entropy_error = entropy_leave_one_out_parallel(kde_codes.get_params()['bandwidth'], codes_pca)
entropy_error *= 3.3 # Student's t coefficient.
print("H: %f, errH: %f" % (entropy, entropy_error))

In [None]:
# Record the leave-one-out entropy and its (already scaled) error.
info['LOO entropy'] = entropy
info['LOO entropy error'] = entropy_error

# Save the run information.
with open(rel_path + 'info.json', 'w') as fp:
    json.dump(info, fp)

## Оценка размерности кодов

In [None]:
# Ball tree over the PCA-whitened codes, used for the radius queries below.
tree_codes = BallTree(codes_pca, leaf_size=8)

In [None]:
def calc_pairs(tree, samples, radius):
    """Count unordered pairs of distinct samples lying within `radius` of each other.

    Subtracting ``len(samples)`` removes one hit per query point.
    NOTE(review): this assumes `samples` are the same points the tree was
    built from, so each point finds itself; confirm against callers.
    """
    neighbour_counts = tree.query_radius(samples, r=radius, count_only=True)
    ordered_pairs = sum(neighbour_counts) - len(samples)
    # Every pair was counted from both of its endpoints.
    return ordered_pairs // 2

In [None]:
# Total number of unordered pairs of distinct codes.
max_pairs = len(codes_pca) * (len(codes_pca) - 1) // 2
print(max_pairs)

In [None]:
def ineq_binary_search(func, a, b, rel_eps = 0.01):
    """Bisect [a, b] until its relative width ``|1 - a / b|`` is at most `rel_eps`.

    At each step the midpoint replaces the lower bound when ``func(midpoint)``
    is truthy and the upper bound otherwise. Returns the final ``(a, b)``.
    """
    while np.abs(1 - a / b) > rel_eps:
        print("Бинарный поиск: [%f, %f]" % (a, b))
        midpoint = (a + b) / 2
        a, b = (midpoint, b) if func(midpoint) else (a, midpoint)

    return a, b

In [None]:
# Initial guess.
min_radius_a = 0.0
min_radius_b = 2.0
max_radius_a = 5.0
max_radius_b = 50.0

# min_radius: lower end of the bracket around the radius where the first pair appears.
# max_radius: upper end of the bracket around the radius where all pairs are in range.
min_radius, _ = ineq_binary_search(lambda x: calc_pairs(tree_codes, codes_pca, x) == 0, min_radius_a, min_radius_b)
_, max_radius = ineq_binary_search(lambda x: calc_pairs(tree_codes, codes_pca, x) < max_pairs, max_radius_a, max_radius_b)

# Check that the initial brackets really contained both transitions.
assert calc_pairs(tree_codes, codes_pca, min_radius) == 0
assert calc_pairs(tree_codes, codes_pca, max_radius) == max_pairs

In [None]:
# Record the radius range.
info['min_radius'] = min_radius
info['max_radius'] = max_radius

# Save the run information.
with open(rel_path + 'info.json', 'w') as fp:
    json.dump(info, fp)

In [None]:
# Number of radii at which pairs are counted.
resolution = 64

# Log-spaced radii from min_radius to max_radius, with the pair count at each one.
grid = np.logspace(np.log10(min_radius), np.log10(max_radius), resolution)
pairs = np.zeros(resolution, dtype='int64')
for i in range(resolution):
    pairs[i] = calc_pairs(tree_codes, codes_pca, grid[i])
    print("%f, %d" % (grid[i], pairs[i]))
    
    #if pairs[i] == max_pairs:
    #    break

In [None]:
# Write one (radius, pair count) row per grid point. The context manager
# closes the file so every row is flushed to disk; newline='' is what the csv
# module expects for files it writes.
with open(rel_path + "pairs.csv", 'w', newline='') as pairs_file:
    writer = csv.writer(pairs_file)
    for i in range(resolution):
        writer.writerow([grid[i], pairs[i]])

In [None]:
# Two-column table: log(radius) and log(fraction of all pairs within that radius).
log_grid__pairs = np.column_stack((grid, pairs))
for i in range(resolution):
    log_grid__pairs[i][0] = np.log(log_grid__pairs[i][0])
    # NOTE(review): a zero pair count (expected at the smallest radius, given the
    # earlier `== 0` assertion on min_radius) yields -inf here; confirm that is intended.
    log_grid__pairs[i][1] = np.log(log_grid__pairs[i][1] / max_pairs)

np.savetxt(rel_path + "log_pairs.csv", log_grid__pairs, delimiter=",", newline='\n')