In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from functions.data_cleaning import DataClean as dc
from functions.figure_plotting import FigurePlot as fp
import matplotlib.ticker as ticker
from matplotlib import rcParams
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns
import yaml
# For NMCE
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from functions.utilities import *
from functions.metrics import *
from vae.vae_torch import VAE
from gan.wgan_torch import Critic, Generator
from nmce.manifold_clustering import MaximalCodingRateReduction, Z_loss, chunk_avg, Gumble_Softmax, get_data, MLP_net
# Layout: enable matplotlib's automatic layout adjustment for every figure
rcParams.update({'figure.autolayout': True})

# Finland NMCE training

In [None]:
# ---------------------------------------------------------------------------
# Colour definitions
# ---------------------------------------------------------------------------
#Avant Garde palette
CB91_Brown = "#7d5a1b"
CB91_Brown_Gray = "#816b51"
CB91_Green = "#294013"
CB91_Light_Green = "#d8ffc4"
CB91_Red = "#84290d"

# Out of palette
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Yellow = "#dfc077"

# Twelve-step gradients (light -> dark)
Green_Grad = ["#d8ffc4", "#bbdfa7", "#9ebf89", "#81a06c", "#63804e", "#466031",
              "#294013", "#24380f", "#1e310b", "#192908", "#132204", "#0e1a00"]
Red_Grad = ["#fff1e3", "#efc6b0", "#df9b7d", "#cf6f4a", "#bf4417", "#ab3b14",
            "#983210", "#84290d", "#70200a", "#5c1707", "#490e03", "#350500"]
Brown_Grad = ["#fffac3", "#efda9d", "#dfc077", "#cfa550",  "#bf8b2a", "#a97a24",
              "#93681e", "#7d5718", "#684512", "#52340c", "#3c2206", "#261100"]

# Default colour cycle used for all plots
color_list = [CB91_Brown, CB91_Green, CB91_Light_Green, CB91_Red,
              CB91_Purple, CB91_Violet,  CB91_Yellow, CB91_Brown_Gray]

#A list of hex colours running between blue and purple
CB91_Grad_BP = ['#2cbdfe', '#2fb9fc', '#33b4fa', '#36b0f8',
                '#3aacf6', '#3da8f4', '#41a3f2', '#449ff0',
                '#489bee', '#4b97ec', '#4f92ea', '#528ee8',
                '#568ae6', '#5986e4', '#5c81e2', '#607de0',
                '#6379de', '#6775dc', '#6a70da', '#6e6cd8',
                '#7168d7', '#7564d5', '#785fd3', '#7c5bd1',
                '#7f57cf', '#8353cd', '#864ecb', '#894ac9',
                '#8d46c7', '#9042c5', '#943dc3', '#9739c1',
                '#9b35bf', '#9e31bd', '#a22cbb', '#a528b9',
                '#a924b7', '#ac20b5', '#b01bb3', '#b317b1']

# ---------------------------------------------------------------------------
# Seaborn / matplotlib theme
# ---------------------------------------------------------------------------
# One set_theme() call configures context, style, palette and font.  The
# former leading `sns.set_theme()` / `sns.set_style("ticks")` calls were
# removed: the themed call below re-applies the default style and context,
# which overwrote everything they had set.  `sns.set` is only an alias of
# `set_theme`, so the preferred name is used.
sns.set_theme(font="Verdana",
              rc={
                  "axes.axisbelow": False,
                  "axes.edgecolor": "lightgrey",
                  "axes.facecolor": "None",
                  "axes.grid": False,
                  "axes.labelcolor": "dimgrey",
                  "axes.spines.right": False,
                  "axes.spines.top": False,
                  "figure.facecolor": "white",
                  "lines.solid_capstyle": "round",
                  "patch.edgecolor": "w",
                  "patch.force_edgecolor": True,
                  "text.color": "dimgrey",
                  "xtick.bottom": False,
                  "xtick.color": "dimgrey",
                  "xtick.direction": "out",
                  "xtick.top": False,
                  "ytick.color": "dimgrey",
                  "ytick.direction": "out",
                  "ytick.left": False,
                  "ytick.right": False})

# Replace the default seaborn palette with the eight project colours.
custom_colors = color_list
sns.set_palette(sns.color_palette(color_list, 8))

# Larger fonts for titles and axis labels in the "notebook" context.
sns.set_context("notebook", rc={"font.size":16,
                                "axes.titlesize":20,
                                "axes.labelsize":18})

In [None]:
# Run-level switches.
# train_new: read by the NMCE training cell to choose between a fresh net and a saved one.
#   NOTE(review): that cell re-assigns this flag itself before reading it.
train_new = True
# causal_model: selects data.get_data_causal() and the "*_causal_*" VAE checkpoint keys.
causal_model = False
# Indices into the "latent_dimensions" lists of the VAE / GAN configuration files.
vae_model_number = 2  # 0=15, 1=30, 2=50, 3=100
gan_model_number = 3

def get_generated(torch_model, dim_batch, lat_feature):
    """Generate a batch by feeding uniform random noise through ``torch_model``.

    Args:
        torch_model: callable (e.g. a decoder or generator) applied to the noise.
        dim_batch: number of noise rows to draw.
        lat_feature: width of every noise row.

    Returns:
        The model output, detached from the autograd graph.
    """
    # torch.rand draws from U[0, 1).
    # NOTE(review): confirm this matches the prior the model was trained with.
    latent_noise = torch.rand(dim_batch, lat_feature)
    with torch.no_grad():
        generated = torch_model(latent_noise)
    return generated.detach()

In [None]:
# Seed the NumPy and PyTorch global RNGs so the random draws below are repeatable.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Locations of the YAML configuration files read by the following cells.
vae_model_configuration_file = "configurations/torch_vae.yaml"
gan_model_configuration_file = "configurations/torch_gan.yaml"
nmce_model_configuration_file = "configurations/torch_nmce.yaml"
preprocessing_configuration_file = "configurations/categorical_preprocessing_configuration.yaml"
plotting_configuration_file = "configurations/print.yaml"
# Plotting helper, configured from the plotting YAML file.
draw = fp(plotting_configuration_file)

In [None]:
# Build the DataClean wrapper around the preprocessed CSV and time its construction.
data_filename = "data/original_preprocessed.csv"
begin = time.time()
data = dc(datafile=data_filename,
          prepared=True,
          configuration_file=preprocessing_configuration_file
          )
# Elapsed seconds for constructing `data`.
duration = time.time() - begin
print(duration)

In [None]:
# VAE settings: the "model" section of the VAE configuration file.
with open(vae_model_configuration_file, 'r') as file:
    cfg = yaml.safe_load(file)
vae_cfg = cfg["model"]

In [None]:
# GAN settings: the "model" section of the GAN configuration file.
with open(gan_model_configuration_file, 'r') as file:
    cfg = yaml.safe_load(file)
gan_cfg = cfg["model"]

In [None]:
# NMCE settings: the whole document is kept (its "globals" section is read below).
with open(nmce_model_configuration_file, 'r') as file:
    cfg = yaml.safe_load(file)
nmce_cfg = cfg

In [None]:
# Working data frame: the causal variant or the standard one, depending on the switch.
if causal_model:
    df = data.get_data_causal()
else:
    df = data.get_data()
df.head()

In [None]:
# Labels handed to the plotting helper, and the figure DPI from the plot configuration.
model_name = nmce_cfg["globals"]["model_name"][0]
model_type = nmce_cfg["globals"]["model_type"]  # "categorical"
fig_dpi = draw.cfg["fig_dpi"]

In [None]:
# NOTE: `x_data` is re-bound to a float tensor in the NMCE section further down.
x_data = data.get_data() # data to run in deep generative models

In [None]:
# Model input width (number of columns of `df`) and the latent sizes selected
# by the *_model_number indices.
feature_dimension = df.shape[1]
vae_latent_dimension = int(vae_cfg["latent_dimensions"][vae_model_number])
gan_latent_dimension = int(gan_cfg["latent_dimensions"][gan_model_number])

In [None]:
print("Vae latent: {}\tWgan latent: {}".format(vae_latent_dimension, gan_latent_dimension))

In [None]:
# Adam hyper-parameters for the GAN, read from its configuration.
gan_learning_rate = float(gan_cfg["learning_rate"][0])
beta_1 = float(gan_cfg["beta1"])
beta_2 = float(gan_cfg["beta2"])

In [None]:
# In causal mode the critic/generator are sized by the column count of the
# standard frame (data.get_data()); otherwise by `feature_dimension`.
if causal_model:
    feature_dimension_base = data.get_data().shape[1]
    critic = Critic(feature_dimension_base, output_dim=1)
    generator = Generator(feature_dimension_base, gan_latent_dimension)
else:
    critic = Critic(feature_dimension, output_dim=1)
    generator = Generator(feature_dimension, gan_latent_dimension)

In [None]:
# Optimisers are created so their saved state can be restored together with the weights.
opt_critic = optim.Adam(critic.parameters(), lr=gan_learning_rate, betas=(beta_1, beta_2))  # GAN + WGAN-GP
opt_generator = optim.Adam(generator.parameters(), lr=gan_learning_rate, betas=(beta_1, beta_2))  # GAN + WGAN-GP

In [None]:
vae_learning_rate = float(vae_cfg["learning_rate"][0])

In [None]:
# The VAE is always sized by `feature_dimension` (the width of `df`).
model_vae = VAE(feature_dimension, vae_latent_dimension)
optimiser = optim.RMSprop(model_vae.parameters(), lr=vae_learning_rate)

In [None]:
# Checkpoint paths are looked up under configuration keys suffixed with the latent
# dimension; the causal variant uses its own "*_causal_*" keys.
if causal_model:
    vae_dict = torch.load(vae_cfg["model_dict_file_causal_" + str(vae_latent_dimension)])
    opt_dict = torch.load(vae_cfg["opt_dict_file_causal_" + str(vae_latent_dimension)])
else:
    vae_dict = torch.load(vae_cfg["model_dict_file_" + str(vae_latent_dimension)])
    opt_dict = torch.load(vae_cfg["opt_dict_file_" + str(vae_latent_dimension)])

# Restore the pretrained weights and the optimiser state.
model_vae.load_state_dict(vae_dict)
optimiser.load_state_dict(opt_dict)

In [None]:
print(gan_cfg["generator_dict_file_100"])

In [None]:
generator.load_state_dict(torch.load(gan_cfg["generator_dict_file_100"]))
opt_generator.load_state_dict(torch.load(gan_cfg["opt_gen_dict_file_100"]))
critic.load_state_dict(torch.load(gan_cfg["critic_dict_file_100"]))
opt_critic.load_state_dict(torch.load(gan_cfg["opt_critic_dict_file_100"]))

In [None]:
# Shorthand handles to the two halves of the pretrained VAE.
encoder = model_vae.encoder
decoder = model_vae.decoder

In [None]:
# Transfer learning can build on a frozen model, or feed the outputs of one
# model into a new one.
#
# Helper functions to freeze / unfreeze a model by toggling `requires_grad`
# on each of its parameters.


def show_model_parameters(model):
    """Print the name and the current value of every parameter of ``model``."""
    for name, para in model.named_parameters():
        print("_"*20)
        print(f"name: {name}")
        print("values: ")
        print(para)


def _set_requires_grad(model, flag, verbose):
    """Set ``requires_grad`` to ``flag`` on all parameters, then optionally print them."""
    for para in model.parameters():
        para.requires_grad = flag
    if verbose:
        show_model_parameters(model)


def freeze_model(model, verbose=True):
    """Exclude every parameter of ``model`` from gradient computation.

    Args:
        model: the ``torch.nn.Module`` to freeze (modified in place).
        verbose: when True (the default), print every parameter afterwards.
    """
    _set_requires_grad(model, False, verbose)


def unfreeze_model(model, verbose=True):
    """Re-enable gradient computation for every parameter of ``model``.

    Args:
        model: the ``torch.nn.Module`` to unfreeze (modified in place).
        verbose: when True (the default), print every parameter afterwards.
    """
    _set_requires_grad(model, True, verbose)


# Create Synthetic Population

1. Decide sample size (default = same as original data)
2. Create `torch_synthetic`, a PyTorch tensor representing the generated data
3. Convert the tensor to a pandas DataFrame
4. Compare original and synthetic population

# Classified Resampling of Synthetic Population

The difference in clustering patterns between the original data and the synthetic populations (from either the GAN or the VAE) shows that these neural networks do not capture the underlying statistical patterns, despite reproducing the complete population well when compared against the original data. This is a known weakness of neural networks that can be counteracted with methods that capture these structures. Neural manifold clustering and embedding (NMCE) is suggested as one such method.

1. Synthetic populations are generated using the pretrained VAE or WGAN models
2. The trained NMCE is used to cluster the original data and sets of synthetic data to demonstrate differences or similarities in clustering
3. For each cluster in the original data, cluster data are extracted and a new batch of synthetic data based on these cluster data created with the pretrained VAE or WGAN. 
4. The combined original and synthetic single-cluster data are then used to train a second VAE, which now specialises in a single cluster.
5. The complete workflow from original data, through a first batch of synthetic data and a second cluster specific VAE or WGAN results in a model that can output a desired number of clusters. We first test this for the complete population, and next to create a profiled synthetic population for regions.

In [None]:
# Create torch_synthetic from pretrained models

In [None]:
# Re-read the standard and the causal frames (this re-binds `df` to the standard one).
df = data.get_data()
df_causal_o = data.get_data_causal()

In [None]:
df.shape

In [None]:
df_causal_o.shape

In [None]:
# One generated row per original record, from the WGAN generator and the VAE decoder.
torch_synthetic_gan = get_generated(generator, df.shape[0], gan_latent_dimension)
torch_synthetic_vae = get_generated(decoder, df.shape[0], vae_latent_dimension)

In [None]:
# Turn the generated tensors into data frames carrying the original column names.
df_s = data.get_synthetic(torch_synthetic_gan, df.columns)
df_sv = data.get_synthetic(torch_synthetic_vae, df.columns)

# Plot Univariate Marginals WGAN

In [None]:
# Column means of the original and the WGAN frame, side by side.
combine = pd.concat([df.mean(), df_s.mean()], axis=1)
wgan_column = "WGAN-" + str(gan_latent_dimension)
combine.columns = ["Original", wgan_column]

In [None]:
# One-hots
draw.plot_compare(data=combine,
                  title="compare",
                  model_type=model_type,
                  model="WGAN",
                  model_name=model_name,
                  #save=True
                  )

# Plot Univariate Marginals VAE

In [None]:
# Same comparison for the VAE sample, built with the project's combine_data helper.
combine_vae = combine_data(df, df_sv, ["Original", "VAE-" + str(vae_latent_dimension)])

In [None]:
# One-hots
draw.plot_compare(data=combine_vae,
                  title="compare",
                  model_type=model_type,
                  model="VAE",
                  model_name=model_name,
                  #save=True
                  )

# Check for similar records

In [None]:
# Number of original rows that repeat an earlier row.
df_o = data.get_data()
duplicate_records = len(df_o)-len(df_o.drop_duplicates())
print (duplicate_records)

In [None]:
df_o[df_o.duplicated()]

In [None]:
# Same count for the WGAN sample (this re-binds `df_s` and `duplicate_records`).
df_s = data.get_synthetic(torch_synthetic_gan, df_o.columns)
duplicate_records = len(df_s)-len(df_s.drop_duplicates())
print (duplicate_records)

In [None]:
df_s[df_s.duplicated()]

# Classification with NMCE

In [None]:
# Drop the last record so that the number of rows divides evenly by the batch size
x_data = torch.tensor(df.iloc[:-1,:].values, dtype=torch.float32)
x_data = x_data + 0.00001  # To avoid 0
runs = 20  # number of outer training runs; a checkpoint is written after each

In [None]:
# Histories filled by the training loop.
c_loss = []
d_loss = []
z_sim_list = []
n_steps = 1000     # passes over the data per run
print_every = 300  # progress is printed every `print_every` steps
bs = 1929 # Gives 10 iterations
n_chunks = 2  # MUST be hardcoded to same value in .py file!!!!
#task variables
amb_dim = feature_dimension  # input dim
lat_dim = 150  # neurons at each layer
z_dim = 100
n_clusters = 20
lambda_ = 40  # is 4000 for the synthetic helix (seems to matter less)
G_Softmax = Gumble_Softmax(0.2, straight_through=False)
criterion = MaximalCodingRateReduction(eps=0.01, gamma=1.0)
criterion_z = Z_loss()

In [None]:
# NOTE: this re-assignment overrides the `train_new` switch set near the top of
# the notebook, so the net is resumed from a checkpoint unless this line is edited.
train_new = False

if train_new:
    # Create new instance of net
    net = MLP_net(amb_dim, lat_dim, z_dim, n_clusters)
else:
    # Using the last saved to keep training:
    net = torch.jit.load('models/nmce/fin/late_nmce_model_RUN_15_1929_230_125.06042_21.24433_0.69456scripted.pt')
# The optimiser settings are the same for a fresh and a resumed net.
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9,0.99), weight_decay=0.00001)

# Experiment log
#   Latest 20 runs with random noise (x + noise)/2
#   Start from last model
#   New 20 runs with random noise (x + x + noise)/2
#   Crashed at run 15 ---> start again from this
#
# Batch sizes to use (bs = batch size)
#   NORWAY:  n = 24270 (all)
#     1618 give 15 steps
#     2427 give 10 steps
#   FINLAND: n = 19290 (skip one record)
#     643  give 30 steps
#     1286 give 15 steps (first runs using this)
#     1929 give 10 steps
#     3858 give 5 steps

begin = time.time()  # start of the whole training session
for run in range(runs):
    run_begin = time.time()
    for i in range(n_steps):
        # A fresh shuffled pass over the data for every step.
        loader = iter(DataLoader(dataset=x_data, batch_size=bs, shuffle=True))
        n_batches = len(loader)
        # Run one batch and update grads
        for j in range(n_batches):
            x = next(loader)
            # Create augmented data from the pretrained VAE: encode a 3:1 blend of
            # the batch and uniform noise, then decode it.  The noise is sized from
            # the batch itself so a short final batch cannot cause a shape error.
            noise = torch.rand(x.shape[0], x.shape[1])
            # The VAE is not optimised here, so no autograd graph is needed for it.
            with torch.no_grad():
                aug_latent = encoder((x + x + x + noise) / 4)
                xn = decoder(aug_latent)
            # Stack the augmented view on top of the original batch.
            xt = torch.cat((xn, x), dim=0).float()
            z, logits = net(xt)
            loss_z, z_sim = criterion_z(z)
            z_sim = z_sim.mean()
            prob = G_Softmax(logits)
            # n_chunks matches the two stacked views (augmented, original).
            z, prob = chunk_avg(z, n_chunks=n_chunks, normalize=True), chunk_avg(prob, n_chunks=n_chunks)
            loss, loss_list= criterion(z,prob,num_classes=n_clusters)
            loss += lambda_ * loss_z
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Record the losses for the first and the last batch of the pass, i.e.
            # two history entries per step.  Written as an explicit comparison so a
            # single-batch pass no longer divides by zero.
            if j == 0 or j == n_batches - 1:
                c_loss.append(loss_list[0])
                d_loss.append(loss_list[1])
                z_sim_list.append(z_sim.item())
        if i % print_every == 0:
            print('{} steps, loss c {:.5f}, loss d {:.5f}, z sim {:.5f}'.format(i+1,loss_list[0],loss_list[1],z_sim.item()))
    # Measure this run; previously a stale value from the data-loading cell was printed.
    duration = time.time() - run_begin
    print("Duration for run {} is {}".format(run, duration))
    print("C_loss: ", c_loss[-1])
    print("D_loss: ", d_loss[-1])
    print("Z_sim: ", z_sim_list[-1])
    # Checkpoint after every run (the former condition was always true).
    model_scripted = torch.jit.script(net) # Export to TorchScript
    model_scripted.save('models/nmce/fin/late_nmce_model_RUN_{}_{}_{}_{:.5f}_{:.5f}_{:.5f}scripted.pt'.format(run, bs, feature_dimension, c_loss[-1], d_loss[-1], z_sim_list[-1])) # Save

In [None]:
# Save the current net once more; the file name is built from the last `run`
# index and the most recent history entries.
model_scripted = torch.jit.script(net) # Export to TorchScript
model_scripted.save('models/nmce/fin/late_nmce_model_RUN_{}_{}_{}_{:.5f}_{:.5f}_{:.5f}scripted.pt'.format(run, bs, feature_dimension, c_loss[-1], d_loss[-1], z_sim_list[-1])) # Save

In [None]:
# Seconds elapsed since `begin` was last set.
duration = time.time() - begin

In [None]:
save_curves = True
plt.plot(c_loss[12::2], label="c-loss")
plt.xlabel('Epochs')
plt.ylabel('C-Loss')
plt.legend()
if save_curves:
    plt.savefig("figures/nmce-experimental-fin/Next_Normal_C_loss_" + str(n_steps * runs) +
                "_clusters_" + str(n_clusters) +
                "_batch_" + str(bs) +
                "_features_" + str(feature_dimension) +
                "_lat_" + str(vae_latent_dimension) + 
                ".png", dpi=200)

plt.show()

In [None]:
plt.plot(d_loss[12::2], label="d-loss")
plt.xlabel('Epochs')
plt.ylabel('D-Loss')
plt.legend()
if save_curves:
    plt.savefig("figures/nmce-experimental-fin/Next_Normal_D_loss_" + str(n_steps * runs) +
                "_clusters_" + str(n_clusters) +
                "_batch_" + str(bs) +
                "_features_" + str(feature_dimension) +
                "_lat_" + str(vae_latent_dimension) + 
                ".png", dpi=200)
plt.show()

In [None]:
plt.plot(z_sim_list[12::2], label="z-sim")
plt.xlabel('Epochs')
plt.ylabel('Z-Sim')
plt.legend()
if save_curves:
    plt.savefig("figures/nmce-experimental-fin/Next_Normal_Z_sim_" + str(n_steps * runs) +
                "_clusters_" + str(n_clusters) +
                "_batch_" + str(bs) +
                "_features_" + str(feature_dimension) +
                "_lat_" + str(vae_latent_dimension) + 
                ".png", dpi=200)

plt.show()

In [None]:
df_s_new = data.get_synthetic(torch_synthetic_vae, df.columns)

In [None]:
x1S = torch.tensor(df_s_new.values, dtype=torch.float32) # synthetic
x2O = torch.tensor(df_o.values, dtype=torch.float32) # original

In [None]:
df_preds_1 = get_predictions(net, df_s_new)
df_preds_2 = get_predictions(net, df)

In [None]:
df_preds_1_.value_counts()

In [None]:
df_preds_1 = pd.DataFrame(preds_1.numpy()) # Synthetic data
df_preds_2 = pd.DataFrame(preds_2.numpy()) # Original data

In [None]:
# value_counts() sorts by frequency, so the first index entry is the most common
# cluster among the original rows.
highest_cluster = df_preds_2.value_counts().index.tolist()
highest_cluster

In [None]:
# Index entries are indexed twice here: first the most frequent entry, then its cluster id.
highest = highest_cluster[0][0]
highest

In [None]:
df_o_highest = data.get_data_recategorised()
df_o_highest["cluster"] = df_preds_2
df_s_highest = data.get_synthetic_recategorised(torch_synthetic_gan, df.columns)
df_s_highest["cluster"] = df_preds_1

In [None]:
df_o_highest = df_o_highest.loc[df_o_highest["cluster"] == highest].copy()
df_s_highest = df_s_highest.loc[df_s_highest["cluster"] == highest].copy()

In [None]:
# One original-vs-synthetic plot per categorical column, restricted to the selected cluster.
for col in data.categorical:
    # Pass an explicit scale_max only when the most frequent value has more
    # than 11000 rows; otherwise plot_two receives None.
    if df_o_highest[col].value_counts().max() > 11000:
        scale_max=df_o_highest[col].value_counts().max()
    else:
        scale_max=None
    # Columns whose minimum is below 1 get one extra bin and a scale_min of 0.
    if (data.min_values[col] < 1):
        n_bins = int(data.max_bins[col] + 1)
        scale_min = 0
    else:
        scale_min = 1
        n_bins = int(data.max_bins[col])
    draw.plot_two(df_original=df_o_highest, 
                  df_synthetic=df_s_highest,
                  title=col,
                  hue_value="isFemale",
                  n_bins=n_bins,
                  model_type=model_type,
                  model="vae_highest_cluster_only ",
                  model_name=model_name,
                  scale_max=scale_max,
                  scale_min=scale_min,
                  #save=True
                  )

In [None]:
# One-hot frames with the predicted cluster attached to every row.
df_o_highest_hot = data.get_data()
df_o_highest_hot["cluster"] = df_preds_2
# The synthetic labels in `df_preds_1` were predicted on the VAE sample, so the
# frame is built from the VAE tensor as well (the WGAN tensor was used before).
df_s_highest_hot = data.get_synthetic(torch_synthetic_vae, df.columns)
df_s_highest_hot["cluster"] = df_preds_1

# Keep only the rows assigned to the largest original cluster.
df_o_highest_hot = df_o_highest_hot.loc[df_o_highest_hot["cluster"] == highest].copy()
df_s_highest_hot = df_s_highest_hot.loc[df_s_highest_hot["cluster"] == highest].copy()


# Column sums divided by the number of rows, leaving out the trailing "cluster" column.
original_highest_hot = (df_o_highest_hot.iloc[:,:-1].sum(axis=0) / df_o_highest_hot.shape[0])
synthetic_highest_hot = (df_s_highest_hot.iloc[:,:-1].sum(axis=0) / df_s_highest_hot.shape[0])

combine_highest_hot = pd.concat([original_highest_hot, synthetic_highest_hot], axis=1)
# Label corrected from "WGAN": the synthetic column now comes from the VAE sample.
combine_highest_hot.columns = ["Original", "VAE"]

In [None]:
combine_highest_hot.head()

In [None]:
"""
Select the cluster with greatest number of examples and compare variables between
original and syntetic data.

"""
# Bar comparison of the per-column shares inside the largest cluster.
draw.plot_compare(data=combine_highest_hot,
                  title="compare",
                  model_type=model_type,
                  model="VAE_highest_cluster",
                  model_name=model_name,
                  #save=True
                  )

In [None]:
# Relative cluster frequencies of the original and the synthetic predictions,
# aligned on the cluster id; clusters missing on one side become 0.
combine_preds = pd.concat([df_preds_2.value_counts()/df_preds_2.shape[0], df_preds_1.value_counts()/df_preds_1.shape[0]], axis=1)
combine_preds.columns = ["Original", "Synthetic"]
combine_preds = combine_preds.replace(np.nan, 0)

print("combine_preds shape: ", combine_preds.shape)

In [None]:
"""
Compare frequencies on found clusters
"""
draw.plot_compare(data=combine_preds,
                  title="compare_all",
                  model_type=model_type,
                  model="augmented_clusters",
                  model_name=model_name,
                  #save=True
                  )

In [None]:
# Copies of the one-hot frames with the predicted cluster as an extra last column.
df_original_cluster = df_o.copy()
df_original_cluster["cluster"] = df_preds_2

In [None]:
df_synthetic_cluster = df_s_new.copy()
df_synthetic_cluster["cluster"] = df_preds_1

In [None]:
df_original_cluster.head()

In [None]:
# Regression of "getHelp" on the cluster id, one panel per "hasIncome" value,
# coloured by "isFemale".
sns.lmplot(data=df_original_cluster, 
           x="cluster", y="getHelp", 
           col="hasIncome", 
           hue="isFemale",
           ci=95,
           seed=42,
           x_ci="ci",
           fit_reg=True,
           height=6)

In [None]:
# NOTE(review): the cluster id 2 is hard-coded although the names say "13" and
# the note below speaks of the second largest cluster - confirm the intended id.
orig_13_o = df_original_cluster.copy()
orig_13_s = df_synthetic_cluster.copy()
orig_13_original = orig_13_o.loc[orig_13_o["cluster"] == 2].copy()
orig_13_synthetic = orig_13_s.loc[orig_13_s["cluster"] == 2].copy()
# Create a combo of original and synthetic data with the same predicted category
# Trying to "recreate" this category in the vae
orig_13 = pd.concat([orig_13_original, orig_13_synthetic], axis=0)
# Drop the trailing "cluster" column so only the model features remain.
orig_13 = orig_13.iloc[:,:-1]

In [None]:
# Column means inside the selected cluster, original next to synthetic.
combine_13 = pd.concat([orig_13_original.iloc[:,:-1].mean(), orig_13_synthetic.iloc[:,:-1].mean()], axis=1)
combine_13.columns = ["SecondClusterOriginal", "SecondClusterSynthetic"]

In [None]:
"""
Select the cluster with second greatest number of examples and compare variables between
original and syntetic data.

"""
draw.plot_compare(data=combine_13,
                  title="compare",
                  model_type=model_type,
                  model="VAE_second_highest_cluster",
                  model_name=model_name,
                  #save=True
                  )

In [None]:
orig_13.shape

In [None]:
# Dtype the frame would have as a tensor without an explicit cast.
print(torch.from_numpy(orig_13.values).dtype)

In [None]:
# Create a batch of "same category" augmented data and test it with the classifier
# orig_13.values should be a tensor
# The rows are cast to float32, pushed through the pretrained encoder and decoder,
# and the result is detached and converted to a NumPy array.
synth_13_lat = encoder(torch.as_tensor(orig_13.values.astype(np.float32)))
synth_13_tf = decoder(synth_13_lat)
synth_13 = np.array(synth_13_tf.detach().numpy())

In [None]:
x1S13 = torch.tensor(synth_13, dtype=torch.float32)

# Cluster predicted by the NMCE net for every reconstructed row (argmax of the logits).
with torch.no_grad():
    z, logits = net(x1S13)
    pred_s_13 = logits.max(dim=1)[1]
x1S13 = x1S13.detach()

In [None]:
# How the reconstructed rows are distributed over the clusters.
synth_13_df = pd.DataFrame(pred_s_13.numpy())
synth_13_df.value_counts()

In [None]:
synth_13_df.shape

In [None]:
def kl_loss(mu, log_var):
    """Return the KL term ``-0.5 * sum(1 + log_var - exp(log_var) - mu**2)``.

    Args:
        mu: tensor of latent means.
        log_var: tensor of log-variances, i.e. log(sigma ** 2); same shape as ``mu``.

    Returns:
        A scalar tensor: the sum is taken over all elements of the inputs.
    """
    variance = torch.exp(log_var)
    per_element = 1 + log_var - variance - mu.pow(2)
    return torch.sum(per_element) * -0.5


# Reconstruction loss: binary cross-entropy summed (not averaged) over all elements.
loss_fn = nn.BCELoss(reduction="sum")

In [None]:
model_vae.trainable = False
decoder.trainable = False
encoder.trainable = False
number_epochs = 40
# Train a new vae on small-size classes derived from this set-up
model_13 = VAE(x_data.shape[1], vae_latent_dimension)
optimiser_13 = optim.RMSprop(model_13.parameters(), lr=vae_learning_rate)
model_13.train()

In [None]:
torch_data = torch.tensor(orig_13.values, dtype=torch.float32)
batch_size = 128
beta_vae = 0.5
loader = DataLoader(torch_data, batch_size=batch_size, shuffle=True)

In [None]:
"""
Train VAE model to produce the second largest cluster in original data

"""

collect_loss = []
for epoch in range(number_epochs):
    for batch_idx, (real) in enumerate(loader):
        batch_size = real.shape[0]
        replica, z_mean, z_sigma = model_vae(real)
        reconstruction_loss = loss_fn(replica, real)
        kl = beta_vae * (kl_loss(z_mean, z_sigma) / real.shape[1])
        loss = reconstruction_loss + kl
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        if batch_idx % 20 == 0 and batch_idx > 0:
            print(f"Epoch [{epoch} / {number_epochs}] \ "
                  f"KL Loss: {kl:4f}, Rep Loss: {reconstruction_loss:.4f}")
            collect_loss.append((kl, reconstruction_loss))

In [None]:
encoder_13 = model_13.encoder
decoder_13 = model_13.decoder

In [None]:
for name, para in encoder_13.named_parameters():
    print("_"*20)
    print(f"name: {name}")
    print("values: ")
    print(para)

In [None]:
# Decode standard-normal latent samples with the pretrained decoder.
# NOTE(review): get_generated() draws uniform noise instead - confirm which
# prior the decoder expects.
fake = torch.randn(df.shape[0], vae_latent_dimension)
torch_s = model_vae.decoder(fake)

In [None]:
# Encode and decode the selected-cluster rows with the pretrained VAE.
start = time.time()
x_enc = encoder(torch.from_numpy(orig_13.values.astype(np.float32)))
x_dec = decoder(x_enc)

In [None]:
# Generate as many rows as `orig_13` holds with the cluster-specific decoder.
# (`tf_` is only a name prefix; the value is a torch tensor.)
tf_synthetic_c = get_generated(decoder_13, orig_13.shape[0], vae_latent_dimension)

In [None]:
df_synthetic_c = data.get_synthetic(tf_synthetic_c, orig_13.columns)

In [None]:
cat_data = torch.tensor(df_synthetic_c.values, dtype=torch.float32)

In [None]:
# Cluster predicted by the NMCE net for every generated row (argmax of the logits).
with torch.no_grad():
    z, logits = net(cat_data)
    pred_cat = logits.max(dim=1)[1]
cat_data = cat_data.detach()

In [None]:
df_13_cat = pd.DataFrame(pred_cat.numpy())

In [None]:
# Distribution of the generated rows over the clusters.
df_13_cat.value_counts()

In [None]:
df_synthetic_c["cluster"] = pred_cat

In [None]:
# Synthetic population of cluster 6
df_double = df_synthetic_c.loc[df_synthetic_c["cluster"] == 8]

In [None]:
# Features of the selected rows (the trailing "cluster" column is dropped).
cat_double = torch.tensor(df_double.iloc[:,:-1].values, dtype=torch.float32)

In [None]:
df_13_cat.shape

In [None]:
# Classify the selected rows once more with the NMCE net.
with torch.no_grad():
    z, logits = net(cat_double)
    pred_double = logits.max(dim=1)[1]
cat_double = cat_double.detach()

In [None]:
df_13_double = pd.DataFrame(pred_double.numpy())
df_13_double.value_counts()

In [None]:
# Load the torch model (Saved trained net model)
# NOTE(review): the path points at a 'nor' checkpoint with 157 in its name,
# while the checkpoints written by this notebook live under 'fin' and carry
# the feature dimension (230 in the resumed file) at that position - confirm
# this model accepts the feature width of the data used here.
nmce_model = torch.jit.load('models/nmce/nor/nmce_model_RUN_19_1618_157_23.49887_22.89522_0.97810scripted.pt')
nmce_model.eval()

In [None]:
# Re-binds `df_synthetic_c`.
# NOTE(review): unlike the other get_synthetic calls, no column list is passed
# here - confirm the helper's default.
df_synthetic_c = data.get_synthetic(tf_synthetic_c)

In [None]:
cat_nmce_data = torch.tensor(df_synthetic_c.values, dtype=torch.float32)

In [None]:
# Cluster predicted by the loaded model for every generated row (argmax of the logits).
with torch.no_grad():
    z, logits = nmce_model(cat_nmce_data)
    pred_nmce_cat = logits.max(dim=1)[1]
cat_nmce_data = cat_nmce_data.detach()

In [None]:
df_13_cat_nmce = pd.DataFrame(pred_nmce_cat.numpy())

In [None]:
df_13_cat_nmce.value_counts()