## Data files 

mofid_test.cvs/.json - normalized dictionary of MOFid

test.csv/.json - normalized all data (mofid, mofkey, space group, cell parameter, organic linker, metal node and etc.)

func.py - script for normalization raw data 

In [2]:
import pandas as pd 

# Import comet_ml at the top of your file
# from comet_ml import Experiment

import numpy as np
import tensorflow as tf
import time
from sklearn.model_selection import train_test_split

## Data preparing 

In [3]:
data = pd.read_csv('03_04_prepare.csv', index_col=0)
train_dataset, test_dataset = train_test_split(data, test_size=0.2)


In [4]:
train_dataset

Unnamed: 0_level_0,simples_nodes,simples_linkers,simples,topology,cat,a,b,c,alpha,beta,gamma,cell volume,space group
cifname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4514475,0.624697,0.624419,0.606034,0.004525,0.000000,0.199092,0.298744,0.282894,0.515868,0.354983,0.480222,0.029148,0.884146
4328821,0.476998,0.033721,0.448092,0.067873,0.000000,0.365097,0.109990,0.196030,0.515868,0.939247,0.480222,0.011772,0.085366
1561679,0.096852,0.075581,0.063886,0.036199,0.000000,0.077958,0.157285,0.135446,0.515868,0.426464,0.480222,0.004161,0.152439
4508927,0.026634,0.604651,0.586513,0.004525,0.142857,0.081858,0.088397,0.093451,0.515868,0.570164,0.480222,0.001831,0.378049
4341158,0.537530,0.526744,0.501331,0.638009,0.000000,0.410854,0.410982,0.317569,0.515868,0.354983,0.480222,0.081565,0.018293
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4125916,0.009685,0.272093,0.244011,0.004525,0.142857,0.244661,0.112217,0.042540,0.515868,0.378132,0.480222,0.003490,0.085366
7707154,0.036320,0.974419,0.974268,0.036199,0.142857,0.150533,0.048127,0.111594,0.515868,0.690816,0.480222,0.002197,0.054878
4303505,0.036320,0.093023,0.370896,0.352941,0.000000,0.080809,0.096704,0.097403,0.515868,0.354983,0.480222,0.002118,0.146341
4342062,0.544794,0.537209,0.513753,0.036199,0.000000,0.088868,0.291701,0.062077,0.515868,0.653055,0.480222,0.004086,0.054878


In [5]:
train_dataset = train_dataset.to_numpy()
test_dataset = test_dataset.to_numpy()

In [None]:
train_set = int(data.shape[0]*0.8)
test_set = data.shape[0]-train_set
batch_size = 32 

In [30]:
# Shuffle data
train_dataset = (tf.data.Dataset.from_tensor_slices(data_train)
                 .shuffle(train_size).batch(batch_size))
test_dataset = (tf.data.Dataset.from_tensor_slices(data_test)
                .shuffle(test_size).batch(batch_size))

NameError: name 'train_images' is not defined

## Variational autoencoder

In [19]:
class CVAE(tf.keras.Model):
  """Convolutional variational autoencoder."""

  def __init__(self, latent_dim):
    super(CVAE, self).__init__()
    self.latent_dim = latent_dim
    self.encoder = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(13,)),
            tf.keras.layers.Dense(128, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.3, name = 'dropout1'),
            tf.keras.layers.Dense(32, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.3, name = 'dropout2'),
            tf.keras.layers.Dense((latent_dim,)),
        ]
    )

    self.decoder = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
            tf.keras.layers.Dense(32, activation=tf.nn.relu),
            tf.keras.layers.Dense(128, activation=tf.nn.relu),
            tf.keras.layers.Dense(13)
        ]
    )

  @tf.function
  def sample(self, eps=None):
    if eps is None:
      eps = tf.random.normal(shape=(100, self.latent_dim))
    return self.decode(eps, apply_sigmoid=True)

  def encode(self, x):
    mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2)
    return mean, logvar

  def reparameterize(self, mean, logvar):
    eps = tf.random.normal(shape=mean.shape)
    return eps * tf.exp(logvar * .5) + mean

  def decode(self, z, apply_sigmoid=False):
    logits = self.decoder(z)
    if apply_sigmoid:
      probs = tf.sigmoid(logits)
      return probs
    return logits

In [20]:
optimizer = tf.keras.optimizers.Adam(1e-4)


def log_normal_pdf(sample, mean, logvar, raxis=1):
  log2pi = tf.math.log(2. * np.pi)
  return tf.reduce_sum(
      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
      axis=raxis)


def compute_loss(model, x):
  mean, logvar = model.encode(x)
  z = model.reparameterize(mean, logvar)
  x_logit = model.decode(z)
  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
  logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])
  logpz = log_normal_pdf(z, 0., 0.)
  logqz_x = log_normal_pdf(z, mean, logvar)
  return -tf.reduce_mean(logpx_z + logpz - logqz_x)


@tf.function
def train_step(model, x, optimizer):
  """Executes one training step and returns the loss.

  This function computes the loss and gradients, and uses the latter to
  update the model's parameters.
  """
  with tf.GradientTape() as tape:
    loss = compute_loss(model, x)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [8]:
latent_dim = 2
epochs = 30

In [21]:
model = CVAE(2)


TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'

In [10]:
for train_z in train_dataset:
    print(train_z)


[0.62469734 0.6244186  0.60603372 0.00452489 0.         0.19909188
 0.29874358 0.28289407 0.51586753 0.35498328 0.48022246 0.02914782
 0.88414634]
[0.47699758 0.03372093 0.44809228 0.0678733  0.         0.36509652
 0.10998954 0.19602969 0.51586753 0.93924735 0.48022246 0.01177195
 0.08536585]
[0.0968523  0.0755814  0.06388642 0.0361991  0.         0.0779576
 0.15728527 0.1354457  0.51586753 0.42646368 0.48022246 0.00416107
 0.15243902]
[0.02663438 0.60465116 0.58651287 0.00452489 0.14285714 0.08185835
 0.08839709 0.093451   0.51586753 0.57016433 0.48022246 0.00183094
 0.37804878]
[0.53753027 0.52674419 0.50133097 0.63800905 0.         0.41085417
 0.41098189 0.31756919 0.51586753 0.35498328 0.48022246 0.08156534
 0.01829268]
[0.14285714 0.75348837 0.75332742 0.05882353 0.14285714 0.19593658
 0.1483851  0.09649662 0.51586753 0.4997937  0.48022246 0.00623563
 0.05487805]
[0.00242131 0.33139535 0.29724933 0.45248869 0.         0.23693902
 0.23710444 0.23366878 0.51586753 0.35498328 0.48022

In [23]:
for epoch in range(1, epochs + 1):
  start_time = time.time()
  for train_x in train_dataset:
    train_step(model, train_x, optimizer)
  end_time = time.time()

  loss = tf.keras.metrics.Mean()
  for test_x in test_dataset:
    loss(compute_loss(model, test_x))
  elbo = -loss.result()
  display.clear_output(wait=False)
  print('Epoch: {}, Test set ELBO: {}, time elapse for current epoch: {}'
        .format(epoch, elbo, end_time - start_time))
  generate_and_save_images(model, epoch, test_sample)

ValueError: in user code:

    File "C:\Users\schir\AppData\Local\Temp/ipykernel_6424/3753019012.py", line 30, in train_step  *
        loss = compute_loss(model, x)
    File "C:\Users\schir\AppData\Local\Temp/ipykernel_6424/3753019012.py", line 12, in compute_loss  *
        mean, logvar = model.encode(x)
    File "C:\Users\schir\AppData\Local\Temp/ipykernel_6424/497618582.py", line 34, in encode  *
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2)
    File "c:\Users\schir\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\schir\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\input_spec.py", line 253, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_2' (type Sequential).
    
    Input 0 of layer "dense_6" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (13,)
    
    Call arguments received by layer 'sequential_2' (type Sequential):
      • inputs=tf.Tensor(shape=(13,), dtype=float64)
      • training=None
      • mask=None
