In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm

from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output

In [2]:
df_trn = pd.read_csv('./data/train.csv')

In [3]:
with open('./data/train.csv', 'r') as csv_file:
    data = csv_file.read()

In [4]:
PATH = './data/train/'

In [5]:
all_captions = []
all_img_name_vector = []

for line in data.split('\n')[1:-1]:
    image_id, smiles = line.split(',')
    caption = '<' + smiles + '>'
    full_image_path = PATH + image_id

    all_img_name_vector.append(full_image_path)
    all_captions.append(caption)

train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=42)

# num_examples = 908765 # 학습에 사용할 데이터 수, Baseline에서는 제공된 데이터 모두 사용하였습니다.
num_examples = 1000 # 학습에 사용할 데이터 수, Baseline에서는 제공된 데이터 모두 사용하였습니다.
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]


In [6]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

In [7]:
image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)


In [8]:
encode_train = sorted(set(img_name_vector))

image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

In [9]:
image_dataset

<BatchDataset shapes: ((None, 299, 299, 3), (None,)), types: (tf.float32, tf.string)>

In [10]:

# for i, (img, path) in enumerate(image_dataset):
#     print(i)
#     batch_features = image_features_extract_model(img)
#     batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))

#     for bf, p in zip(batch_features, path):
#         path_of_feature = p.numpy().decode("utf-8")
#         np.save(path_of_feature, bf.numpy())

In [11]:
def calc_max_length(tensor):
    return max(len(t) for t in tensor)

max_length = calc_max_length(train_captions)

In [12]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
tokenizer.fit_on_texts(train_captions)
top_k = len(tokenizer.word_index)

In [13]:
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [14]:
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [15]:
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, cap_vector, test_size=0.02, random_state=42)

In [16]:
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

(980, 980, 20, 20)

In [17]:
BATCH_SIZE = 512
BUFFER_SIZE = 1000
embedding_dim = 512
units = 1024
vocab_size = top_k + 1
num_steps = len(img_name_train) // BATCH_SIZE
features_shape = 2048
attention_features_shape = 64

In [18]:
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8')+'.npy')
    return img_tensor, cap

In [19]:
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
dataset = dataset.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [20]:
dataset_val = tf.data.Dataset.from_tensor_slices((img_name_val, cap_val))
dataset_val = dataset_val.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset_val = dataset_val.batch(BATCH_SIZE)
dataset_val = dataset_val.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [21]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [22]:
class CNN_Encoder(tf.keras.Model):
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [23]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        context_vector, attention_weights = self.attention(features, hidden)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        x = self.fc1(output)

        x = tf.reshape(x, (-1, x.shape[2]))

        x = self.fc2(x)

        return x, state, attention_weights
    
    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [24]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [25]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [26]:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=25)

In [27]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [28]:
loss_plot = []

In [29]:
@tf.function
def train_step(img_tensor, target, validation=False):
    loss = 0
    
    hidden = decoder.reset_state(batch_size=target.shape[0])

    dec_input = tf.expand_dims([tokenizer.word_index['<']] * target.shape[0], 1)
    
    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        for i in range(1, target.shape[1]):
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            dec_input = tf.expand_dims(target[:, i], 1)

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    total_loss = (loss / int(target.shape[1]))
    
    return loss, total_loss

In [32]:
dataset

<PrefetchDataset shapes: (<unknown>, <unknown>), types: (tf.float32, tf.int32)>

In [30]:
EPOCHS = 25

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
    
    loss_plot.append(total_loss / (batch+1))
    
    ckpt_manager.save()

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/(batch+1)))
    
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

UnknownError: FileNotFoundError: [Errno 2] No such file or directory: './data/train/train_83113.png.npy'
Traceback (most recent call last):

  File "C:\Users\dhzns\miniconda3\lib\site-packages\tensorflow_core\python\ops\script_ops.py", line 236, in __call__
    ret = func(*args)

  File "<ipython-input-18-92c2d9830b1b>", line 2, in map_func
    img_tensor = np.load(img_name.decode('utf-8')+'.npy')

  File "C:\Users\dhzns\miniconda3\lib\site-packages\numpy\lib\npyio.py", line 416, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))

FileNotFoundError: [Errno 2] No such file or directory: './data/train/train_83113.png.npy'


	 [[{{node PyFunc}}]]

In [None]:
plt.plot(loss_plot, label='loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.legend()
plt.show()

In [None]:
# 가장 높은 확률로 예측
def predict(img_tensor):
    hidden = decoder.reset_state(batch_size=img_tensor.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * img_tensor.shape[0], 1)
    features = encoder(img_tensor)
    
    result = []
    
    for i in range(max_length):
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predictions = np.argmax(predictions, axis=1)
        result.append(predictions)
        dec_input = tf.expand_dims(predictions, 1)
    
    return np.array(result)

# 비교적 높은 확률들로 예측
def predict_(img_tensor):
    hidden = decoder.reset_state(batch_size=img_tensor.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * img_tensor.shape[0], 1)
    features = encoder(img_tensor)
    
    result = []
    
    for i in range(max_length):
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predictions = tf.random.categorical(predictions, 1)[:, 0].numpy()
        result.append(predictions)
        dec_input = tf.expand_dims(predictions, 1)
    
    return np.array(result)

def map_func_pred(img_name):
    img_tensor = np.load(img_name.decode('utf-8')+'.npy')
    return img_tensor

In [None]:
val_result = []
for batch in tqdm(dataset_val):
    val_result.extend(predict(batch[0]).T)
val_result = np.array(val_result)

In [None]:
preds = []
for rid in range(cap_val.shape[0]):
    pred = ''.join([tokenizer.index_word[i] for i in val_result[rid]])
    pred = pred.split('>')[0]
    preds.append(pred)

In [None]:
error_idx = []
for i, pred in enumerate(preds):
    m = Chem.MolFromSmiles(pred)
    if m == None:
        error_idx.append(i)
error_idx = np.array(error_idx)
error_idx_ = error_idx.copy()

In [None]:
drop_error = []
while True:
    error_idx_dict = {}
    for i, e in enumerate(error_idx_):
        error_idx_dict[i] = e
        
    img_name_val_, cap_val_ = np.array(img_name_val)[error_idx_], np.array(cap_val)[error_idx_]
    dataset_val_ = tf.data.Dataset.from_tensor_slices((img_name_val_, cap_val_))
    dataset_val_ = dataset_val_.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_val_ = dataset_val_.batch(BATCH_SIZE)
    dataset_val_ = dataset_val_.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    
    val_result_ = []
    for batch in dataset_val_:
        val_result_.extend(predict_(batch[0]).T)
    val_result_ = np.array(val_result_)

    preds_ = []
    for rid in range(val_result_.shape[0]):
        pred = ''.join([tokenizer.index_word[i] for i in val_result_[rid] if i not in [0]])
        pred = pred.split('>')[0]
        preds_.append(pred)
    
    for i, pred in enumerate(preds_):
        m = Chem.MolFromSmiles(pred)
        if m != None:
            preds[error_idx_dict[i]] = pred
            drop_idx = np.where(error_idx==error_idx_dict[i])[0]
            drop_error.append(drop_idx[0])
    error_idx_ = np.delete(error_idx, drop_error)
    clear_output(wait=True)
    print(len(list(drop_error)), '/', error_idx.shape[0])
    
    if error_idx.shape[0]-len(list(drop_error)) < 10 :
        break

In [None]:
count = 0
answer = []
for rid, pred in enumerate(preds):
    true = ''.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])[1:-1]
    answer.append(true)
    if true == pred:
        count+=1
print('val_accuracy : ', count/cap_val.shape[0])

In [None]:
score = []
for i, pred in enumerate(preds):
    m1 = Chem.MolFromSmiles(answer[i])
    m2 = Chem.MolFromSmiles(pred)
    
    if m2 != None:
        fp1 = Chem.RDKFingerprint(m1)
        fp2 = Chem.RDKFingerprint(m2)

        similarity = DataStructs.FingerprintSimilarity(fp1,fp2)
    else:
        similarity = 0
    score.append(similarity)
    
print('val_similarity :', np.mean(score))