In [None]:
!pip install rdkit-pypi




In [None]:
from rdkit import Chem
from rdkit.Chem import Crippen
import sys
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
import random as rn
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from tensorflow.keras.utils import Sequence

from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
%matplotlib inline

%tensorflow_version 2.x
import tensorflow as tf

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# Seed every random number generator the notebook relies on so that the data
# split, the latent-space noise and the network initialisation are repeatable.
seed = 23
np.random.seed(seed)
rn.seed(seed)
# TensorFlow keeps its own generator; without this call weight initialisation
# and dropout differ between runs even though NumPy and `random` are seeded.
tf.random.set_seed(seed)

In [None]:
# Read the dataset from disk.
csv_path = "/content/dosya_yeni.csv"

# This is the same dataset used in the earlier study, but for drug discovery
# only the active class (cancer_active == 1) is needed; class 0 is discarded.
df = pd.read_csv(csv_path)

# Drop rows without a SMILES string *before* casting: astype('str') would turn
# a missing value into the literal text "nan", which would then pass the
# filters below and be trained on as if it were a molecule.
df = df.dropna(subset=['smiles'])
df['smiles'] = df['smiles'].astype('str')
mask = (df['cancer_active']==1)
df = df.loc[mask]
# Keep only strings shorter than 128 characters.
mask = (df['smiles'].str.len() < 128)
df = df.loc[mask]
smiles = df['smiles']
data = df['smiles']
# 80/20 split into training (later split again for validation) and test data.
full_train, test = train_test_split(data, test_size=0.2, random_state=seed)
len(data)

In [None]:
# Hold out 10% of the training portion as a validation set, using the same
# seed as the train/test split.
val_split = 0.10
train, val_set = train_test_split(full_train, test_size=val_split, random_state=seed)

In [None]:
# Character vocabulary: the position of a character in the string below is its
# integer id. "$" is the start-of-sequence token and "X" the padding token
# used when the SMILES strings are one-hot encoded.
char_to_int = {
    ch: idx
    for idx, ch in enumerate(
        "n[\\EH)B92]7!tsocK-/lAr@C=6NLa5ST#+Pi(81IeO3F4.ZbGdgmUuX$Rh%0"
    )
}
# Reverse lookup. Keys are the ids rendered as strings ("0", "1", ...).
int_to_char = {str(idx): ch for ch, idx in char_to_int.items()}
# Size of the one-hot dimension.
n_vocab = len(char_to_int)

In [None]:
def vectorize(smiles, embed, n_vocab):
    """One-hot encode SMILES strings into teacher-forcing input/target pairs.

    Every string is laid out as the start token ``"$"``, then its characters,
    then the padding token ``"X"`` repeated up to ``embed`` positions. The
    network is fed these tensors so that it can pick up the grammar of SMILES
    strings.

    Args:
        smiles: Sized iterable of SMILES strings (list, pandas Series, array).
        embed: Number of positions in the padded layout. Each string must be
            at most ``embed - 1`` characters long.
        n_vocab: Size of the one-hot dimension.

    Returns:
        Tuple ``(inputs, targets)`` of int8 arrays, both of shape
        ``(len(smiles), embed - 1, n_vocab)``. ``targets`` is ``inputs``
        shifted one position ahead.

    Raises:
        KeyError: A character is missing from ``char_to_int``.
        IndexError: A string is longer than ``embed - 1`` characters.
    """
    # len() instead of .shape[0] so plain Python lists are accepted as well.
    one_hot = np.zeros((len(smiles), embed, n_vocab), dtype=np.int8)
    start_idx = char_to_int["$"]
    pad_idx = char_to_int["X"]
    for i, smile in enumerate(smiles):
        one_hot[i, 0, start_idx] = 1
        for j, c in enumerate(smile):
            # +1 because position 0 holds the start token.
            one_hot[i, j + 1, char_to_int[c]] = 1
        one_hot[i, len(smile) + 1:, pad_idx] = 1
    # Inputs drop the last position, targets drop the first one.
    return one_hot[:, 0:-1, :], one_hot[:, 1:, :]

In [None]:
# The train/validation/test split was already made above; here each split is
# vectorized. The whole SMILES dataset is converted to one-hot tensors because
# that is the form the LSTM network is fed with.
# embed is the padded layout length passed to vectorize().
embed = 128
X_train, y_train = vectorize(train, embed, n_vocab)
X_val, y_val = vectorize(val_set, embed, n_vocab)
X_test, y_test = vectorize(test, embed, n_vocab)

In [None]:
# Sanity check: number of positions (time steps) in one encoded test sample.
len(X_test[0])

In [None]:
# LSTM sequence-to-sequence autoencoder.
# Encoder: an LSTM reads the one-hot sequence; its per-step output is
# discarded and only the final hidden and cell states are kept. The two states
# are concatenated and passed through a 64-unit Dense bottleneck.
enc_input = Input(shape=(X_train.shape[1:]))
_, state_h, state_c = LSTM(128, return_state=True)(enc_input)
states = Concatenate(axis=-1)([state_h, state_c])
bottle_neck = Dense(64, activation='relu')(states)

# Decoder: two Dense layers expand the bottleneck back into an initial hidden
# state and an initial cell state for the decoder LSTM, which has its own
# input tensor of the same shape and emits a softmax over the vocabulary at
# every position.
# NOTE(review): later cells address these layers by position in model.layers,
# so the order of the statements here should not be changed.
state_h_decoded = Dense(128, activation='relu')(bottle_neck)
state_c_decoded = Dense(128, activation='relu')(bottle_neck)
encoder_states = [state_h_decoded, state_c_decoded]
dec_input = Input(shape=(X_train.shape[1:]))
dec1 = LSTM(128, return_sequences=True)(dec_input, initial_state=encoder_states)
output = Dense(y_train.shape[2], activation='softmax')(dec1)

# The full training model takes both the encoder and the decoder input.
model = Model(inputs=[enc_input, dec_input], outputs=output)
# model.summary()

In [None]:
# Debug display of the encoder's hidden-state tensor.
state_h

In [None]:
# Compile the model and fix the batch settings used for training.
batch_size = 16
# Floor division: a trailing partial batch is not counted as a step.
steps_per_epoch = len(X_train) // batch_size
# Targets are one-hot per position, hence categorical cross-entropy.
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Data generator for mini-batch training.
class Data_Generator(Sequence):
    """Serve (inputs, targets) mini-batches to ``model.fit``.

    The same input batch is delivered twice, once for the encoder input and
    once for the decoder input of the autoencoder.

    Args:
        input_data: Indexable collection of input samples.
        labels: Indexable collection of targets, aligned with ``input_data``.
        batch_size: Number of samples per batch.
        **kwargs: Forwarded unchanged to the ``Sequence`` base class.
    """

    def __init__(self, input_data, labels, batch_size, **kwargs):
        # Newer Keras releases expect Sequence subclasses to call the base
        # initialiser; with no extra keyword arguments the call is also valid
        # on older releases where the base class defines no __init__.
        super().__init__(**kwargs)
        self.input_data, self.labels = input_data, labels
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.input_data) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``([x, x], y)``."""
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size

        batch_x = np.array(self.input_data[start:stop])
        batch_y = np.array(self.labels[start:stop])

        # Encoder and decoder both receive the same one-hot batch.
        return [batch_x, batch_x], batch_y

In [None]:
# Batch generators for the training and validation splits.
training_generator = Data_Generator(X_train, y_train, batch_size)
validation_generator = Data_Generator(X_val, y_val, batch_size)

In [None]:
# Fit the model.
nb_epochs = 200
# Floor division: a trailing partial validation batch is not counted.
validation_steps = len(X_val) // batch_size
# `use_multiprocessing=False` is intentionally not passed: False is already
# the default in TF 2.x, and newer Keras releases reject the argument in fit().
history = model.fit(
    training_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=nb_epochs,
    verbose=1,
    validation_data=validation_generator,
    validation_steps=validation_steps,
    shuffle=True,
    callbacks=[],
)

In [None]:
# Accuracy and loss curves of the trained LSTM model, side by side.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

# One row per panel: axes, history key, title, y-label, legend position.
panels = [
    (axis1, "acc", 'Model accuracy', 'accuracy', 'lower right'),
    (axis2, "loss", 'Model loss', 'loss', 'upper right'),
]
for ax, metric, title, ylabel, legend_loc in panels:
    ax.plot(history.history[metric], label='Train', linewidth=3)
    ax.plot(history.history[f"val_{metric}"], label='Validation', linewidth=3)
    ax.set_title(title, fontsize=16, color="white")
    ax.set_ylabel(ylabel)
    ax.set_xlabel('epoch')
    ax.legend(loc=legend_loc)
plt.show()

In [None]:
# Encoder sub-model: maps a one-hot SMILES tensor to the 64-unit bottleneck
# (the latent vector). It is built from the tensors created with the
# autoencoder instead of positional lookups in model.layers, which would break
# silently if a layer were ever added or reordered.
encoder_model = Model(inputs=enc_input, outputs=bottle_neck)

In [None]:
# Model that takes a latent vector produced by the encoder model and turns it
# into the hidden state and cell state handed to the decoder model.
# NOTE(review): model.layers[5] and model.layers[6] are presumably the two
# Dense(128) state layers of the autoencoder — confirm with model.summary().
# NOTE(review): state_h / state_c rebind the names used for the encoder LSTM
# states in the model-definition cell.

latent_input = Input(shape=(64, ))
state_h = model.layers[5](latent_input)
state_c = model.layers[6](latent_input)
latent_to_states_model = Model(latent_input, [state_h, state_c])

In [None]:
# This is the part that generates the molecules. The input has batch shape
# (1, 1, n_vocab): one sequence, one character at a time, one-hot encoded over
# the vocabulary. The output is a softmax over the same n_vocab characters.
# The LSTM is stateful so that its state carries over between the
# single-character predict calls.
decoder_inputs = Input(batch_shape=(1, 1, n_vocab))
decoder_lstm = LSTM(128, return_sequences=True, stateful=True)(decoder_inputs)
decoder_outputs = Dense(n_vocab, activation='softmax')(decoder_lstm)
gen_model = Model(decoder_inputs, decoder_outputs)
# Copy the trained weights into the generator's LSTM and Dense layers.
# NOTE(review): model.layers[7] and model.layers[8] are presumably the decoder
# LSTM and the output Dense layer — confirm with model.summary().
for i in range(1,3):
    gen_model.layers[i].set_weights(model.layers[i+6].get_weights())

In [None]:
def sample_with_temp(preds, sampling_temp):
    """Draw one index from ``preds`` after temperature rescaling.

    The probabilities are re-weighted as ``softmax(log(preds) / sampling_temp)``
    and a single index is sampled from the result with ``np.random.choice``.

    Args:
        preds: 1-D array-like of probabilities.
        sampling_temp: Non-zero temperature that divides the log-probabilities.

    Returns:
        The sampled index into ``preds``.
    """
    probs = np.asarray(preds, dtype=np.float64)
    # log(0) = -inf is expected for impossible characters; it simply becomes
    # probability 0 below, so the divide-by-zero warning is silenced.
    with np.errstate(divide="ignore"):
        logits = np.log(probs) / sampling_temp
    # Shift by the maximum before exponentiating. The softmax is unchanged,
    # but exp() can no longer underflow to 0 for every entry at low
    # temperatures, which previously produced 0/0 = NaN probabilities.
    logits -= np.max(logits)
    weights = np.exp(logits)
    return np.random.choice(len(weights), p=weights / np.sum(weights))

In [None]:
def sample_smiles(latent, n_vocab, sampling_temp, max_len=101):
    """Decode one SMILES string from a latent vector.

    The latent vector is converted to decoder states, the stateful generator
    LSTM is initialised with them, and characters are produced one at a time
    starting from the ``"$"`` token until the padding token ``"X"`` is
    predicted or ``max_len`` characters have been produced.

    Args:
        latent: Latent vector passed to ``latent_to_states_model.predict``.
        n_vocab: Size of the one-hot dimension.
        sampling_temp: Temperature for sampling. Exactly ``1.0`` switches to
            greedy decoding (argmax) instead of sampling.
        max_len: Maximum number of characters to generate.

    Returns:
        The generated string, without the start and padding tokens.
    """
    # verbose=0: predict is called once per character, and the default
    # progress bar would flood the cell output.
    states = latent_to_states_model.predict(latent, verbose=0)
    gen_model.layers[1].reset_states(states=[states[0], states[1]])

    # First decoder input: the one-hot start token.
    samplevec = np.zeros((1, 1, n_vocab))
    samplevec[0, 0, char_to_int["$"]] = 1
    chars = []

    for _ in range(max_len):
        preds = gen_model.predict(samplevec, verbose=0)[0][-1]
        if sampling_temp == 1.0:
            sampleidx = np.argmax(preds)
        else:
            sampleidx = sample_with_temp(preds, sampling_temp)
        samplechar = int_to_char[str(sampleidx)]
        if samplechar == "X":
            # Padding token marks the end of the molecule.
            break
        chars.append(samplechar)
        # Feed the chosen character back in as the next input.
        samplevec = np.zeros((1, 1, n_vocab))
        samplevec[0, 0, sampleidx] = 1
    return "".join(chars)

In [None]:
def generate(latent_seed, sampling_temp, scale, quant):
    """Generate SMILES strings around a seed point in the latent space.

    For each of ``quant`` attempts, Gaussian noise scaled by ``scale`` is added
    to ``latent_seed``, a string is decoded from the perturbed vector, and the
    result is kept only if RDKit can parse it and it satisfies
    H-bond donors <= 5, H-bond acceptors <= 10, molecular weight < 500 and
    logP <= 5.

    Args:
        latent_seed: Array of shape ``(1, latent_dim)``.
        sampling_temp: Temperature forwarded to ``sample_smiles``.
        scale: Multiplier applied to the standard-normal noise.
        quant: Number of decoding attempts.

    Returns:
        Tuple ``(mols, samples)``: the accepted RDKit molecules and their
        SMILES strings, in matching order.
    """
    samples, mols = [], []
    for _ in range(quant):
        latent_vec = latent_seed + scale * np.random.randn(latent_seed.shape[1])
        out = sample_smiles(latent_vec, n_vocab, sampling_temp)
        # An empty string would parse to an atom-less molecule that trivially
        # passes every threshold below, so it is rejected up front.
        mol = Chem.MolFromSmiles(out) if out else None
        if mol is None:
            continue
        try:
            # The Lipinski counts are taken from Descriptors, which the
            # notebook imports explicitly, rather than from the implicitly
            # loaded Chem.Lipinski submodule.
            hDonorNum = Descriptors.NumHDonors(mol)
            hAcceptorNum = Descriptors.NumHAcceptors(mol)
            logp = Descriptors.MolLogP(mol)
            mol_weight = Descriptors.MolWt(mol)
        except Exception as err:
            # Report what went wrong instead of swallowing every error
            # (including KeyboardInterrupt) without a trace.
            print(f'başaramadık: {err}')
            continue
        print(f'donor: {hDonorNum}, acceptor: {hAcceptorNum}, logp: {logp}, molweight: {mol_weight}')
        if hDonorNum <= 5 and hAcceptorNum <= 10 and mol_weight < 500 and logp <= 5:
            mols.append(mol)
            samples.append(out)
    return mols, samples

In [None]:
# Encode the training set and use the latent vector of sample 50 as the seed
# around which new molecules are generated.
latent_space = encoder_model.predict(X_train)
latent_seed = latent_space[50:51]
sampling_temp = 0.75
scale = 0.5
quantity = 300
# Pass `quantity` rather than repeating the literal, so changing the setting
# above actually changes the number of attempts.
t_mols, t_smiles = generate(latent_seed, sampling_temp, scale, quantity)


In [None]:
# Draw the accepted molecules as a grid, three per row.
Draw.MolsToGridImage(t_mols, molsPerRow=3, subImgSize=(400, 400))

In [None]:
# Draw only the first accepted molecule (t_mols must not be empty).
Draw.MolsToGridImage([t_mols[0]], molsPerRow=1, subImgSize=(300, 200))

In [None]:
# Show the SMILES strings of the accepted molecules.
t_smiles

In [None]:
def check_lipinski_rule(smiles):
    """Print a SMILES string and its mass if it passes the drug-likeness filter.

    The filter combines Lipinski-style limits (exact molecular weight <= 500,
    H-bond acceptors <= 10, H-bond donors <= 5, logP <= 5) with two extra
    limits applied by this notebook: at most 5 atoms that are neither carbon
    nor hydrogen, and at most 10 rotatable bonds.

    Args:
        smiles: SMILES string to evaluate.

    Returns:
        True if the molecule passes every limit, False if it fails one or if
        the string cannot be parsed.
    """
    # Build the molecule from the SMILES string.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Unparsable input: the descriptor calls below would raise on None.
        return False

    # Compute the descriptors used by the filter.
    molecular_weight = Chem.rdMolDescriptors.CalcExactMolWt(molecule)
    num_heteroatoms = sum(1 for atom in molecule.GetAtoms() if atom.GetSymbol() not in ['C', 'H'])
    num_hb_acceptors = Chem.rdMolDescriptors.CalcNumLipinskiHBA(molecule)
    num_hb_donors = Chem.rdMolDescriptors.CalcNumLipinskiHBD(molecule)
    rotatable_bonds = Chem.rdMolDescriptors.CalcNumRotatableBonds(molecule)
    log_p = Crippen.MolLogP(molecule)

    passes = (
        molecular_weight <= 500
        and num_heteroatoms <= 5
        and num_hb_acceptors <= 10
        and num_hb_donors <= 5
        and rotatable_bonds <= 10
        and log_p <= 5
    )
    # If every limit is met, print the SMILES string and the molecular weight.
    if passes:
        print("SMILES: ", smiles)
        print("Molekül Ağırlığı: ", molecular_weight)
    return passes


In [None]:
# Report every generated molecule that passes the drug-likeness filter.
# A dedicated loop variable is used so that the `smiles` Series created when
# the dataset was loaded is not overwritten by the last generated string.
for generated_smiles in t_smiles:
    check_lipinski_rule(generated_smiles)