In [41]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

import time
from sklearn.metrics import pairwise_distances
from sklearn.manifold import TSNE

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.ipython_useSVG = True

In [2]:
def read_xy(x_fp, y_fp):
    """Load a feature table and a target table from two CSV files.

    x_fp: path of the feature CSV; read without a header row, first column
          becomes the index.
    y_fp: path of the target CSV; first row is the header, first column
          becomes the index.

    Returns a dict with the two DataFrames under the keys 'x' and 'y'.
    """
    features = pd.read_csv(x_fp, index_col=0, header=None)
    targets = pd.read_csv(y_fp, index_col=0)
    return dict(x=features, y=targets)

In [3]:
def count_y_class(y_df):
    """Count how many rows of ``y_df`` share each label pattern.

    Each row is collapsed into a string key by concatenating ``str()`` of its
    values in column order (a row ``1, 0, 1`` becomes ``'101'``).

    Returns a DataFrame indexed by class key with the columns ``num`` (number
    of rows with that key) and ``n_error`` (initialised to 0).
    """
    def _as_key(row):
        return ''.join(map(str, row))

    keyed = y_df.apply(_as_key, axis=1).to_frame(name='class')
    summary = keyed['class'].value_counts().to_frame(name='num')
    summary['n_error'] = 0
    return summary

In [4]:
def predict_y(x, model):
    """Predict binary labels by thresholding the sigmoid of the model output.

    x: input passed unchanged to ``model.predict``.
    model: object exposing ``predict(x)``; its output is passed through
           ``tf.math.sigmoid``, i.e. it is treated as logits.

    Returns a NumPy array in which entries >= 0.5 are set to 1 and entries
    < 0.5 are set to 0.
    """
    probabilities = tf.math.sigmoid(model.predict(x)).numpy()
    # Both masks are taken before writing; the two conditions are mutually
    # exclusive, so the order of the assignments does not matter.
    is_positive = probabilities >= 0.5
    is_negative = probabilities < 0.5
    probabilities[is_positive] = 1
    probabilities[is_negative] = 0
    return probabilities

In [5]:
def predict_y_reg(x, model):
    """Predict with a regression model and round the output to integers.

    x: input passed unchanged to ``model.predict``.
    model: object exposing ``predict(x)``.

    Returns an integer NumPy array: the raw predictions rounded to the nearest
    integer with ``np.rint`` (ties go to the even integer).
    """
    y_pred_raw = model.predict(x)
    y_pred_int = np.rint(y_pred_raw)
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases; the builtin gives the same dtype.
    return y_pred_int.astype(int)

In [6]:
def count_pred_error_in_class(y, y_pred):
    """Count, per label pattern of ``y``, the rows and the mispredicted rows.

    y: DataFrame of true labels.
    y_pred: DataFrame of predicted labels, labelled identically to ``y``
            (same index and columns) so the two can be compared element-wise.

    A row counts as an error when at least one of its columns differs between
    ``y`` and ``y_pred``. Rows are grouped by the string obtained from
    concatenating ``str()`` of the true values in column order.

    Returns a DataFrame indexed by class key with the columns ``num`` (rows in
    the class) and ``n_error`` (rows of the class that were mispredicted).
    """
    row_is_correct = (y == y_pred).all(axis=1)
    y_class = y.apply(lambda row: ''.join(str(v) for v in row), axis=1).rename('class')

    y_class2num = y_class.value_counts().to_frame(name='num')
    # Select the error rows positionally with the boolean mask. Looking them up
    # again by index label would also pick up correct rows that merely share a
    # label with an error row when the index is not unique.
    error_counts = y_class[~row_is_correct.values].value_counts()
    y_class2num['n_error'] = error_counts.reindex(y_class2num.index, fill_value=0).astype(int)
    return y_class2num

In [7]:
def cal_distance(x, y, metric='euclidean'):
    """Compute the pairwise distance matrix between the rows of ``x`` and ``y``.

    x, y: 2-D array-likes, or pandas Series; a Series is treated as a single
          sample and reshaped to one row.
    metric: metric name forwarded to ``pairwise_distances``.

    Returns whatever ``pairwise_distances(x, y, metric=metric)`` returns.
    """
    # isinstance (rather than an exact type comparison) also accepts Series subclasses.
    if isinstance(x, pd.Series):
        x = x.values.reshape(1, -1)
    if isinstance(y, pd.Series):
        y = y.values.reshape(1, -1)
    return pairwise_distances(x, y, metric=metric)

In [8]:
def print_closest_words(x_embedding, x_query, n=5, add_vec=None):
    """Return the ``n`` rows of ``x_embedding`` closest to a query vector.

    Despite its name, this function prints nothing; it returns the result.

    x_embedding: DataFrame with one embedding vector per row; the row labels
                 are what is reported under 'smiles'.
    x_query: row label of the query vector inside ``x_embedding``.
    n: number of neighbours to return.
    add_vec: optional vector added to the query vector before searching.

    Distances come from ``cal_distance`` with its default metric. Without
    ``add_vec`` the top-ranked hit is skipped (presumably because it is the
    query row itself - verify if the embedding can contain duplicate vectors);
    with ``add_vec`` the ranking starts at the best hit.

    Returns ``{'smiles': [...row labels...], 'dis': [...distances...]}``.
    """
    # Work on a copy so the in-place addition below never touches x_embedding.
    query_vec = x_embedding.loc[x_query].values.reshape(1, -1).copy()
    shifted = add_vec is not None
    if shifted:
        query_vec += add_vec

    dists = cal_distance(x=x_embedding.values, y=query_vec)  # one single-element row per embedding
    ranked = sorted(enumerate(dists), key=lambda pair: pair[1])  # ascending distance

    first = 0 if shifted else 1
    neighbours = ranked[first:first + n]
    all_smiles = [x_embedding.index[pos] for pos, _ in neighbours]
    all_dis = [dist_row[0] for _, dist_row in neighbours]
    return {'smiles': all_smiles, 'dis': all_dis}

In [9]:
def get_minus_result(x_embedding, x, y):
    """Return the difference between two embedding vectors.

    x_embedding: DataFrame with one embedding vector per row.
    x, y: row labels of the minuend and the subtrahend.

    Returns ``embedding[x] - embedding[y]`` as an array with a single row.
    """
    minuend, subtrahend = (
        x_embedding.loc[label].values.reshape(1, -1) for label in (x, y)
    )
    return minuend - subtrahend

In [10]:
def draw_mol_by_smiles(smiles):
    """Render the molecule described by a SMILES string as a 200x200 image.

    Returns the result of ``Draw.MolToImage``.
    """
    return Draw.MolToImage(Chem.MolFromSmiles(smiles), size=(200, 200))

In [11]:
def draw_multiple_mol(smiles_list, mols_per_row=4, file_path=None, legends=None):
    """Draw several molecules in one SVG grid image.

    smiles_list: sequence of SMILES strings, one per grid cell.
    mols_per_row: maximum number of molecules per row; capped at the number
                  of molecules.
    file_path: if truthy, the image's ``data`` attribute is written to this
               path as text.
    legends: optional per-molecule captions forwarded to RDKit.

    Returns the object produced by ``Draw.MolsToGridImage``.
    """
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    grid_options = dict(
        molsPerRow=min(len(smiles_list), mols_per_row),
        subImgSize=(220, 120),
        useSVG=True,
    )
    # Only forward legends when the caller supplied them.
    if legends is not None:
        grid_options['legends'] = legends
    img = Draw.MolsToGridImage(mols, **grid_options)
    if file_path:
        # NOTE(review): relies on img exposing `.data` - confirm this holds outside a notebook session.
        with open(file_path, 'w') as f_handle:
            f_handle.write(img.data)
    return img

In [39]:
def show_each_md(x_reduced, frag_info, file_path=''):
    """Save a grid of scatter plots, one panel per column of ``frag_info``.

    x_reduced: DataFrame of 2-D coordinates (columns ``0`` and ``1``) with the
               fragment as index.
    frag_info: DataFrame with the fragment as index and one column per
               molecular descriptor (MD); each distinct value of a column is
               drawn in its own shade of blue.
    file_path: destination passed to ``savefig``.

    Only index labels present in both frames are plotted. The figure is saved
    and then closed; nothing is returned.
    """
    # Align both frames on their shared labels. An ordered Index is used
    # because recent pandas versions reject a ``set`` as a ``.loc`` indexer;
    # it also makes the drawing order reproducible.
    shared_index = x_reduced.index.unique()
    shared_index = shared_index[shared_index.isin(frag_info.index)]
    x_reduced = x_reduced.loc[shared_index, :].copy()
    frag_info = frag_info.loc[shared_index, :].copy()

    md_names = frag_info.columns.to_list()
    # Keep the original 2x4 layout for up to eight descriptors and add rows
    # (instead of failing with an IndexError) when there are more.
    n_panel_cols = 4
    n_panel_rows = max(2, (len(md_names) + n_panel_cols - 1) // n_panel_cols)
    fig, ax = plt.subplots(n_panel_rows, n_panel_cols, figsize=(24, 6 * n_panel_rows))
    ax = ax.flatten()

    for i, md in enumerate(md_names):
        current_labels = frag_info.iloc[:, i]
        unique_labels = sorted(current_labels.unique())
        palette = sns.color_palette('Blues', len(unique_labels))
        for j, label in enumerate(unique_labels):
            current_nodes = (current_labels == label)
            # A literal colour is given, so no colour-mapping limits
            # (vmin/vmax) are passed: they have no effect without mapped data.
            ax[i].scatter(x_reduced.loc[current_nodes, 0], x_reduced.loc[current_nodes, 1],
                          c=colors.rgb2hex(palette[j]), s=10, label=str(label))
        ax[i].set_title(md, fontsize=12)
        ax[i].legend()

    fig.tight_layout()
    fig.savefig(file_path, bbox_inches='tight', transparent=True)
    plt.close(fig)

In [40]:
def reduce_by_tsne(x):
    """Project ``x`` to two dimensions with t-SNE and report the elapsed time.

    The t-SNE settings are fixed (PCA initialisation, random_state=42,
    4 jobs, 2000 iterations). The shape of the result and the wall-clock
    duration are printed.

    Returns the array produced by ``TSNE.fit_transform``.
    """
    started = time.time()
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
    reducer = TSNE(
        n_components=2,
        n_jobs=4,
        learning_rate=200,
        early_exaggeration=20,
        n_iter=2000,
        random_state=42,
        init='pca',
        verbose=1,
    )
    X_reduced_tsne = reducer.fit_transform(x)
    print(X_reduced_tsne.shape)
    elapsed = time.time() - started
    print("t-SNE took {:.1f}s.".format(elapsed))
    return X_reduced_tsne

In [13]:
# Load the saved Keras regression model (.h5 file, path relative to the working directory).
model = keras.models.load_model('../../big-data/moses_dataset/nn/parallel/model_reg.h5')

In [14]:
# Read the training features (mol2vec vectors) and the regression targets.
xy_train = read_xy(
    x_fp='../../big-data/moses_dataset/nn/parallel/x_training_set_mol2vec.csv',
    y_fp='../../big-data/moses_dataset/nn/parallel/y_train_regression.csv',
)
x_train, y_train = xy_train['x'], xy_train['y']

In [15]:
# Preview the first two rows of the training features.
x_train.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id6,0.23,4.271,-1.86,3.139,2.185,2.498,2.318,1.343,-0.212,1.851,...,7.804,1.818,0.267,3.757,-0.121,3.14,1.682,-2.24,-0.015,-1.227
id8,0.256,8.87,0.412,2.352,3.176,6.292,0.904,1.206,-0.017,-0.431,...,9.534,2.898,2.562,2.606,0.781,3.915,-3.332,-2.733,1.892,-2.017


In [16]:
# Preview the first two rows of the training targets.
y_train.head(2)

Unnamed: 0_level_0,nN,nS,nO,nX,nBondsD,nBondsT,naRing,nARing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id6,1,0,4,1,1,0,1,1
id8,0,0,5,0,1,0,2,0


In [17]:
# Binarise the targets: every count >= 1 becomes 1, smaller values are kept as they are.
y_train_bool = y_train.clip(upper=1)

In [18]:
# Count how many training rows share each binarised target pattern.
y_train_class2num = count_y_class(y_df=y_train_bool)
print(y_train_class2num.shape)
y_train_class2num.head(2)

(98, 2)


Unnamed: 0,num,n_error
11101111,4500,0
10100011,4500,0


In [19]:
# Predict integer targets for the training set and wrap them in a DataFrame
# labelled like y_train so the two frames can be compared element-wise.
y_train_pred_int = predict_y_reg(model=model, x=x_train)
y_train_pred = pd.DataFrame(data=y_train_pred_int, index=y_train.index, columns=y_train.columns)
y_train_pred.head(2)

Unnamed: 0_level_0,nN,nS,nO,nX,nBondsD,nBondsT,naRing,nARing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id6,1,0,4,1,1,0,1,1
id8,0,0,6,0,1,0,2,0


In [20]:
# Binarise the predictions (every count >= 1 becomes 1) and count, per class,
# the rows whose binarised prediction differs from the binarised truth.
y_train_pred_bool = y_train_pred.clip(upper=1)
y_train_class2num2 = count_pred_error_in_class(y=y_train_bool, y_pred=y_train_pred_bool)
y_train_class2num2.head(2)

Unnamed: 0,num,n_error
11101111,4500,44
10100011,4500,215


In [21]:
# Column totals: number of training rows and number of rows with at least one wrong binarised target.
y_train_class2num2.sum()

num        155970
n_error      5527
dtype: int64

In [22]:
# Overall training accuracy, derived from the per-class summary instead of
# totals copied by hand from an earlier output.
train_n_rows = int(y_train_class2num2['num'].sum())
train_n_errors = int(y_train_class2num2['n_error'].sum())
1 - train_n_errors / train_n_rows

0.9645636981470795

### Evaluation on the test set

In [23]:
# Read the test features (mol2vec vectors) and the regression targets.
xy_test = read_xy(
    x_fp='../../big-data/moses_dataset/nn/parallel/x_test_set_mol2vec.csv',
    y_fp='../../big-data/moses_dataset/nn/parallel/y_test_regression.csv',
)
x_test, y_test = xy_test['x'], xy_test['y']

In [24]:
# Preview the first two rows of the test features.
x_test.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id27,0.794,7.313,-1.258,2.849,2.937,3.976,1.416,1.87,0.279,1.028,...,10.392,2.231,1.297,3.92,-0.44,4.393,0.002,-3.069,0.695,-1.967
id39,1.467,3.095,-2.83,2.447,2.14,1.325,2.328,1.323,-0.67,2.027,...,7.12,1.221,0.689,4.305,-0.615,4.696,2.862,-1.267,-0.731,-1.225


In [25]:
# Preview the first two rows of the test targets.
y_test.head(2)

Unnamed: 0_level_0,nN,nS,nO,nX,nBondsD,nBondsT,naRing,nARing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id27,0,0,4,0,2,0,2,0
id39,2,0,2,0,2,1,2,1


In [26]:
# Predict integer targets for the test set and wrap them in a DataFrame
# labelled like y_test. The raw array gets its own name (as in the training
# cell) so that y_test_pred always refers to a DataFrame.
y_test_pred_int = predict_y_reg(x=x_test, model=model)
y_test_pred = pd.DataFrame(data=y_test_pred_int, index=y_test.index, columns=y_test.columns)
y_test_pred.head(2)

Unnamed: 0_level_0,nN,nS,nO,nX,nBondsD,nBondsT,naRing,nARing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id27,0,0,5,0,2,0,2,0
id39,2,0,2,0,1,1,2,1


In [27]:
# Preview the first two rows of the test predictions (repeats the display of the cell that builds y_test_pred).
y_test_pred.head(2)

Unnamed: 0_level_0,nN,nS,nO,nX,nBondsD,nBondsT,naRing,nARing
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id27,0,0,5,0,2,0,2,0
id39,2,0,2,0,1,1,2,1


In [28]:
# Binarise truth and predictions (every count >= 1 becomes 1), then count,
# per class, the rows whose binarised prediction differs from the truth.
y_test_bool = y_test.clip(upper=1)
y_test_pred_bool = y_test_pred.clip(upper=1)
y_test_class2num = count_pred_error_in_class(y=y_test_bool, y_pred=y_test_pred_bool)
y_test_class2num.head(5)

Unnamed: 0,num,n_error
10101010,5000,1412
11101010,5000,177
10111111,5000,264
10101111,5000,295
11000011,3826,17


In [29]:
# Column totals: number of test rows and number of rows with at least one wrong binarised target.
y_test_class2num.sum()

num        48207
n_error     2978
dtype: int64

In [30]:
# Test error rates. The last value is the error rate of the model evaluated
# in this notebook, derived from the per-class summary.
# NOTE(review): 7821 and 6818 are not computed anywhere in this notebook -
# presumably error counts of other models on the same test set; confirm.
test_n_rows = int(y_test_class2num['num'].sum())
test_n_errors = int(y_test_class2num['n_error'].sum())
7821 / test_n_rows, 6818 / test_n_rows, test_n_errors / test_n_rows

(0.16223784927500157, 0.14143174227809238, 0.06177526085423279)

In [31]:
# Flag (1/0) whether each test class also occurs among the training classes.
y_test_class2num['class_in_train'] = (
    y_test_class2num.index.isin(y_train_class2num2.index).astype('int64')
)

In [32]:
# Per-class accuracy = 1 - errors / rows.
# NOTE(review): the column name is misspelled ('accuray'); it ends up in the
# CSV header written by the export cell, so renaming it would change that file.
y_test_class2num['accuray_in_each_class'] = 1 - y_test_class2num['n_error'] / y_test_class2num['num']

In [33]:
# Show the first rows of the per-class test summary.
y_test_class2num.head()

Unnamed: 0,num,n_error,class_in_train,accuray_in_each_class
10101010,5000,1412,0,0.7176
11101010,5000,177,0,0.9646
10111111,5000,264,0,0.9472
10101111,5000,295,0,0.941
11000011,3826,17,0,0.995557


In [34]:
# Save the per-class test summary as CSV in the current working directory.
y_test_class2num.to_csv('y_test_class2num_reg_model_parallel.csv')

### Embedding vectors

In [35]:
# Load the saved 'm_part1_reg.h5' model; its predict() output is used below as the embedding of the input features.
m_part1 = keras.models.load_model('../../big-data/moses_dataset/nn/parallel/m_part1_reg.h5')
m_part1



<tensorflow.python.keras.engine.sequential.Sequential at 0x7fc7ac44f4a8>

In [36]:
# Check the shape of the test feature matrix before embedding it.
x_test.shape

(48207, 100)

In [37]:
# Compute the embedding of every test sample with m_part1 and check the returned type.
x_test_embedding = m_part1.predict(x_test)
type(x_test_embedding)

numpy.ndarray

In [38]:
# Wrap the embedding array in a DataFrame indexed like x_test.
# Note: this rebinds x_test_embedding from the raw array to a DataFrame.
x_test_embedding = pd.DataFrame(data=x_test_embedding, index=x_test.index)
x_test_embedding.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id27,-1.750638,6.183736,-1.745564,0.96638,4.862798,-1.288332,1.058993,-1.56888,-0.977151,-1.75701,...,-1.15665,0.49242,2.326666,-1.268212,-1.758076,-1.41064,0.477689,-1.251694,9.220509,-1.69994
id39,0.934497,4.012554,-1.757697,1.656628,0.929729,-0.719724,-1.235421,-1.726944,-0.736592,-1.758037,...,-0.635071,4.146644,2.500072,-0.722488,-1.647596,-1.593054,-0.208207,-1.661613,2.194536,3.574096


#### Visualize molecule vectors built from the raw fragment vectors

In [42]:
# Project the raw test features to 2-D with t-SNE.
x_test_reduced_raw = reduce_by_tsne(x=x_test)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 48207 samples in 0.739s...
[t-SNE] Computed neighbors for 48207 samples in 85.626s...
[t-SNE] Computed conditional probabilities for sample 1000 / 48207
[t-SNE] Computed conditional probabilities for sample 2000 / 48207
[t-SNE] Computed conditional probabilities for sample 3000 / 48207
[t-SNE] Computed conditional probabilities for sample 4000 / 48207
[t-SNE] Computed conditional probabilities for sample 5000 / 48207
[t-SNE] Computed conditional probabilities for sample 6000 / 48207
[t-SNE] Computed conditional probabilities for sample 7000 / 48207
[t-SNE] Computed conditional probabilities for sample 8000 / 48207
[t-SNE] Computed conditional probabilities for sample 9000 / 48207
[t-SNE] Computed conditional probabilities for sample 10000 / 48207
[t-SNE] Computed conditional probabilities for sample 11000 / 48207
[t-SNE] Computed conditional probabilities for sample 12000 / 48207
[t-SNE] Computed conditional probabilities for sa

In [45]:
# Wrap the 2-D coordinates in a DataFrame indexed like x_test (columns 0 and 1).
x_test_reduced_raw_df = pd.DataFrame(data=x_test_reduced_raw, index=x_test.index)
x_test_reduced_raw_df.head(2)

Unnamed: 0_level_0,0,1
0,Unnamed: 1_level_1,Unnamed: 2_level_1
id27,-116.88192,-36.252781
id39,-5.743494,-45.813259


In [46]:
# Plot the raw-feature t-SNE layout, one panel per column of y_test, and save it as a PNG.
# NOTE(review): assumes the ./images directory already exists - confirm.
show_each_md(x_reduced=x_test_reduced_raw_df, frag_info=y_test, file_path='./images/x_test_t-sne_raw_frag2vec.png')

In [47]:
# Project the m_part1 embeddings of the test set to 2-D with t-SNE.
x_test_reduced_after_trained = reduce_by_tsne(x_test_embedding)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 48207 samples in 0.259s...
[t-SNE] Computed neighbors for 48207 samples in 36.777s...
[t-SNE] Computed conditional probabilities for sample 1000 / 48207
[t-SNE] Computed conditional probabilities for sample 2000 / 48207
[t-SNE] Computed conditional probabilities for sample 3000 / 48207
[t-SNE] Computed conditional probabilities for sample 4000 / 48207
[t-SNE] Computed conditional probabilities for sample 5000 / 48207
[t-SNE] Computed conditional probabilities for sample 6000 / 48207
[t-SNE] Computed conditional probabilities for sample 7000 / 48207
[t-SNE] Computed conditional probabilities for sample 8000 / 48207
[t-SNE] Computed conditional probabilities for sample 9000 / 48207
[t-SNE] Computed conditional probabilities for sample 10000 / 48207
[t-SNE] Computed conditional probabilities for sample 11000 / 48207
[t-SNE] Computed conditional probabilities for sample 12000 / 48207
[t-SNE] Computed conditional probabilities for sa

In [48]:
# Wrap the 2-D coordinates in a DataFrame indexed like x_test_embedding (columns 0 and 1).
x_test_reduced_after_trained_df = pd.DataFrame(data=x_test_reduced_after_trained, index=x_test_embedding.index)
x_test_reduced_after_trained_df.head(2)

Unnamed: 0_level_0,0,1
0,Unnamed: 1_level_1,Unnamed: 2_level_1
id27,-15.629355,47.694611
id39,1.9182,-51.893841


In [49]:
# Plot the t-SNE layout of the m_part1 embeddings, one panel per column of y_test, and save it as a PNG.
show_each_md(
    x_reduced=x_test_reduced_after_trained_df,
    frag_info=y_test,
    file_path='./images/x_test_t-sne_after_trained_frag2vec.png',
)

In [50]:
# Stack the training and test features row-wise (training rows first).
# pd.concat replaces DataFrame.append, which no longer exists in current pandas.
x_train_test = pd.concat([x_train, x_test])
x_train_test.shape

(204177, 100)

In [51]:
# Compute the m_part1 embedding of every training and test sample.
x_train_test_embedding = m_part1.predict(x_train_test)
x_train_test_embedding.shape

(204177, 30)

In [53]:
# Label the embeddings with the sample ids and save them as CSV in the current working directory.
x_train_test_embedding_df = pd.DataFrame(data=x_train_test_embedding, index=x_train_test.index)
x_train_test_embedding_df.to_csv('all_x_after_trained_parallel_model.csv')