In [1]:
import pandas as pd
import useful_rdkit_utils as uru
from sklearn.decomposition import PCA
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
import string

In [2]:
# Load the telemetry table; pandas infers gzip compression from the ".gz" suffix.
ts_df = pd.read_csv(filepath_or_buffer="ts_telemetry.csv.gz")

In [3]:
# Load the table that carries one SMILES string and one synthon_id per row.
bb_df = pd.read_csv(filepath_or_buffer="bb.csv")

In [4]:
# Preview the first five telemetry rows to check the columns that were read.
ts_df.head(n=5)

Unnamed: 0,location,synthon_id,total_scored,cycle,bootstrap,scale,synthon_count
0,0.0,2734314,1000,1,False,1.0,0
1,0.0,2734354,1000,1,False,1.0,0
2,0.0,2734356,1000,1,False,1.0,0
3,0.0,2734358,1000,1,False,1.0,0
4,0.0,2734360,1000,1,False,1.0,0


In [5]:
# Preview the first five rows of bb_df to check the columns that were read.
bb_df.head(n=5)

Unnamed: 0,smiles,synthon_id,synthon_number,bb_id
0,COC(=O)CC(N)C=1C=CC=CC1Cl,2734334,1,EN300-23255
1,Cl.COC(=O)C(N)CC=1C=CC=CC1,2735030,1,EN300-04537
2,Cl.COC(=O)C=1C=C(CN)C=CC1OC,2735032,1,EN300-08509
3,Cl.COC(=O)CC(N)C=1C=CC(OC)=C(OC)C1,2735036,1,EN300-23095
4,Cl.COC(=O)CC(N)C=1C=CC(OC)=CC1,2735040,1,EN300-30310


In [6]:
# Count how many rows fall under each synthon_number value.
bb_df['synthon_number'].value_counts()

2    15439
1     1158
Name: synthon_number, dtype: int64

In [7]:
# Convert every SMILES string to a fingerprint and keep it in a new 'fp' column
# (one object per row, as returned by uru.smi2numpy_fp).
smiles_col = bb_df['smiles']
bb_df['fp'] = smiles_col.apply(uru.smi2numpy_fp)

In [8]:
# Display the frame to confirm the new 'fp' column is present.
# NOTE(review): this renders the whole frame (pandas truncates the middle rows);
# consider bb_df.head() if only a spot check is needed.
bb_df

Unnamed: 0,smiles,synthon_id,synthon_number,bb_id,fp
0,COC(=O)CC(N)C=1C=CC=CC1Cl,2734334,1,EN300-23255,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Cl.COC(=O)C(N)CC=1C=CC=CC1,2735030,1,EN300-04537,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Cl.COC(=O)C=1C=C(CN)C=CC1OC,2735032,1,EN300-08509,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Cl.COC(=O)CC(N)C=1C=CC(OC)=C(OC)C1,2735036,1,EN300-23095,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Cl.COC(=O)CC(N)C=1C=CC(OC)=CC1,2735040,1,EN300-30310,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
16592,CC(C)(C)OC(=O)N[C@@H]1CC(=C[C@@H]1O)C(=O)O,23045600,2,EN300-20065287,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16593,COC=1C=C(Br)C=2C=C(NC2C1)C(=O)O,23045686,2,EN300-1705936,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16594,Cl.OC(=O)CNC=1C=CC=C(Br)N1,23047342,2,EN300-39923803,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16595,OC(=O)C=1C=CC=2OCCNC2C1,23076168,2,EN300-181145,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Reduce the fingerprints to 50 principal components before running t-SNE.
# random_state is fixed so the projection is reproducible: with the default
# svd_solver='auto', scikit-learn may pick the randomized SVD solver for large
# inputs, and that solver's result depends on the random seed.
pca = PCA(n_components=50, random_state=42)

In [10]:
# Stack the per-row fingerprint arrays into one 2-D matrix (one row per
# bb_df row), then fit the PCA on it and project it in a single step.
fp_matrix = np.stack(bb_df['fp'].tolist())
pcs = pca.fit_transform(fp_matrix)

In [11]:
# Embed the PCA scores in two dimensions with t-SNE.
# t-SNE is stochastic; without a fixed random_state every run produces a
# different layout, so the exported coordinates could not be reproduced.
tsne = TSNE(n_components=2, random_state=42)
tsne_crds = tsne.fit_transform(pcs)

In [12]:
# Split the two-column embedding into separate x / y columns. Transposing and
# calling .tolist() yields one list of plain Python floats per coordinate,
# the same values the row-wise list assignment would store.
bb_df['tsne_x'], bb_df['tsne_y'] = tsne_crds.T.tolist()

In [13]:
# Attach the SMILES, identifiers and t-SNE coordinates from bb_df to every
# telemetry row that shares a synthon_id (pandas' default inner join, so
# rows without a match on either side are dropped).
bb_cols = ['smiles','synthon_id','synthon_number','bb_id','tsne_x','tsne_y']
combo_df = pd.merge(ts_df, bb_df[bb_cols], on='synthon_id')

In [14]:
# Write the merged table to disk for plotting; the row index is not written.
combo_df.to_csv(path_or_buf="ts_tsne_plot_data.csv", index=False)