In [1]:
#Initialization

import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')

import os
import time
from typing import List

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem, DataStructs
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Silence non-critical RDKit warnings to minimize unnecessary outputs
# (the RDKit logger level is raised to CRITICAL, so only messages at that
# level are still emitted).

from rdkit import RDLogger
# Logger handle is kept at module level under the name `lg`.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [3]:
#Compute RDKit Topological Fingerprint

def compute_RDKFingerprint(smiles_list: List[str]):
    """Compute RDKit fingerprints for a collection of SMILES strings.

    Each entry is featurized with ``_compute_single_RDKFingerprint``; entries
    for which that helper returns ``None`` are skipped.

    Args:
        smiles_list: Iterable of SMILES strings. Only positional enumeration
            is used, so a list or a pandas Series both work.

    Returns:
        A ``(fingerprints, keep_idx)`` tuple where ``fingerprints`` is the
        ``np.vstack`` of all successful per-molecule results and ``keep_idx``
        is the list of positions (0-based, in iteration order) of the entries
        that were featurized. The positions are suitable for ``.iloc``.

    Raises:
        ValueError: If no entry could be featurized (including empty input).
    """
    keep_idx = []
    descriptors = []
    for position, smiles in enumerate(smiles_list):
        fingerprint = _compute_single_RDKFingerprint(smiles)
        if fingerprint is not None:
            keep_idx.append(position)
            descriptors.append(fingerprint)

    # np.vstack([]) fails with an opaque "need at least one array" message;
    # raise the same exception type with an explanation instead.
    if not descriptors:
        raise ValueError(
            "No SMILES could be featurized; cannot build a fingerprint "
            "matrix from zero molecules."
        )

    return np.vstack(descriptors), keep_idx

def _compute_single_RDKFingerprint(smiles: str):
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception as E:
        return None

    if mol:
        mol = [mol]
        fp = [Chem.RDKFingerprint(x) for x in mol]
        return np.array(fp)
    
    return None

In [4]:
#Load Data

RKC = pd.read_csv("UMAP_input.csv")

# Compute descriptors once (featurization is the expensive step) and keep
# track of which rows failed to featurize. The result is a tuple of
# (fingerprint matrix, positions of the rows that featurized); the
# RKC_descriptors name is retained for backward compatibility.
RKC_descriptors = compute_RDKFingerprint(RKC["smiles"])
RKC_RDKFingerprint_descriptors, RKC_keep_idx = RKC_descriptors

# Only keep those that successfully featurized. Take an explicit copy so that
# adding columns to RKC later does not write to a slice of the original frame.
RKC = RKC.iloc[RKC_keep_idx].copy()

In [5]:
#Run UMAP Calculation and Export Output as .csv

import csv


# NOTE(review): no random_state is passed, so the embedding is expected to
# differ between runs -- consider fixing a seed if reproducibility matters.
umap_model_global = umap.UMAP(metric = "jaccard",
                               n_neighbors = 50,
                               n_components = 2,
                               low_memory = False,
                               min_dist = 0.005)
X_umap_global = umap_model_global.fit_transform(RKC_RDKFingerprint_descriptors)

# Attach the two embedding coordinates as new columns. assign() builds a new
# frame, so this is safe even if RKC is a slice and is idempotent on re-run.
RKC = RKC.assign(UMAP_0=X_umap_global[:, 0], UMAP_1=X_umap_global[:, 1])

header = ['name', 'smiles', 'UMAP_0', 'UMAP_1']

# Write a conventional CSV: one header row, then one row per compound.
# (Previously each column was written as a single very long row and the
# header was never written, producing a transposed, unlabeled file.)
with open('UMAP_output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(zip(*(RKC[column] for column in header)))

  warn(
