In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import tqdm
from scipy import sparse
from sklearn.linear_model import LogisticRegression

In [2]:
# Select Altair's 'default' renderer for the charts displayed in this notebook.
alt.renderers.enable('default')

RendererRegistry.enable('default')

# Generate some indices
Even the sparse matrices won't fit in memory, so we will have to loop through them when making predictions or sampling random items.

In [3]:
# Directory the precomputed arrays are read from, and directory result tables are written to.
INPUT_DATA_DIR = "/mnt/efs/enamine"
OUTPUT_DATA_DIR = "../processed_data"

# Dataset name; it prefixes every input and output file name used below.
RECEPTOR = "EnamineHTS"

# Results file for the embedding run. The commented-out alternative is the file name
# that the results section reads back for the fingerprint run.
# OUTPUT_RESULTS_FILE = f"{RECEPTOR}_results.csv"
OUTPUT_RESULTS_FILE = f"{RECEPTOR}_embedding_results.csv"

## Fingerprints

In [4]:
# Disabled alternative input: scores plus fingerprint features stored as a sparse matrix.
# Left commented out because this run uses the embeddings loaded further down instead.
# scores = np.load(f"../processed_data/{RECEPTOR}_scores.npy")
# vectors = sparse.load_npz(f"../processed_data/{RECEPTOR}_fingerprints.npz")

## Mol2Vec (pre-trained) embeddings

In [5]:
# Load the per-molecule scores and the matching embedding matrix (one row per molecule).
scores = np.load(f"{INPUT_DATA_DIR}/{RECEPTOR}_embedding_scores.npy")
vectors = np.load(f"{INPUT_DATA_DIR}/{RECEPTOR}_embeddings.npy")

# Every later step indexes both arrays with the same row indices, so a mismatch here
# would silently pair molecules with the wrong scores. Fail early instead.
assert scores.ndim == 1, f"expected 1-D scores, got shape {scores.shape}"
assert vectors.shape[0] == scores.shape[0], (
    f"{vectors.shape[0]} feature rows but {scores.shape[0]} scores"
)

In [6]:
# Display both shapes; the leading dimensions should agree (one entry per molecule).
scores.shape, vectors.shape

((2104318,), (2104318, 300))

# Train a Logistic Regression model

In [15]:
# Single estimator instance; the active-learning loop below calls fit() on it every round.
model = LogisticRegression(max_iter=10000, C=1)

In [16]:
# Applying argsort twice turns scores into 0-based ranks (rank 0 = lowest score), so the
# comparison yields a boolean mask flagging the 1,000 lowest-scoring molecules.
top_k = np.argsort(np.argsort(scores)) < 1_000

# Number of flagged molecules; used as the denominator for the "fraction found" metric.
total = np.sum(top_k)

In [None]:
# Active-learning benchmark.
#
# For each repeat the library is shuffled once. For each training-set fraction an initial
# random sample of `size` molecules is taken from that shuffle, then the model is refitted
# N_ROUNDS times, each time adding the `size` unseen molecules it ranks highest.
# After every step the fraction of top-k molecules already explored is recorded.

training_set_fractions = [0.004, 0.002, 0.001]

# Passed straight to np.percentile, which works on a 0-100 scale: training molecules whose
# score is below the 0.3rd percentile of the initial random sample form the positive class.
percentile = 0.3

N_REPEATS = 3   # independent shuffles of the library
N_ROUNDS = 5    # model-guided acquisition rounds after the initial random sample
SEED = 0        # fixed so that Restart & Run All reproduces the same shuffles

RESULT_COLUMNS = ['Algorithm', 'Training size', 'N ligands explored', '% top-k found']
results_path = f"{OUTPUT_DATA_DIR}/{OUTPUT_RESULTS_FILE}"

rng = np.random.default_rng(SEED)
n_molecules = scores.shape[0]

# Rows are collected in a plain list and turned into a DataFrame when written out,
# instead of enlarging a DataFrame one row at a time.
records = []
count = 0

for repeat in range(N_REPEATS):
    # One shuffle per repeat; every fraction starts from a prefix of this same shuffle.
    idx = rng.permutation(n_molecules)

    for fraction in training_set_fractions:
        size = int(n_molecules * fraction)

        # Boolean mask of every molecule explored so far. It is the single source of
        # truth for the train/unseen split, so no parallel index arrays need syncing.
        is_train = np.zeros(n_molecules, dtype=bool)
        is_train[idx[:size]] = True

        # estimate the cutoff once, from the initial random sample:
        cutoff = np.percentile(scores[is_train], percentile)

        # Round 0 only records the random baseline; rounds 1..N_ROUNDS fit and acquire.
        for round_number in range(N_ROUNDS + 1):
            if round_number > 0:
                # fit logreg model on everything explored so far:
                model.fit(vectors[is_train], scores[is_train] < cutoff)

                # predict (slowest step) on the unseen molecules. flatnonzero returns
                # ascending indices, so proba[j] belongs to molecule unseen[j].
                unseen = np.flatnonzero(~is_train)
                proba = model.predict_proba(vectors[unseen])[:, 1]

                # move the `size` highest-probability unseen molecules into the training set:
                best = (-proba).argsort()[:size]
                is_train[unseen[best]] = True

            # top-k molecules already found in the training set:
            num_found = top_k[is_train].sum()

            # NOTE(review): the 'morgan_feat' label is kept as-is for compatibility with
            # existing result files, even though OUTPUT_RESULTS_FILE names an embedding
            # run; the results section overwrites this column after reading the CSV.
            records.append(['morgan_feat', size, int(is_train.sum()), num_found / total])
            count += 1
            print(f"Iteration: {count}, Found {num_found} top k ligands")

            # checkpoint after every step so a long run can be inspected or resumed:
            pd.DataFrame(records, columns=RESULT_COLUMNS).to_csv(results_path)

df = pd.DataFrame(records, columns=RESULT_COLUMNS)
df.to_csv(results_path)

Iteration: 1, Found 2 top k ligands
Iteration: 2, Found 362 top k ligands
Iteration: 3, Found 747 top k ligands
Iteration: 4, Found 852 top k ligands
Iteration: 5, Found 903 top k ligands
Iteration: 6, Found 921 top k ligands
Iteration: 7, Found 1 top k ligands
Iteration: 8, Found 207 top k ligands
Iteration: 9, Found 493 top k ligands
Iteration: 10, Found 709 top k ligands
Iteration: 11, Found 793 top k ligands
Iteration: 12, Found 850 top k ligands
Iteration: 13, Found 0 top k ligands
Iteration: 14, Found 167 top k ligands
Iteration: 15, Found 324 top k ligands
Iteration: 16, Found 535 top k ligands
Iteration: 17, Found 637 top k ligands
Iteration: 18, Found 714 top k ligands
Iteration: 19, Found 4 top k ligands
Iteration: 20, Found 527 top k ligands
Iteration: 21, Found 815 top k ligands
Iteration: 22, Found 889 top k ligands
Iteration: 23, Found 910 top k ligands
Iteration: 24, Found 938 top k ligands
Iteration: 25, Found 2 top k ligands
Iteration: 26, Found 261 top k ligands
Itera

# Results look like this:
They can be plotted using `./plot_scripts/plot_wholedataset.py`.

In [None]:
# Results of the embedding run, relabelled with the algorithm name shown in the chart legend.
df1 = (
    pd.read_csv(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_embedding_results.csv", index_col=0)
    .assign(Algorithm='LogReg (embeddings)')
)

In [None]:
# Results of the fingerprint run, relabelled with the algorithm name shown in the chart legend.
df2 = (
    pd.read_csv(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_results.csv", index_col=0)
    .assign(Algorithm='LogReg (fps)')
)

In [None]:
# Stack the embedding-run and fingerprint-run results into one frame.
# Note: this rebinds `df`, the name the training cell used for its own results table.
df = pd.concat([df1, df2])

In [None]:
# Baseline results entered by hand, one entry per (algorithm, training size):
# [algorithm, training size, mean % of top-k found, std-dev of % found].
prev_results = [['RF (Graff)', 8_417, 84.3, 1.1],
                ['NN (Graff)', 8_417, 95.7, 0.1],
                ['MPN (Graff)', 8_417, 97.6, 0.3],
                ['random', 8_417, 2.6, 0.1],
                ['RF (Graff)', 4_208, 72.3, 1.9],
                ['NN (Graff)', 4_208, 88.8, 0.8],
                ['MPN (Graff)', 4_208, 93.3, 0.9],
                ['random', 4_208, 1.3, 0.4],
                ['RF (Graff)', 2_104, 55.8, 4.9],
                ['NN (Graff)', 2_104, 70.5, 1.8],
                ['MPN (Graff)', 2_104, 78.5, 2.2],
                ['random', 2_104, 0.6, 0.2]]

# Each baseline is reported at 6x its training size; this equals the number of ligands
# explored after the initial sample plus five acquisition rounds in the training cell.
N_BATCHES = 6

# Only summary statistics are available for the baselines, but the rows are plotted
# alongside raw per-run results. Each baseline is therefore expanded into three synthetic
# rows whose mean and (population) standard deviation equal the reported values.
unit_offsets = np.array([-1, 0, 1]).astype(float)
unit_std = np.std(unit_offsets)

# Build all rows first and create the frame once, instead of enlarging it row by row.
baseline_rows = [
    [algorithm, train_size, train_size * N_BATCHES, (offset + mean_pct) / 100]
    for algorithm, train_size, mean_pct, std_pct in prev_results
    for offset in unit_offsets * (std_pct / unit_std)
]

# Values are stored as fractions (not percentages) to match the notebook's own results.
coley = pd.DataFrame(
    baseline_rows,
    columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'],
)

In [None]:
# Combine this notebook's results with the hand-entered baselines.
concat = pd.concat([df, coley])

# Convert the found-fraction into a percentage for plotting.
concat['% top-k found'] = concat['% top-k found'] * 100

# Positional relabelling of all four columns; only the second name actually changes.
concat.columns = ['Algorithm', 'Training set size', 'N ligands explored', '% top-k found']

# Render training-set sizes as strings with thousands separators (used as facet labels).
concat['Training set size'] = concat['Training set size'].map('{:,d}'.format)

In [None]:
# Faceted chart: % of the top 1,000 found vs. number of ligands sampled,
# one panel per training-set size, one colour per algorithm.

# Facet order, largest training set first. 'Training set size' holds formatted labels
# (e.g. "8,417"), so the order is derived from those labels. A hard-coded list of
# fractions would match none of the field's values and have no effect on the ordering.
size_order = sorted(
    concat['Training set size'].unique(),
    key=lambda label: int(str(label).replace(',', '')),
    reverse=True,
)

y_title = '% top 1,000 found'

# Confidence-interval error bars computed across the rows sharing an x value.
error_bars = alt.Chart(concat).mark_errorbar(extent='ci').encode(
    x=alt.X('N ligands explored:Q', title='Number of ligands sampled'),
    y=alt.Y('% top-k found:Q', title=y_title),
    color=alt.Color('Algorithm'),
)

# Mean value per x position, drawn as hollow points with a tooltip.
points = alt.Chart(concat).mark_point(filled=False, size=40, color='black').encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
    tooltip=alt.Tooltip('% top-k found:Q', aggregate='mean', title=y_title),
)

# Line connecting the means.
line = alt.Chart(concat).mark_line(color='black', size=2, opacity=0.5).encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
)

# Each panel gets its own x scale because the number of ligands explored differs per size.
ch = (error_bars + points + line).properties(height=300, width=150).facet(
    column=alt.Column('Training set size:N', sort=size_order),
).resolve_scale(x='independent')
# ch.save('../../figures/active_learning_percentage.html')

In [None]:
# Display the faceted chart as this cell's output.
ch