In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import tqdm
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from functools import lru_cache

In [2]:
# Select Altair's 'default' renderer for the charts displayed in this notebook.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [3]:
# Number of on-disk chunks; the score and vector files are indexed 0..NUM_CHUNKS-1.
NUM_CHUNKS = 10

# Generate some indices
Even the sparse matrices won't fit in memory, so we have to loop through them chunk by chunk when making predictions or sampling random items.

In [4]:
# Feature switch: True loads the per-chunk embedding .npy files, False the fingerprint .npz files.
USE_EMBEDDINGS = True
# Number of lowest-scoring items that count as "top-k" when measuring how many were found.
TOP_K_THRESHOLD = 50_000
# Suffix appended to the score/embedding file names and to the embedding results file name.
MODEL = "_r1"

In [5]:
# Prefix of every data and results file name.
RECEPTOR = "AmpC"
# Directory holding the per-chunk arrays; the results CSV is written here as well.
DATA_DIR = "/mnt/efs/AmpC_data"

# NOTE(review): the two constants below are not referenced by any visible cell.
MODEL_PATH = "/mnt/efs/mol2vec/examples/models/model_300dim.pkl"
UNCOMMON = "UNK"

# The results file name depends on which feature representation is active;
# only the embedding variant carries the MODEL suffix.
OUTPUT_RESULTS_FILE = (
    f"{RECEPTOR}_embedding_results{MODEL}.csv"
    if USE_EMBEDDINGS
    else f"{RECEPTOR}_results.csv"
)

In [6]:
# Read the score array of every chunk, in chunk order.
scores_lst = [
    np.load(f"{DATA_DIR}/{RECEPTOR}_scores_{chunk_id}{MODEL}.npy")
    for chunk_id in range(NUM_CHUNKS)
]

# indptr[c]:indptr[c + 1] is the slice of the concatenated arrays that belongs to chunk c.
indptr = [0]
for chunk_scores in scores_lst:
    indptr.append(indptr[-1] + chunk_scores.shape[0])

# One flat score vector covering all chunks.
scores = np.concatenate(scores_lst)

In [7]:
# Total number of scored items across all chunks.
scores.shape

(96214206,)

In [8]:
# Callers walk the chunks in order 0..NUM_CHUNKS-1 over and over. With more
# chunks than cache slots an LRU cache never hits on such a scan, so keeping
# four slots only pins four large arrays in memory. Keep four slots when all
# chunks fit in them (every later pass then hits), otherwise a single slot.
_VECTOR_CACHE_SLOTS = 4 if NUM_CHUNKS <= 4 else 1


@lru_cache(maxsize=_VECTOR_CACHE_SLOTS)
def load_vectors(chunk_id):
    """Load the feature matrix of one chunk from DATA_DIR.

    Parameters
    ----------
    chunk_id : int
        Index of the chunk; it is part of the file name.

    Returns
    -------
    The array read with ``np.load`` from the embeddings file when
    USE_EMBEDDINGS is true, otherwise the sparse matrix read with
    ``sparse.load_npz`` from the fingerprints file.

    Notes
    -----
    The result is memoised per ``chunk_id``; the progress message is
    therefore printed only when the file is actually read.
    """
    print(f"Loading vectors {chunk_id}{MODEL}", end="; ", flush=True)
    if USE_EMBEDDINGS:
        vectors = np.load(f"{DATA_DIR}/{RECEPTOR}_embeddings_{chunk_id}{MODEL}.npy")
    else:
        # the fingerprint files carry no MODEL suffix
        vectors = sparse.load_npz(f"{DATA_DIR}/{RECEPTOR}_fingerprints_{chunk_id}.npz")
    return vectors

In [9]:
def extract_vectors(chunk_id, indptr, is_train):
    """Return the rows of chunk ``chunk_id`` selected by a global boolean mask.

    ``indptr[chunk_id]:indptr[chunk_id + 1]`` is the part of ``is_train`` that
    belongs to this chunk; that slice is used to index the chunk's vectors.
    """
    print(f"Extracting vectors: {chunk_id}", end="; ", flush=True)
    chunk = load_vectors(chunk_id)
    start, stop = indptr[chunk_id], indptr[chunk_id + 1]
    return chunk[is_train[start:stop]]

def build_train(indptr, is_train):
    """Stack, in chunk order, the rows flagged by ``is_train`` from every chunk.

    Dense stacking (``np.vstack``) is used when USE_EMBEDDINGS is true,
    sparse stacking (``sparse.vstack``) otherwise.
    """
    print("Building training set", end="; ", flush=True)
    stack = np.vstack if USE_EMBEDDINGS else sparse.vstack
    parts = []
    for chunk_id in range(NUM_CHUNKS):
        parts.append(extract_vectors(chunk_id, tuple(indptr), is_train))
    return stack(parts)

def chunk_predict_proba(model, indptr, is_train):
    """Predict the class-1 probability of every item that is not in the training set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``; column 1 of its output is kept.
    indptr
        Chunk boundaries: ``indptr[c]:indptr[c + 1]`` is chunk ``c``'s slice of ``is_train``.
    is_train
        Boolean mask over all items; rows where it is True are skipped.

    Returns
    -------
    1-D array with one probability per non-training item, in chunk order.
    """
    print("Predicting proba", end="; ", flush=True)
    # Invert the full-length mask once instead of once per chunk.
    is_test = ~is_train
    probas = []
    for chunk_id in range(NUM_CHUNKS):
        vectors = extract_vectors(chunk_id, indptr, is_test)
        if vectors.shape[0] == 0:
            # Every row of this chunk is already in the training set.
            # scikit-learn estimators reject zero-row input, so contribute
            # an empty result and keep the output aligned with the test rows.
            probas.append(np.empty(0))
            continue
        probas.append(model.predict_proba(vectors)[:, 1])
    return np.concatenate(probas)

def chunk_predict(model, indptr, is_train):
    """Predict a negated score for every item that is not in the training set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    indptr
        Chunk boundaries: ``indptr[c]:indptr[c + 1]`` is chunk ``c``'s slice of ``is_train``.
    is_train
        Boolean mask over all items; rows where it is True are skipped.

    Returns
    -------
    1-D array with ``-1 * model.predict(...)`` per non-training item, in chunk
    order. The sign flip puts the best (lowest) predicted scores on top, so the
    result ranks the same way as the output of ``chunk_predict_proba``.
    """
    print("Predicting scores", end="; ", flush=True)
    # Invert the full-length mask once instead of once per chunk.
    is_test = ~is_train
    preds = []
    for chunk_id in range(NUM_CHUNKS):
        vectors = extract_vectors(chunk_id, indptr, is_test)
        if vectors.shape[0] == 0:
            # Every row of this chunk is already in the training set.
            # scikit-learn estimators reject zero-row input, so contribute
            # an empty result and keep the output aligned with the test rows.
            preds.append(np.empty(0))
            continue
        preds.append(-1 * model.predict(vectors))
    return np.concatenate(preds)

# Train a Logistic Regression model

In [12]:
# Single classifier instance; it is refit on the current training set in every round of the loop below.
model = LogisticRegression(max_iter=10000, C=1)

In [13]:
# Boolean mask marking the TOP_K_THRESHOLD lowest-scoring items.
# The first K entries of the sort order are exactly the items whose rank is
# below K, so one argsort plus a scatter gives the same mask as ranking every
# item with a second argsort, without building a full rank array.
top_k = np.zeros(scores.shape[0], dtype=bool)
top_k[np.argsort(scores)[:max(TOP_K_THRESHOLD, 0)]] = True
# Number of top-k items; the denominator of the "% top-k found" column.
total = top_k.sum()

In [14]:
# Iterative screening experiment.
# For each repeat: draw a random initial training set, then for several rounds
# fit the classifier on "score below cutoff", rank all unseen items by predicted
# probability, and move the `size` highest-ranked ones into the training set.
# After every step the fraction of top-k items present in the training set is
# appended to `df` and the table is written to disk.

training_set_fractions = [0.004, 0.002, 0.001]  # only used by the commented-out alternative below
training_set_sizes = [400_000]

# Passed to np.percentile, whose q is on a 0-100 scale:
# 0.3 therefore means the lowest 0.3% of the sampled scores.
percentile = 0.3

N_REPEATS = 3  # independent random restarts of the whole experiment
N_ROUNDS = 5   # model-guided acquisition rounds after the initial random sample

df = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])
count = 0

for repeat in range(N_REPEATS):
    # fresh random permutation of all item indices for this repeat
    idx = np.arange(scores.shape[0])
    np.random.shuffle(idx)

    for size in training_set_sizes:
        # alternative: derive the size from a fraction of the data set
        # for fraction in training_set_fractions:
        #     size = int(len(scores) * fraction)

        # split indices into train and test:
        train_indices = idx[:size].copy()
        test_indices = idx[size:].copy()
        train_indices.sort()
        test_indices.sort()

        # generate a 'is a training instance' mask.
        # Allocated directly as bool: no full-length float64 temporary.
        is_train = np.zeros(scores.shape[0], dtype=bool)
        is_train[train_indices] = True

        # top_k molecules already found in the training set:
        num_found = top_k[train_indices].sum()

        # NOTE(review): the label is 'morgan_feat' regardless of USE_EMBEDDINGS.
        df.loc[count] = ["morgan_feat", size, train_indices.shape[0], num_found/total]
        count += 1
        print(f"Iteration: {count}, Found {num_found} top k ligands")

        # estimate the cutoff once, from the initial random sample:
        cutoff = np.percentile(scores[train_indices], percentile)

        for round_id in range(N_ROUNDS):
            # fit logreg model; rows of x_train and entries of y_train are both
            # in ascending item order, so they line up:
            x_train = build_train(indptr, is_train)
            y_train = scores[is_train] < cutoff

            print("Fitting model", end="; ", flush=True)
            model.fit(x_train, y_train)

            # predict (slowest step) for logreg:
            proba = chunk_predict_proba(model, indptr, is_train)

            # rank the probabilities (highest first)
            proba_sorted = (-proba).argsort()

            # rank the unseen instances; test_indices is sorted at this point,
            # which is the order in which proba was produced:
            test_indices = test_indices[proba_sorted]

            # now append the next N instances from the rank ordered unseen instances onto the training set:
            train_indices = np.concatenate([train_indices, test_indices[:size]])

            # update the isTrain mask and remove those training instances from the test set
            is_train[train_indices] = True
            test_indices = test_indices[size:]

            # keep the train and test idx arrays sorted so they agree with the chunked* methods:
            test_indices.sort()
            train_indices.sort()

            # topK molecules already found in the training set:
            num_found = top_k[train_indices].sum()

            df.loc[count] = ['morgan_feat', size, train_indices.shape[0], num_found/total]
            count += 1

            print(f"\nIteration: {count}, Found {num_found} top k ligands")

            # checkpoint after every round so partial results survive an interruption
            df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

Iteration: 1, Found 209 top k ligands
Building training set; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; Fitting model; Predicting proba; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; 
Itera

Building training set; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; Fitting model; Predicting proba; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; 
Iteration: 11, Found 32477 top k ligands
Bu

# Results look like this:
And they can be plotted using `./plot_scripts/plot_wholedataset.py`

In [None]:
# Results read from the processed_data CSV, relabelled for the comparison plot.
df0 = (
    pd.read_csv('../processed_data/ampc_reconstruction_0.3_1_.csv', index_col=0)
    .assign(Algorithm='AmpC:LogReg (lewis)')
)

In [None]:
# Embedding-based results, relabelled for the comparison plot.
# NOTE(review): this file name has no MODEL suffix, unlike the embedding
# variant of OUTPUT_RESULTS_FILE — confirm it points at the intended run.
df1 = (
    pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_embedding_results.csv", index_col=0)
    .assign(Algorithm='AmpC:LogReg (embeddings)')
)

In [None]:
# Fingerprint-based results, relabelled for the comparison plot.
df2 = (
    pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_results.csv", index_col=0)
    .assign(Algorithm='AmpC:LogReg (fps)')
)

In [None]:
# Stack the three result tables. Note that this rebinds `df`, replacing the
# frame built by the training loop earlier in the notebook.
df = pd.concat([df0, df1, df2])

In [None]:
# Published baseline numbers, one row per (algorithm, training size):
# [algorithm label, training size, mean % top-k found, std dev of that %].
prev_results = [
    ['AmpC:RF (Graff)', 400_000, 71.4, 2.1],
    ['AmpC:NN (Graff)', 400_000, 74.7, 1.4],
    ['AmpC:MPN (Graff)', 400_000, 87.9, 2.3],
    ['AmpC:RF (Graff)', 200_000, 45.5, 1.8],
    ['AmpC:NN (Graff)', 200_000, 52.8, 0.5],
    ['AmpC:MPN (Graff)', 200_000, 67.1, 2.1],
    ['AmpC:RF (Graff)', 100_000, 24.0, 2.2],
    ['AmpC:NN (Graff)', 100_000, 33.3, 0.3],
    ['AmpC:MPN (Graff)', 100_000, 52.0, 0.5],
]

coley = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])

# Each baseline is expanded into three synthetic points (mean - d, mean, mean + d)
# whose population standard deviation equals the reported one, so that the
# plotting code can compute error bars from raw rows.
base_offsets = np.array([-1.0, 0.0, 1.0])
base_std = np.std(base_offsets)

count = 0
for algorithm, train_size, mean_pct, std_pct in prev_results:
    offsets = base_offsets * (std_pct / base_std)
    for offset in offsets:
        # explored ligands = 6 x training size; percentages are stored as fractions
        coley.loc[count] = [algorithm, train_size, train_size * 6, (offset + mean_pct) / 100]
        count += 1

In [None]:
# Combine our results with the baseline table and prepare the frame for plotting.
concat = pd.concat([df, coley])

# stored as fractions -> percentages
concat['% top-k found'] = concat['% top-k found'] * 100

# positional relabelling: the second column becomes 'Training set size'
concat.columns = [
    'Algorithm',
    'Training set size',
    'N ligands explored',
    '% top-k found',
]

# thousands-separated strings (e.g. "400,000") serve as facet labels
concat['Training set size'] = concat['Training set size'].map('{:,d}'.format)

# N ligands explored divided by the number of seconds in a day
concat['Computation days (single CPU)'] = concat['N ligands explored'] / 60 / 60 / 24

In [None]:
# Facet order: largest training set first. 'Training set size' holds formatted
# strings such as "400,000", so the sort order has to be expressed with those
# same strings; a list of numeric fractions matches none of them and has no effect.
size_order = sorted(
    concat['Training set size'].unique(),
    key=lambda label: int(str(label).replace(',', '')),
    reverse=True,
)

# Confidence-interval error bars per algorithm and x position.
error_bars = alt.Chart(concat).mark_errorbar(extent='ci').encode(
    x=alt.X('N ligands explored:Q', title='Number of ligands sampled'),
    y=alt.Y('% top-k found:Q', title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

# Mean value markers, with the mean also shown as a tooltip.
points = alt.Chart(concat).mark_point(filled=False, size=40, color='black').encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title='% top 50,000 found'),
    color=alt.Color('Algorithm'),
    tooltip=alt.Tooltip('% top-k found:Q', aggregate='mean', title='% top 50,000 found')
)

# Line through the mean values of each algorithm.
line = alt.Chart(concat).mark_line(color='black', size=2, opacity=0.5).encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

# One panel per training set size, each with its own x scale.
ch = (error_bars + points + line).properties(height=300, width=150).facet(
    column=alt.Column('Training set size:N', sort=size_order),
).resolve_scale(x='independent')
# ch.save('../../figures/active_learning_percentage.html')

In [None]:
# Display the faceted chart as the cell output.
ch