In [168]:
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import tqdm
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from functools import lru_cache
from sklearn.metrics import recall_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [169]:
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [170]:
NUM_CHUNKS = 10

# Generate some indices
Even the sparse matrices won't fit in memory. So we will have to loop through them when making predictions or sampling random items.

In [179]:
USE_EMBEDDINGS = True
MODEL = "_r1"
CLASSIFIER = "_rf_reg"

In [180]:
RECEPTOR = "AmpC"
DATA_DIR = "/mnt/efs/AmpC_data"

MODEL_PATH = "/mnt/efs/mol2vec/examples/models/model_300dim.pkl"
UNCOMMON = "UNK"

if USE_EMBEDDINGS:
    OUTPUT_RESULTS_FILE = f"{RECEPTOR}_embedding_results{MODEL}{CLASSIFIER}.csv"
else:
    OUTPUT_RESULTS_FILE = f"{RECEPTOR}_results{CLASSIFIER}.csv"

In [181]:
# count number of items:
indptr = [0]
scores_lst = []

for chunk_id in range(NUM_CHUNKS):
    scores = np.load(f"{DATA_DIR}/{RECEPTOR}_scores_{chunk_id}{MODEL}.npy")
    indptr.append(indptr[-1] + scores.shape[0])
    scores_lst.append(scores)
    
scores = np.concatenate(scores_lst)

In [182]:
scores.shape

(96214206,)

In [183]:
def load_vectors(chunk_id, use_embeddings=True):
    print(f"Loading vectors {chunk_id}{MODEL}", end="; ", flush=True)
    if use_embeddings:
        vectors = np.load(f"{DATA_DIR}/{RECEPTOR}_embeddings_{chunk_id}{MODEL}.npy")
    else:
        vectors = sparse.load_npz(f"{DATA_DIR}/{RECEPTOR}_fingerprints_{chunk_id}.npz")
    return vectors

def extract_vectors(chunk_id, indptr, is_train):
    print(f"Extracting vectors: {chunk_id}", end="; ", flush=True)
    vectors = load_vectors(chunk_id, use_embeddings=USE_EMBEDDINGS)
    mask = is_train[indptr[chunk_id]:indptr[chunk_id+1]]
    return vectors[mask].astype(int)

def build_train(indptr, is_train):
    print("Building training set", end="; ", flush=True)
    if USE_EMBEDDINGS:
        vectors = np.vstack([extract_vectors(i, tuple(indptr), is_train) for i in range(NUM_CHUNKS)])
    else:
        vectors = sparse.vstack([extract_vectors(i, tuple(indptr), is_train) for i in range(NUM_CHUNKS)])  
    return vectors

def chunk_predict_proba(model, indptr, is_train):
    print("Predicting proba", end="; ", flush=True)
    probas = []
    for chunk_id in range(NUM_CHUNKS):
        vectors = extract_vectors(chunk_id, indptr, ~is_train)
        proba = model.predict_proba(vectors)[:,1]
        probas.append(proba)
    return np.concatenate(probas)

def chunk_predict(model, indptr, is_train):
    print("Predicting scores", end="; ", flush=True)
    preds = []
    for chunk_id in range(NUM_CHUNKS):
        vectors = extract_vectors(chunk_id, indptr, ~is_train)
        pred = model.predict(vectors)
        preds.append(pred)
    return np.concatenate(preds)

def chunk_get_mean_and_var(model, indptr, is_train):
    print("Getting mean and var", end="; ", flush=True)
    preds = []
    for chunk_id in range(NUM_CHUNKS):
        vectors = extract_vectors(chunk_id, indptr, ~is_train)
        pred = np.zeros((len(vectors), len(model.estimators_)))
        for j, submodel in enumerate(model.estimators_):
            pred[:, j] = submodel.predict(vectors)
        preds.append(pred)
    preds = np.concatenate(preds)
    return np.mean(preds, axis=1), np.var(preds, axis=1)

In [184]:
def greedy(Y_mean: np.ndarray) -> np.ndarray:
    """Greedy acquisition score
    
    Parameters
    ----------
    Y_mean : np.ndarray
        the mean predicted y values
    Returns
    -------
    np.ndarray
        the greedy acquisition scores
    """
    return Y_mean

def ucb(Y_mean: np.ndarray, Y_var: np.ndarray, beta: int = 2) -> float:
    """Upper confidence bound acquisition score
    Parameters
    ----------
    Y_mean : np.ndarray
    Y_var : np.ndarray
        the variance of the mean predicted y values
    beta : int (Default = 2)
        the number of standard deviations to add to Y_mean
    Returns
    -------
    np.ndarray
        the upper confidence bound acquisition scores
    """
    return Y_mean + beta*np.sqrt(Y_var)

# Train a Logistic Regression models

In [185]:
# model = LogisticRegression(max_iter=10000, C=1)

In [178]:
# model = XGBRegressor(
# #     objective="reg:squaredlogerror"
#     use_label_encoder=False
# )

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=1
)

In [150]:
# model = XGBClassifier(
#     objective="binary:logistic",
#     use_label_encoder=False
# )

In [167]:
TOP_K_THRESHOLD = 50_000
N_QUERIES = 5
N_FOLDS = 3
EPSILON = 0.

In [152]:
# training_set_fractions = [0.004, 0.002, 0.001]
training_set_sizes = [400_000]

# percentile = 0.3

cols = ['Algorithm', 'Training size', 'N ligands explored', '% top-k found']
df = pd.DataFrame(columns=cols)

n_labeled_examples = scores.shape[0]

y_test = (scores.argsort().argsort() < TOP_K_THRESHOLD)

for i in range(N_FOLDS):
    for size in training_set_sizes:
#     for fraction in training_set_fractions:
#         size = int(len(scores) * fraction)
        
        # split indices into train and pool
        all_indices = np.arange(n_labeled_examples)
        train_indices = np.array(random.sample(range(n_labeled_examples+1), k=size))
        pool_indices = np.delete(all_indices, train_indices, axis=0)
        train_indices.sort()
        pool_indices.sort()

        # generate a 'is a training instance' mask. 
        is_train = np.zeros(n_labeled_examples).astype(bool)
        is_train[train_indices] = True

        # Calculate recall
        y_pred = np.zeros(n_labeled_examples).astype(int)
        y_pred[train_indices] = 1
        recall = recall_score(y_true=y_test, y_pred=y_pred)
        
        df = df.append(pd.DataFrame([["morgan_feat", size, train_indices.shape[0], recall]],
                                    columns=cols),
                      ignore_index=True)
        print(f"Iteration: -1, Recall: {recall}")

        # estimate the cutoff once, from the initial random sample:
#         cutoff = np.percentile(scores[train_indices], percentile)

        for i in range(N_QUERIES):
            # fit logreg model
            x_train = build_train(indptr, is_train)
            y_train = scores[is_train]
#             y_train = scores[is_train] < cutoff

            print("Fitting model", end="; ", flush=True)
            model.fit(
                x_train,
                y_train, 
#                 eval_metric="rmsle"
#                 eval_metric="rmse"
            )

            # predict (slowest step) for logreg
#             preds = chunk_predict(model, indptr, is_train)
#             preds = chunk_predict_proba(model, indptr, is_train)
            y_mean, y_var = chunk_get_mean_and_var(model, indptr, is_train)
            utility = ucb(Y_mean=y_mean, Y_var=y_var)
            
            # get some exploration indices
            exploration_indices = np.random.choice(
                np.arange(utility.size), replace=False,
                size=int(size * EPSILON)
            )
            utility[exploration_indices] = np.inf

            # rank the probabilities (negative is better, otherwise we'd have to do (-preds).argsort())
#             preds_sorted = preds.argsort()
#             preds_sorted = (-preds).argsort()
            preds_sorted = utility.argsort()

            # rank the unseen instances
            pool_indices = pool_indices[preds_sorted]

            # now append the next N instances from the rank ordered unseen instances onto the training set
            train_indices = np.concatenate([train_indices, pool_indices[:size]])

            # update the isTrain mask and remove those training instances from the test set
            is_train[train_indices] = True
            pool_indices = pool_indices[size:]

            # keep the train and test idx arrays sorted so they agree with the chunked* methods
            pool_indices.sort()
            train_indices.sort()

            # Calculate recall
            y_pred = np.zeros(n_labeled_examples).astype(int)
            y_pred[train_indices] = 1
            recall = recall_score(y_true=y_test, y_pred=y_pred)
                        
            df = df.append(pd.DataFrame([["morgan_feat", size, train_indices.shape[0], recall]],
                                        columns=cols),
                          ignore_index=True)
            
            print(f"\nIteration: {i+1}, Recall: {recall}")
                        
            df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

Iteration: -1, Recall: 0.00418
Building training set; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; Fitting model; Predicting scores; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; 
Iteration: 

Building training set; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; Fitting model; Predicting scores; Extracting vectors: 0; Loading vectors 0_r1; Extracting vectors: 1; Loading vectors 1_r1; Extracting vectors: 2; Loading vectors 2_r1; Extracting vectors: 3; Loading vectors 3_r1; Extracting vectors: 4; Loading vectors 4_r1; Extracting vectors: 5; Loading vectors 5_r1; Extracting vectors: 6; Loading vectors 6_r1; Extracting vectors: 7; Loading vectors 7_r1; Extracting vectors: 8; Loading vectors 8_r1; Extracting vectors: 9; Loading vectors 9_r1; 
Iteration: 4, Recall: 0.7778
Building trai

KeyboardInterrupt: 

# Results look like this:
And they can be plotted using `./plot_scripts/plot_wholedataset.py`

In [153]:
df0 = pd.read_csv('../processed_data/ampc_reconstruction_0.3_1_.csv', index_col=0)
df0['Algorithm'] = 'AmpC:LogReg (lewis)'

In [154]:
df1 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_embedding_results_r1.csv", index_col=0)
df1['Algorithm'] = 'AmpC:LogReg (embeddings)'

In [155]:
df2 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_embedding_results_r1_xgb.csv", index_col=0)
df2['Algorithm'] = 'AmpC:LogReg (embeddings, xgb)'

In [156]:
df3 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_results_xgb.csv", index_col=0)
df3['Algorithm'] = 'AmpC:LogReg (fps, xgb)'

In [157]:
df33 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_results_xgb_reg.csv", index_col=0)
df33['Algorithm'] = 'AmpC:LogReg (fps, xgb_reg)'

In [158]:
df4 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_results.csv", index_col=0)
df4['Algorithm'] = 'AmpC:LogReg (fps)'
df4.loc[df4["Training size"] == 384_856, "Training size"] = 400_000 
df4.loc[df4["Training size"] == 192_428, "Training size"] = 200_000
df4.loc[df4["Training size"] == 96_214, "Training size"] = 100_000 

In [159]:
df = pd.concat([df0, df1, df2, df3, df33, df4])

In [160]:
prev_results = [['AmpC:RF (Graff)', 400_000, 71.4, 2.1], 
                ['AmpC:NN (Graff)', 400_000, 74.7, 1.4],
                ['AmpC:MPN (Graff)',400_000, 87.9, 2.3],
                ['AmpC:RF (Graff)', 200_000, 45.5, 1.8],
                ['AmpC:NN (Graff)', 200_000, 52.8, 0.5],
                ['AmpC:MPN (Graff)', 200_000, 67.1, 2.1],
                ['AmpC:RF (Graff)', 100_000, 24.0, 2.2],
                ['AmpC:NN (Graff)', 100_000 , 33.3,0.3],
                ['AmpC:MPN (Graff)', 100_000, 52.0, 0.5]]

coley = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])
count = 0 
for res in prev_results:
    desired_std_dev = res[3]
    samples = np.array([-1,0,1]).astype(float)
    samples *= (desired_std_dev/np.std(samples))
    for s in samples:
        coley.loc[count] = [res[0], res[1], res[1]*6, (s+res[2])/100]
        count += 1

In [161]:
concat = pd.concat([df, coley])
concat['% top-k found'] *= 100
concat.columns = ['Algorithm', 'Training set size', 'N ligands explored', '% top-k found']
concat['Training set size'] = concat['Training set size'].apply(lambda num: f"{num:,d}",)
concat['Computation days (single CPU)'] = concat['N ligands explored'] / 60 / 60 /24

In [162]:
error_bars = alt.Chart(concat).mark_errorbar(extent='ci').encode(
  x=alt.X('N ligands explored:Q',title='Number of ligands sampled'),
  y=alt.Y('% top-k found:Q', title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

points = alt.Chart(concat).mark_point(filled=False, size=40, color='black').encode(
  x=alt.X('N ligands explored:Q'),
  y=alt.Y('% top-k found:Q',aggregate='mean',title='% top 50,000 found'),
    color=alt.Color('Algorithm'),
    tooltip=alt.Tooltip('% top-k found:Q',aggregate='mean',title='% top 50,000 found')
)

line = alt.Chart(concat).mark_line(color='black',size=2,opacity=0.5).encode(
  x=alt.X('N ligands explored:Q'),
  y=alt.Y('% top-k found:Q',aggregate='mean',title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

ch = (error_bars+points+line).properties(height=300,width=150).facet(
    column=alt.Column('Training set size:N',sort=alt.Sort([0.004, 0.002, 0.001])),
).resolve_scale(x='independent')
# ch.save('../../figures/active_learning_percentage.html')

In [163]:
ch