In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import tqdm
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from functools import lru_cache
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import recall_score
from xgboost import XGBClassifier, XGBRegressor
from apricot import FeatureBasedSelection

In [2]:
# Select Altair's 'default' renderer for the charts displayed later in the notebook.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [3]:
# Number of on-disk chunks; chunk ids 0 .. NUM_CHUNKS-1 are iterated wherever scores or vectors are loaded.
NUM_CHUNKS = 1

# Generate some indices
Even the sparse matrices won't fit in memory, so we have to loop through them chunk by chunk when making predictions or sampling random items.

In [4]:
# Feature switch: True reads the per-chunk embedding arrays (.npy), False reads the sparse fingerprint matrices (.npz).
USE_EMBEDDINGS = False

In [5]:
# Dataset identifier and the directory that holds its chunked score / vector files.
RECEPTOR = "EnamineHTS"
DATA_DIR = "/mnt/efs/enamine"

# Each feature type writes to its own CSV so embedding and fingerprint runs do not overwrite each other.
OUTPUT_RESULTS_FILE = (
    f"{RECEPTOR}_embedding_results.csv" if USE_EMBEDDINGS else f"{RECEPTOR}_results.csv"
)

In [6]:
# Load the scores of every chunk, then record cumulative row offsets:
# chunk i occupies rows indptr[i]:indptr[i + 1] of the concatenated `scores` array.
scores_lst = [
    np.load(f"{DATA_DIR}/{RECEPTOR}_scores_{chunk_id}.npy")
    for chunk_id in range(NUM_CHUNKS)
]

indptr = [0]
for chunk_scores in scores_lst:
    indptr.append(indptr[-1] + chunk_scores.shape[0])

scores = np.concatenate(scores_lst)

In [7]:
# Sanity check: total number of scores loaded across all chunks.
scores.shape

(2104318,)

In [13]:
def load_vectors(chunk_id, use_embeddings=True):
    """Load the feature matrix of a single chunk from ``DATA_DIR``.

    Parameters
    ----------
    chunk_id : int
        Suffix of the chunk file to read.
    use_embeddings : bool, default True
        When true, ``{RECEPTOR}_embeddings_{chunk_id}.npy`` is read with
        ``np.load``; otherwise ``{RECEPTOR}_fingerprints_{chunk_id}.npz`` is
        read with ``scipy.sparse.load_npz``.

    Returns
    -------
    The loaded object, returned unmodified.
    """
    print("Loading vectors", end="; ", flush=True)
    if not use_embeddings:
        return sparse.load_npz(f"{DATA_DIR}/{RECEPTOR}_fingerprints_{chunk_id}.npz")
    return np.load(f"{DATA_DIR}/{RECEPTOR}_embeddings_{chunk_id}.npy")

def extract_vectors(chunk_id, indptr, is_train):
    """Return the rows of one chunk selected by a global boolean mask, cast to int.

    Parameters
    ----------
    chunk_id : int
        Chunk to load (the feature type follows the global ``USE_EMBEDDINGS``).
    indptr : sequence of int
        Cumulative row offsets; chunk ``i`` covers ``indptr[i]:indptr[i + 1]``
        of the global mask.
    is_train : boolean array
        Mask over all rows of all chunks; only the slice belonging to this
        chunk is applied. Callers pass the inverted mask to get the pool rows.

    Returns
    -------
    The selected rows of the chunk, converted with ``astype(int)``.
    """
    print(f"Extracting vectors: {chunk_id}", end="; ", flush=True)
    chunk_vectors = load_vectors(chunk_id, use_embeddings=USE_EMBEDDINGS)
    start, stop = indptr[chunk_id], indptr[chunk_id + 1]
    row_mask = is_train[start:stop]
    selected = chunk_vectors[row_mask]
    # NOTE(review): the integer cast is applied for embeddings too -- confirm the
    # embeddings are integer-valued, otherwise their fractional part is dropped here.
    return selected.astype(int)

def build_train(indptr, is_train):
    """Stack the masked rows of every chunk into a single training matrix.

    Parameters
    ----------
    indptr : sequence of int
        Cumulative row offsets of the chunks.
    is_train : boolean array
        Mask over all rows; True marks rows that belong to the training set.

    Returns
    -------
    The per-chunk selections stacked vertically, with ``np.vstack`` when
    ``USE_EMBEDDINGS`` is set and ``scipy.sparse.vstack`` otherwise.
    """
    print("Building training set", end="; ", flush=True)
    stack = np.vstack if USE_EMBEDDINGS else sparse.vstack
    return stack(
        [extract_vectors(chunk, tuple(indptr), is_train) for chunk in range(NUM_CHUNKS)]
    )

def chunk_predict_proba(model, indptr, is_train):
    """Predict the second-column (index 1) probability for every non-training row.

    Chunks are processed one at a time; within each chunk only the rows where
    ``is_train`` is False are passed to ``model.predict_proba``.

    Parameters
    ----------
    model : object exposing ``predict_proba``
    indptr : sequence of int
        Cumulative row offsets of the chunks.
    is_train : boolean array
        Mask over all rows; rows marked True are skipped.

    Returns
    -------
    1-D array of probabilities, concatenated in chunk order.
    """
    print("Predicting proba", end="; ", flush=True)
    pool_mask = ~is_train
    per_chunk = [
        model.predict_proba(extract_vectors(chunk, indptr, pool_mask))[:, 1]
        for chunk in range(NUM_CHUNKS)
    ]
    return np.concatenate(per_chunk)

def chunk_predict(model, indptr, is_train):
    """Run ``model.predict`` on every non-training row, one chunk at a time.

    Parameters
    ----------
    model : object exposing ``predict``
    indptr : sequence of int
        Cumulative row offsets of the chunks.
    is_train : boolean array
        Mask over all rows; rows marked True are skipped.

    Returns
    -------
    Array of predictions, concatenated in chunk order (i.e. in the same order
    as the False entries of ``is_train``).
    """
    print("Predicting scores", end="; ", flush=True)
    pool_mask = ~is_train
    per_chunk = [
        model.predict(extract_vectors(chunk, indptr, pool_mask))
        for chunk in range(NUM_CHUNKS)
    ]
    return np.concatenate(per_chunk)

def select_train_indices(n_samples, total_samples, method="random"):
    """Choose the row indices that form the initial training set.

    Parameters
    ----------
    n_samples : int
        Number of indices to select.
    total_samples : int
        Size of the full data set; valid indices are ``0 .. total_samples - 1``.
    method : str, default "random"
        ``"random"`` draws uniformly at random; any other value is forwarded as
        ``concave_func`` to apricot's ``FeatureBasedSelection``, fitted on the
        vectors of all chunks.

    Returns
    -------
    Array of selected row indices.

    Raises
    ------
    ValueError
        For ``method="random"`` when ``n_samples > total_samples``.
    """
    print(f"Selecting train indices with method: {method}", end="; ", flush=True)
    if method == "random":
        # Draw distinct indices from [0, total_samples). The earlier
        # randint(0, total_samples + 1) call could return the out-of-range
        # index `total_samples` and could repeat indices, which inflated the
        # reported number of explored items.
        return np.random.choice(total_samples, size=n_samples, replace=False)

    stack = np.vstack if USE_EMBEDDINGS else sparse.vstack
    # Each chunk is cast to int before stacking, so no second cast is needed afterwards.
    vectors = stack(
        [
            load_vectors(chunk_id=i, use_embeddings=USE_EMBEDDINGS).astype(int)
            for i in range(NUM_CHUNKS)
        ]
    )
    selector = FeatureBasedSelection(n_samples, concave_func=method, verbose=True)
    selector.fit(vectors)
    return selector.ranking

# Train a model (XGBoost regressor; the logistic-regression alternative is commented out)

In [None]:
# model = LogisticRegression(max_iter=10000, C=1)

In [None]:
# Regressor that is re-fitted on the training scores in every active-learning round.
# Alternative objective kept for reference: objective="reg:squaredlogerror"
# NOTE(review): `use_label_encoder` looks like a classifier-oriented option -- confirm it is needed for a regressor.
xgb_params = {"use_label_encoder": False}
model = XGBRegressor(**xgb_params)

In [None]:
# Experiment settings.
# Rows whose score rank (ascending) is below TOP_K_THRESHOLD are the "top-k" hits whose recall is tracked.
TOP_K_THRESHOLD = 1_000
# Number of model-guided acquisition rounds run after the initial selection.
N_QUERIES = 6
# Number of times the whole experiment is repeated.
N_FOLDS = 4
# Selection method names; presumably the values accepted by `select_train_indices(method=...)` -- this list is not referenced elsewhere in the visible code.
CONC_METHODS = ["log", "sigmoid", "sqrt", "random"]

In [None]:
# Active-learning experiment.
#
# For every fold and training-set fraction:
#   1. pick an initial training set with `select_train_indices`,
#   2. for N_QUERIES rounds: fit `model` on the current training rows, predict the
#      score of every pool row, and move the `size` best-ranked pool rows into the
#      training set,
#   3. after each step record the recall of the top-k rows among everything explored.
#
# Results are collected as plain rows and turned into a DataFrame when saving:
# the frame was previously never initialised (NameError on a fresh kernel) and was
# grown with `DataFrame.append`, which no longer exists in pandas >= 2.0.

# training_set_fractions = [0.004, 0.002, 0.001]
training_set_fractions = [0.004]

cols = ['Algorithm', 'Training size', 'N ligands explored', '% top-k found']
results = []

n_labeled_examples = scores.shape[0]
all_indices = np.arange(n_labeled_examples)

# True for the TOP_K_THRESHOLD rows with the lowest scores (rank 0 = lowest score).
y_test = (scores.argsort().argsort() < TOP_K_THRESHOLD)

conc_method = "sqrt"

for i_fold in range(N_FOLDS):
    for fraction in training_set_fractions:
        size = int(n_labeled_examples * fraction)

        # split indices into train and pool
        train_indices = select_train_indices(n_samples=size, total_samples=n_labeled_examples, method=conc_method)
        pool_indices = np.delete(all_indices, train_indices, axis=0)
        train_indices = np.sort(train_indices)
        pool_indices.sort()

        # boolean "is a training instance" mask over all rows
        is_train = np.zeros(n_labeled_examples, dtype=bool)
        is_train[train_indices] = True

        # Recall of the top-k rows among the rows explored so far.
        y_pred = np.zeros(n_labeled_examples)
        y_pred[train_indices] = 1
        recall = recall_score(y_true=y_test, y_pred=y_pred)

        results.append([conc_method, size, train_indices.shape[0], recall])
        print(f"Iteration: -1, Recall: {recall}")

        for i_query in range(N_QUERIES):
            # fit the model on everything explored so far
            x_train = build_train(indptr, is_train)
            y_train = scores[is_train]

            print("Fitting model", end="; ", flush=True)
            # `eval_metric` is no longer passed to fit(): recent XGBoost releases
            # reject it there, and without an eval_set it never influenced training.
            model.fit(x_train, y_train)

            # predict the pool (slowest step); predictions come back in the order of
            # the False entries of `is_train`, which is the order of the sorted pool
            preds = chunk_predict(model, indptr, is_train)

            # lower predicted score is better, so an ascending argsort ranks best first
            ranked_pool = pool_indices[preds.argsort()]

            # move the `size` best-ranked pool rows into the training set
            train_indices = np.concatenate([train_indices, ranked_pool[:size]])
            is_train[train_indices] = True
            pool_indices = ranked_pool[size:]

            # keep both index arrays sorted so they agree with the chunked helpers
            pool_indices.sort()
            train_indices.sort()

            # Recall of the top-k rows among the rows explored so far.
            y_pred[train_indices] = 1
            recall = recall_score(y_true=y_test, y_pred=y_pred)

            results.append([conc_method, size, train_indices.shape[0], recall])
            print(f"\nIteration: {i_query+1}, Recall: {recall}")

            # checkpoint after every round so an interrupted run keeps its results
            df = pd.DataFrame(results, columns=cols)
            df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

df = pd.DataFrame(results, columns=cols)
df.to_csv(f"{DATA_DIR}/{OUTPUT_RESULTS_FILE}")

# Results look like this:
And they can be plotted using `./plot_scripts/plot_wholedataset.py`

In [None]:
# Results of the embedding-based run, relabelled for the comparison plot.
df1 = (
    pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_embedding_results.csv", index_col=0)
    .assign(Algorithm='LogReg (embeddings)')
)

In [None]:
# Results of the fingerprint-based run; the 'Algorithm' labels stored in the CSV are kept as written.
df2 = pd.read_csv(f"{DATA_DIR}/{RECEPTOR}_results.csv", index_col=0)
# df2['Algorithm'] = 'LogReg (fps)'

In [None]:
# Combine both result sets; row indices are not reset, so index labels may repeat.
df = pd.concat([df1, df2])

In [None]:
# Reference results to compare against, one entry per algorithm and training-set size:
# [algorithm, training-set size, mean % top-k found, std-dev of that percentage].
prev_results = [['RF (Graff)', 8_417, 84.3, 1.1], 
                ['NN (Graff)', 8_417, 95.7, 0.1],
                ['MPN (Graff)',8_417, 97.6, 0.3],
                ['random', 8_417, 2.6, 0.1],
                ['RF (Graff)', 4_208, 72.3, 1.9],
                ['NN (Graff)', 4_208, 88.8, 0.8],
                ['MPN (Graff)', 4_208, 93.3, 0.9],
                ['random', 4_208, 1.3, 0.4],
                ['RF (Graff)', 2_104, 55.8, 4.9],
                ['NN (Graff)', 2_104 , 70.5, 1.8],
                ['MPN (Graff)', 2_104, 78.5, 2.2],
                ['random', 2_104, 0.6, 0.2]]

# Only a mean and a std-dev are available per entry, so each one is expanded into
# three synthetic observations (mean - d, mean, mean + d) whose population std-dev
# equals the reported value. The rows are collected first and the frame is built
# once, instead of enlarging it row by row with `.loc`.
unit_offsets = np.array([-1, 0, 1]).astype(float)

coley_rows = []
for algorithm, train_size, mean_pct, std_pct in prev_results:
    offsets = unit_offsets * (std_pct / np.std(unit_offsets))
    for offset in offsets:
        # NOTE(review): 'N ligands explored' is taken as 6x the training-set size -- confirm the factor.
        coley_rows.append([algorithm, train_size, train_size * 6, (offset + mean_pct) / 100])

coley = pd.DataFrame(
    coley_rows,
    columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'],
)

In [None]:
# Frame used for plotting: own results plus the reference results, with the
# recall expressed in percent and the training-set size rendered as a label.
plot_columns = ['Algorithm', 'Training set size', 'N ligands explored', '% top-k found']

concat = pd.concat([df, coley])
concat['% top-k found'] = concat['% top-k found'] * 100
concat.columns = plot_columns
# Thousands separators turn the size into a readable categorical label.
concat['Training set size'] = [f"{num:,d}" for num in concat['Training set size']]

In [None]:
# Layered chart (error bars + mean points + mean line) of the percentage of top-k
# rows found versus the number of ligands explored, faceted by training-set size.
y_title = '% top 1,000 found'
base = alt.Chart(concat)

error_bars = base.mark_errorbar(extent='ci').encode(
    x=alt.X('N ligands explored:Q', title='Number of ligands sampled'),
    y=alt.Y('% top-k found:Q', title=y_title),
    color=alt.Color('Algorithm'),
)

points = base.mark_point(filled=False, size=40, color='black').encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
    tooltip=alt.Tooltip('% top-k found:Q', aggregate='mean', title=y_title),
)

line = base.mark_line(color='black', size=2, opacity=0.5).encode(
    x=alt.X('N ligands explored:Q'),
    y=alt.Y('% top-k found:Q', aggregate='mean', title=y_title),
    color=alt.Color('Algorithm'),
)

# 'Training set size' holds formatted labels such as "8,417", so the facet order has
# to be given in those same values; the previous list of fractions (0.004, ...) never
# matched any label. Largest training set first, as before.
size_order = sorted(
    concat['Training set size'].unique(),
    key=lambda label: int(str(label).replace(',', '')),
    reverse=True,
)

ch = (error_bars + points + line).properties(height=300, width=150).facet(
    column=alt.Column('Training set size:N', sort=alt.Sort(size_order)),
).resolve_scale(x='independent')
# ch.save('../../figures/active_learning_percentage.html')

In [None]:
# Display the faceted chart.
ch

# PCA

In [None]:
# Load the sparse fingerprints of chunk 0 only; the projection below is computed on this chunk.
vectors = load_vectors(chunk_id=0, use_embeddings=False)

In [None]:
# Class labels used to colour the projection: True for rows scoring below `cutoff`.
# `cutoff` was not defined anywhere in the notebook, so it is derived here as the
# score at rank TOP_K_THRESHOLD (ascending), which makes `score < cutoff` mark the
# same top-k rows that the recall metric targets (up to ties at the boundary).
# NOTE(review): confirm this is the intended cutoff.
kth = min(TOP_K_THRESHOLD, scores.shape[0] - 1)
cutoff = np.sort(scores)[kth]
# Only chunk 0 is loaded into `vectors`, so restrict the labels to that chunk's rows.
classes = scores[indptr[0]:indptr[1]] < cutoff

In [None]:
# Project the vectors onto two components. TruncatedSVD is used in place of
# PCA(n_components=2, random_state=42); the variable keeps the name `pca`.
svd_params = dict(n_components=2, random_state=42)
pca = TruncatedSVD(**svd_params)
transformed_vectors = pca.fit_transform(vectors)

# Split the projection into the two coordinates needed for plotting.
x_component = transformed_vectors[:, 0]
y_component = transformed_vectors[:, 1]

In [None]:
def plot_pca(x_component, y_component, classes):
    """Scatter-plot a two-component projection, coloured by class.

    Parameters
    ----------
    x_component, y_component : array-like
        Coordinates of each point along the first and second component.
    classes : array-like
        Per-point values passed to the scatter plot as colours.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    fig = plt.figure(figsize=(8.5, 6), dpi=130)
    ax = fig.gca()
    ax.scatter(x_component, y_component, c=classes, s=5, alpha=0.5)
    ax.set_title('Ligands after PCA transformation')
    plt.show()

In [None]:
# Draw the two-component projection, coloured by the boolean class labels.
plot_pca(x_component=x_component, y_component=y_component, classes=classes)