In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from scipy import sparse
from sklearn.linear_model import LogisticRegression

# Generate some indices
Even the sparse matrices won't fit in memory. So we will have to loop through them when making predictions or sampling random items.

In [52]:
RECEPTOR = "FEN1"
OUTPUT_DATA_DIR = "../processed_data"

In [40]:
scores = np.load(f"../processed_data/{RECEPTOR}_scores.npy")

In [50]:
fingerprints = sparse.load_npz(f"../processed_data/{RECEPTOR}_fingerprints.npz")

In [51]:
scores.shape, fingerprints.shape

((355771,), (355771, 8192))

In [62]:
scores.sum(), len(scores)-scores.sum()

(369.0, 355402.0)

# Train a Logistic Regression models

In [6]:
model = LogisticRegression(max_iter=10000, C=1)

In [46]:
top_k = scores
total = scores.sum()

# With Altair, using three repeats:

In [60]:
training_set_sizes = [1_000, 5_000, 10_000]

percentile = 0.3

df = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])
count = 0

for i in range(3):
    idx = np.arange(scores.shape[0])
    np.random.shuffle(idx)

    for size in training_set_sizes:
        # split indices into train and test:
        train_indices = idx[:size].copy()
        test_indices = idx[size:].copy()
        train_indices.sort()
        test_indices.sort()

        # generate a 'is a training instance' mask. 
        is_train = np.zeros(scores.shape[0]).astype(bool)
        is_train[train_indices] = True

        # top_k molecules already found in the training set:
        num_found = top_k[train_indices].sum()

        df.loc[count] = ["morgan_feat", size, train_indices.shape[0], num_found/total]
        count += 1
        print(f"Iteration: {count}, Found {num_found} top k ligands")

        # estimate the cutoff once, from the initial random sample:
        # cutoff = np.percentile(scores[train_indices], percentile)

        for i in range(5):
            # fit logreg model:
            x_train = fingerprints[is_train]
            y_train = scores[is_train]
            x_val = fingerprints[~is_train]
            y_val = scores[~is_train]
            
            model.fit(x_train, y_train)

            # predict (slowest step) for logreg:
            proba = model.predict_proba(x_val)[:, 1]

            # rank the probabilities
            proba_sorted = (-proba).argsort()

            # rank the unseen instances:
            test_indices = test_indices[proba_sorted]

            # now append the next N instances from the rank ordered unseen instances onto the training set:
            train = np.concatenate([train_indices, test_indices[:size]])

            # update the isTrain mask and remove those training instances from the test set
            is_train[train_indices] = True
            test_indices = test_indices[size:]

            # keep the train and test idx arrays sorted so they agree with the chunked* methods:
            test_indices.sort()
            train_indices.sort()

            # topK molecules already found in the training set:
            num_found = top_k[train_indices].sum()

            df.loc[count] = ['morgan_feat', size, train_indices.shape[0], num_found/total]
            count += 1
            
            print(f"Iteration: {count}, Found {num_found} top k ligands")
            
            df.to_csv(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_results.csv")

df.to_csv(f"{OUTPUT_DATA_DIR}/{RECEPTOR}_results.csv")

Iteration: 1, Found 1.0 top k ligands
Iteration: 2, Found 1.0 top k ligands


IndexError: index 354486 is out of bounds for axis 0 with size 353771

# Results look like this:
And they can be plotted using `./plot_scripts/plot_wholedataset.py`

In [None]:
pd.read_csv('../processed_data/ampc_reconstruction_0.3_1_.csv')