In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
from scipy import sparse
import altair as alt

In [2]:
# Select Altair's renderer named 'default' for displaying charts in this notebook.
alt.renderers.enable('default')

RendererRegistry.enable('default')

# Generate some indices
Even the sparse matrices won't fit in memory all at once, so we have to loop through them chunk by chunk when making predictions or sampling random items.

In [None]:
# Count the number of items per chunk and accumulate them into row offsets:
# indptr[c]:indptr[c+1] is the range of global item indices held by chunk c.
indptr = [0]

for chunkID in range(10):
    # mmap_mode='r' memory-maps the file, so only the .npy header is needed to
    # learn the chunk length; the score values themselves are not read into RAM.
    chunk_scores = np.load(f'../processed_data/AmpC_all{chunkID}.npy', mmap_mode='r')
    indptr.append(indptr[-1] + chunk_scores.shape[0])


In [None]:
# Load the score array of every chunk and join them, in chunk order, into one
# flat array indexed by global item position.
score_files = [f'../processed_data/AmpC_all{i}.npy' for i in range(10)]
scores = np.concatenate([np.load(path) for path in score_files])

# functions to handle the slabs

For training, these loop through the chunks and extract the indices that have been selected either at random or suggested by the surrogate model. 

For predicting, these loop through the chunks and perform the `predict_proba` method on each chunk (after removing the training indices), outputting a concatenated numpy array of predicted values.

In [None]:
def extractFPs(chunkID, indptr, isTrain):
    """Load one sparse chunk from disk and return only its selected rows.

    Parameters
    ----------
    chunkID : int
        Chunk number; selects the file ``AmpC_all{chunkID}.npz``.
    indptr : sequence of int
        Cumulative row offsets; chunk ``c`` covers global rows
        ``indptr[c]:indptr[c + 1]``.
    isTrain : boolean array indexed by global row
        Row-selection mask. Callers pass the inverted mask to obtain the
        rows that are *not* selected for training.

    Returns
    -------
    The rows of the chunk's sparse matrix for which the mask is True.
    """
    chunk_matrix = sparse.load_npz(f'../processed_data/AmpC_all{chunkID}.npz')
    # Cut the global mask down to the slice that belongs to this chunk.
    first_row = indptr[chunkID]
    end_row = indptr[chunkID + 1]
    return chunk_matrix[isTrain[first_row:end_row]]

def buildTrain(indptr, isTrain, verbose=0):
    """Assemble the sparse training matrix from all chunks.

    Parameters
    ----------
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : boolean array indexed by global row
        True for rows that belong to the training set.
    verbose : int, optional
        Any truthy value prints a progress message.

    Returns
    -------
    Sparse matrix stacking the selected rows of every chunk, in chunk order
    (and therefore in ascending global row order).
    """
    if verbose:
        print('building training matrix')
    # The number of chunks follows from indptr rather than being fixed at 10,
    # so the helper keeps working if the data is split differently.
    n_chunks = len(indptr) - 1
    return sparse.vstack([extractFPs(i, indptr, isTrain) for i in range(n_chunks)])

def chunkPredictProba(model, indptr, isTrain, verbose=0):
    """Predict class-1 probabilities for every non-training row, chunk by chunk.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : boolean array indexed by global row
        True for training rows; these are excluded from prediction.
    verbose : int, optional
        Any truthy value prints a progress message.

    Returns
    -------
    1-D numpy array holding column 1 of ``predict_proba`` for each
    non-training row, concatenated in chunk order.
    """
    if verbose:
        print('predicting probabilities')
    # Invert the mask once instead of once per chunk.
    notTrain = ~isTrain
    probas = []
    # Chunk count is derived from indptr rather than hard-coded.
    for chunkID in range(len(indptr) - 1):
        fps = extractFPs(chunkID, indptr, notTrain)
        probas.append(model.predict_proba(fps)[:, 1])
    return np.concatenate(probas)

def chunkPredict(model, indptr, isTrain, verbose=0):
    """Predict (negated) values for every non-training row, chunk by chunk.

    Counterpart of ``chunkPredictProba`` for models that only expose
    ``predict``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    indptr : sequence of int
        Cumulative row offsets, one more entry than there are chunks.
    isTrain : boolean array indexed by global row
        True for training rows; these are excluded from prediction.
    verbose : int, optional
        Any truthy value prints a progress message.

    Returns
    -------
    1-D numpy array of ``-1 * model.predict(...)`` for each non-training
    row, concatenated in chunk order.
    """
    if verbose:
        # This path calls predict(), not predict_proba(), so say so.
        print('predicting scores')
    # Invert the mask once instead of once per chunk.
    notTrain = ~isTrain
    preds = []
    # Chunk count is derived from indptr rather than hard-coded.
    for chunkID in range(len(indptr) - 1):
        fps = extractFPs(chunkID, indptr, notTrain)
        # Negate so that the best-scoring items rank on top, matching the
        # ordering convention of the probabilities.
        preds.append(-1 * model.predict(fps))
    return np.concatenate(preds)

# Train a Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Classifier used as the surrogate model: the experiment loop re-fits this same
# object on each iteration and ranks unseen items by its predicted probability.
# max_iter is raised to 10000, presumably so the solver has room to converge.
model = LogisticRegression(max_iter=10000, C=1)


In [None]:

# Rank every item by score (rank 0 = lowest score); applying argsort twice
# turns the sort order into per-item ranks.
score_ranks = np.argsort(np.argsort(scores))

# Boolean mask over all items marking the 50,000 lowest-scoring ones (~0.05th percentile).
topK = score_ranks < 50_000
del score_ranks  # the rank array is as long as the whole library; release it

# Number of top-k items, used as the denominator for the fraction found.
tot = topK.sum()

# With Altair, using three repeats:

In [None]:

# Active-learning reconstruction experiment.
#
# For each repeat and each initial training-set size:
#   1. draw a random initial training set and record how many top-k items it holds;
#   2. five times: fit the model, score all unseen items, move the `size`
#      highest-ranked unseen items into the training set, and record the
#      cumulative number of top-k items found.
# Results are written to CSV after every iteration so a long run can be
# inspected (or survive an interruption) part-way through.
#
# NOTE(review): no random seed is set in the visible cells, so each run draws
# different splits.
trainingSetSizes=[400_000, 200_000, 100_000]


for percentile in [0.3 ]:
    df = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])
    count=0

    # Named loop variables: the original reused `i` for both the repeat loop
    # and the inner iteration loop, shadowing the repeat counter.
    for repeat in range(3):
        idx = np.arange(scores.shape[0])
        np.random.shuffle(idx)

        for size in trainingSetSizes:
            #split indices into train and test:
            train = idx[:size].copy()
            test = idx[size:].copy()
            train.sort()
            test.sort()

            #generate a 'is a training instance' mask.
            #allocate as bool directly instead of building a float64 array and casting it.
            isTrain = np.zeros(scores.shape[0], dtype=bool)
            isTrain[train]=True

            #topK molecules already found in the training set:
            numFound = topK[train].sum()

            df.loc[count] = ['morgan_feat', size, train.shape[0], numFound/tot]
            count+=1
            print(count, numFound)

            #estimate the cutoff once, from the initial random sample:
            #items scoring below this value are the positive class for the classifier.
            cutoff = np.percentile(scores[train], percentile)

            for iteration in range(5):

                #fit logreg model:
                model.fit(buildTrain(indptr, isTrain, 1), scores[isTrain]<cutoff)
                #fit ridge:
                #model.fit(buildTrain(indptr, isTrain, 1), scores[isTrain])

                #predict (slowest step) for logreg:
                proba = chunkPredictProba(model, indptr, isTrain, 1)
                #predict (slowest step) for ridge:
                #proba = chunkPredict(model, indptr, isTrain, 1)

                #rank the probabilities (highest first)
                proba_sorted = (-proba).argsort()

                #rank the unseen instances; `test` is sorted, so it lines up
                #element-for-element with the chunked predictions:
                test = test[proba_sorted]

                #now append the next N instances from the rank ordered unseen instances onto the training set:
                train = np.concatenate([train, test[:size]])

                #update the isTrain mask:
                isTrain[train]=True

                #now remove those training instances from the test set:
                test = test[size:]

                #keep the train and test idx arrays sorted so they agree with the chunked* methods:
                test.sort()
                train.sort()

                #topK molecules already found in the training set:
                numFound = topK[train].sum()

                df.loc[count] = ['morgan_feat', size, train.shape[0], numFound/tot]
                count+=1
                print(count, numFound)
                #checkpoint the results after every iteration:
                df.to_csv('../processed_data/ampc_reconstruction_'+str(percentile)+'_1_.csv')

    df.to_csv('../processed_data/ampc_reconstruction_'+str(percentile)+'_1_.csv')

    

# Results look like this:
And they can be plotted using `./plot_scripts/plot_wholedataset.py`

In [10]:
# Reload the saved reconstruction results (first CSV column is the index) and
# relabel every row with the display name used in the comparison plot.
df = pd.read_csv(
    '../processed_data/ampc_reconstruction_0.3_1_.csv',
    index_col=0,
).assign(Algorithm='AmpC:LogReg (lewis)')

In [11]:
# Reference results labelled '(Graff)', one entry per algorithm and training
# size: [algorithm label, training size, % top-k found, std dev of that %].
prev_results = [
    ['AmpC:RF (Graff)', 400_000, 71.4, 2.1],
    ['AmpC:NN (Graff)', 400_000, 74.7, 1.4],
    ['AmpC:MPN (Graff)', 400_000, 87.9, 2.3],
    ['AmpC:RF (Graff)', 200_000, 45.5, 1.8],
    ['AmpC:NN (Graff)', 200_000, 52.8, 0.5],
    ['AmpC:MPN (Graff)', 200_000, 67.1, 2.1],
    ['AmpC:RF (Graff)', 100_000, 24.0, 2.2],
    ['AmpC:NN (Graff)', 100_000, 33.3, 0.3],
    ['AmpC:MPN (Graff)', 100_000, 52.0, 0.5],
]

coley = pd.DataFrame(columns=['Algorithm', 'Training size', 'N ligands explored', '% top-k found'])

# Each reference entry is expanded into three synthetic rows (below, at, and
# above the listed value) whose standard deviation (np.std, ddof=0) equals the
# listed std dev, so these rows can be aggregated like the three real repeats.
unit_offsets = np.array([-1, 0, 1]).astype(float)
count = 0
for algorithm, train_size, pct_found, pct_std in prev_results:
    offsets = unit_offsets * (pct_std / np.std(unit_offsets))
    for offset in offsets:
        # N ligands explored is six times the training size; the value is
        # stored as a fraction (divided by 100) to match the experiment's rows.
        coley.loc[count] = [algorithm, train_size, train_size*6, (offset+pct_found)/100]
        count += 1

In [12]:
def _format_with_thousands(num):
    """Render an integer with comma thousands separators, e.g. 400000 -> '400,000'."""
    return f"{num:,d}"


# Stack this notebook's results on top of the reference results.
concat = pd.concat([df, coley])

# Fractions -> percentages for plotting.
concat['% top-k found'] = concat['% top-k found'] * 100

# Rename the columns for display ('Training size' becomes 'Training set size').
plot_columns = ['Algorithm', 'Training set size', 'N ligands explored', '% top-k found']
concat.columns = plot_columns

# Training sizes become formatted strings so they can serve as facet labels.
concat['Training set size'] = concat['Training set size'].apply(_format_with_thousands)

# Dividing by 60*60*24 converts seconds to days, i.e. this treats each explored
# ligand as one second of CPU time — NOTE(review): confirm that assumption.
concat['Computation days (single CPU)'] = concat['N ligands explored'] / 60 / 60 / 24

In [13]:
# Three layers share the same x/y/colour encodings:
#   error_bars - confidence-interval bars across the repeats at each x value
#   points     - mean % found at each x value (with a tooltip showing the mean)
#   line       - mean % found joined per algorithm
error_bars = alt.Chart(concat).mark_errorbar(extent='ci').encode(
  x=alt.X('N ligands explored:Q',title='Number of ligands sampled'),
  y=alt.Y('% top-k found:Q', title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

points = alt.Chart(concat).mark_point(filled=False, size=40, color='black').encode(
  x=alt.X('N ligands explored:Q'),
  y=alt.Y('% top-k found:Q',aggregate='mean',title='% top 50,000 found'),
    color=alt.Color('Algorithm'),
    tooltip=alt.Tooltip('% top-k found:Q',aggregate='mean',title='% top 50,000 found')
)

line = alt.Chart(concat).mark_line(color='black',size=2,opacity=0.5).encode(
  x=alt.X('N ligands explored:Q'),
  y=alt.Y('% top-k found:Q',aggregate='mean',title='% top 50,000 found'),
    color=alt.Color('Algorithm')
)

# One facet per training-set size, largest first. The sort values must match
# the column's actual values, which are the comma-formatted strings produced
# when 'Training set size' was built (e.g. '400,000'); the previous numeric
# list [0.004, 0.002, 0.001] matched none of them, so it had no effect.
# x scales are independent because each facet spans a different ligand range.
ch = (error_bars+points+line).properties(height=300,width=150).facet(
    column=alt.Column('Training set size:N',sort=alt.Sort(['400,000', '200,000', '100,000'])),
).resolve_scale(x='independent')
# ch.save('../../figures/active_learning_percentage.html')

In [14]:
# Bare expression as the last line of the cell so the notebook displays the chart.
ch