In [1]:
import os
import sys
import time
import argparse
import progressbar
import networkx as nx
import numpy as np
import pandas as pd

sys.path.append('../pyprot/')
import pyprot.graph_models as graph_models
from pyprot.downloader import PdbDownloader, ConsurfDBDownloader
from pyprot.protein import Protein
from pyprot.structure import Perseus


### Graph verification and preprocessing

In [2]:
import networkx as nx
import pickle
import pyprot.constants
def load_graph(fn):
    with open(fn, "rb") as f:
        return pickle.load(f)
filenames = [fn for fn in os.listdir("graphs/") if ".pkl" in fn]
#filenames = ["3KEU.pkl"]
graphs = [load_graph("graphs/"+fn) for fn in filenames]
print("Loaded {} graphs".format(len(graphs)))

Loaded 700 graphs


In [3]:
for i in progressbar.progressbar(range(len(graphs))):
    graphs[i] = graph_models.GraphModel.get_diffused_graph(graphs[i], steps=3)

100% (700 of 700) |######################| Elapsed Time: 0:24:14 Time:  0:24:14


In [4]:
def touches_ligand(x):
    return x <= 4 or (x<=6 and np.random.binomial(1, 1-(x-4)/2) == 1)
dataframes = []
for graph in graphs:
    df = graph_models.GraphModel.graph_to_dataframe(graph)
    # One hot for AAs
    for code3 in pyprot.constants.AMINOACIDS_3 + ["UNK"]:
        df[code3] = (df.resname == code3).astype(np.int)
    
    for i, coord in enumerate(["x", "y", "z"]):
        df[coord] = df.coord.map(lambda x: x[i])
    df["target"] = df.distance.map(touches_ligand).astype(np.int)
    df = df[[c for c in df.columns if "distance_" not in c]]
    df = df.drop(["full_id", "resname", "coord", "distance"], axis=1)
    dataframes.append(df)

In [None]:
print([df.columns for df in dataframes])

In [None]:
# todo: class balance

## Example model

In [5]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import sklearn.metrics

In [6]:
len(dataframes)

700

### Data splits and preprocessing

In [7]:
def make_set(sets):
    print("Received {} sets".format(len(sets)))
    sets = pd.concat(sets)
    return sets.drop("target", axis=1), sets["target"]
train_feat, train_target = make_set(dataframes[0:600])
#val_feat, val_target = make_set(dataframes[100:120])
test_feat, test_target = make_set(dataframes[600:])

Received 600 sets
Received 100 sets


In [8]:
parameters = {"n_estimators": list(range(10, 200, 10)), "max_depth": list(range(1, 11))}

In [9]:
clf = RandomForestClassifier(random_state = 42, class_weight="balanced")
clf = GridSearchCV(clf, parameters, cv=5, 
                   n_jobs=12,
                   scoring=sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score))
clf.fit(train_feat, train_target)
predicted_target = clf.predict(test_feat)



In [10]:
y = train_target.to_numpy()
y_hat = clf.predict(train_feat)
print("Accuracy:",
     sklearn.metrics.accuracy_score(y, y_hat))
print("Precision score (TP/(TP+FP)): ",
      sklearn.metrics.precision_score(y, y_hat))
print("AUC score: ",
      sklearn.metrics.roc_auc_score(y, y_hat))

Accuracy: 0.7152023717858059
Precision score (TP/(TP+FP)):  0.08549600013704027
AUC score:  0.773249815168968


In [12]:
y = test_target.to_numpy()
y_hat = predicted_target
print("Accuracy:",
     sklearn.metrics.accuracy_score(y, y_hat))
print("Precision score (TP/(TP+FP)): ",
      sklearn.metrics.precision_score(y, y_hat))
print("AUC score: ",
      sklearn.metrics.roc_auc_score(y, y_hat))


Accuracy: 0.7230463740536963
Precision score (TP/(TP+FP)):  0.06783150790700335
AUC score:  0.7264615620142678


In [13]:
clf.best_params_

{'max_depth': 10, 'n_estimators': 120}