In [1]:
import json 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Load the per-protein property table from its JSON file
with open("../protein_properties/proteins.json", "r") as properties_file:
    protein_properties = json.load(properties_file)

len(protein_properties)

20427

In [3]:
# Load the per-protein drug information that was extracted from DrugBank
with open("../drugbank/protein_drugbank.json", "r") as drugbank_file:
    drugs = json.load(drugbank_file)

# A protein is kept as druggable as soon as one of its drug entries has
# "approved" in the entry's second field; each protein is listed at most once.
druggable_prots = [
    protein
    for protein, drug_entries in drugs.items()
    if any("approved" in entry[1] for entry in drug_entries.values())
]

len(druggable_prots)

2652

#### Machine Learning Models

In [9]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [5]:
# Here we investigate all properties at once. Each protein becomes one feature row:
# Molar Extinction Coefficient (two values), GRAVY, Isoelectric Point, Instability Index,
# Aromaticity, Sequence Length, Molecular Weight, then the Amino Acid Percent values.
def _feature_vector(info):
    """Return the feature row (a flat list) for one protein's property record.

    Parameters
    ----------
    info : dict
        One value of ``protein_properties``; must provide the keys read below.

    Returns
    -------
    list
        The eight scalar properties followed by the amino-acid percentages.
    """
    extinction = info['Molar Extinction Coefficient']
    scalar_features = [
        extinction[0],
        extinction[1],
        info['GRAVY'],
        info['Isoelectric Point'],
        info['Instability Index'],
        info['Aromaticity'],
        info['Sequence Length'],
        info['Molecular Weight'],
    ]
    # NOTE(review): the amino-acid columns follow each record's own key order;
    # this assumes every record lists the amino acids in the same order -- confirm.
    return scalar_features + list(info['Amino Acid Percent'].values())


# Membership tests against a set are constant-time; testing against the
# druggable_prots list directly rescans that list for every protein.
druggable_lookup = set(druggable_prots)

X_druggable = []
X_non_druggable = []

for prot, info in protein_properties.items():
    target_rows = X_druggable if prot in druggable_lookup else X_non_druggable
    target_rows.append(_feature_vector(info))

In [6]:
# Seed NumPy's global RNG so the shuffles below (and later ones) are repeatable
random_state = 42
np.random.seed(random_state)

X_druggable = np.array(X_druggable)
X_non_druggable = np.array(X_non_druggable)

# Shuffle the rows of each class in place: druggable first, then non-druggable
for class_rows in (X_druggable, X_non_druggable):
    np.random.shuffle(class_rows)

X_druggable.shape, X_non_druggable.shape

((2652, 28), (17775, 28))

In [7]:
train_ratio = 0.8

# Number of leading rows of each (already shuffled) class that go to training
n_druggable_train = int(train_ratio * len(X_druggable))
n_non_druggable_train = int(train_ratio * len(X_non_druggable))

# Split each class separately so both splits keep the same class proportions
X_train = np.concatenate([X_druggable[:n_druggable_train], X_non_druggable[:n_non_druggable_train]], axis=0)
X_test = np.concatenate([X_druggable[n_druggable_train:], X_non_druggable[n_non_druggable_train:]], axis=0)

# Labels in the same row order as above: 1 = druggable, 0 = non-druggable
y_train = np.concatenate([np.ones(n_druggable_train), np.zeros(n_non_druggable_train)], axis=0)
y_test = np.concatenate(
    [
        np.ones(len(X_druggable) - n_druggable_train),
        np.zeros(len(X_non_druggable) - n_non_druggable_train),
    ],
    axis=0,
)

# Shuffle the training rows; the labels ride along as a last column so
# features and labels stay aligned, then the two are separated again.
data = np.column_stack([X_train, y_train])
np.random.shuffle(data)
X_train = data[:, :-1]
y_train = data[:, -1]

# Shapes of the training and testing data
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16341, 28), (16341,), (4086, 28), (4086,))

In [8]:
# Fit the scaler on the training split only, then apply that same
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# clf1-clf3: support vector classifiers that differ only in their kernel
clf1, clf2, clf3 = (
    SVC(kernel=kernel_name, probability=True, random_state=random_state)
    for kernel_name in ('linear', 'rbf', 'poly')
)

# clf4-clf5: logistic regression and 5-nearest-neighbours
clf4 = LogisticRegression(random_state=random_state)
clf5 = KNeighborsClassifier(n_neighbors=5)

# clf6-clf8: decision tree, random forest and AdaBoost, all seeded identically
clf6, clf7, clf8 = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (DecisionTreeClassifier, RandomForestClassifier, AdaBoostClassifier)
)

In [13]:
# Fit the SVMs, logistic regression and KNN in order, printing a progress
# line as soon as each one finishes.
for model, done_message in (
    (clf1, "SVM Linear trained"),
    (clf2, "SVM RBF trained"),
    (clf3, "SVM Poly trained"),
    (clf4, "Logistic Regression trained"),
    (clf5, "KNN trained"),
):
    model.fit(X_train, y_train)
    print(done_message)

SVM Linear trained
SVM RBF trained
SVM Poly trained
Logistic Regression trained
KNN trained


In [14]:
# Fit the tree-based models in order, printing a progress line after each
for model, done_message in (
    (clf6, "Decision Tree trained"),
    (clf7, "Random Forest trained"),
):
    model.fit(X_train, y_train)
    print(done_message)

Decision Tree trained
Random Forest trained


In [15]:
# Fit the AdaBoost model on the training split, then report completion
clf8.fit(X=X_train, y=y_train)
print("AdaBoost trained")

AdaBoost trained


In [30]:
def evaluate(y_test, y_pred, y_score=None):
    """Print and return the classification metrics for one set of predictions.

    Each metric is computed exactly once, so a degenerate prediction set does
    not trigger the same scikit-learn warning twice per call.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted hard class labels.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC is computed from ``y_pred``, as
        before; on hard labels that value reflects a single operating point
        rather than the ranking quality of the model.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same 0.0 as the default when no positives are
    # predicted (or present), but without emitting UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred if y_score is None else y_score)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("ROC AUC Score: ", roc_auc)

    return accuracy, precision, recall, f1, roc_auc

In [22]:
# Predict class
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)
y_pred4 = clf4.predict(X_test)
y_pred5 = clf5.predict(X_test)
y_pred6 = clf6.predict(X_test)
y_pred7 = clf7.predict(X_test)
y_pred8 = clf8.predict(X_test)

In [26]:
# Number of positive labels in the test split, followed by the number of
# positives predicted by each model (clf1 through clf8)
tuple(
    np.sum(labels)
    for labels in (y_test, y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8)
)

(531.0, 0.0, 3.0, 13.0, 5.0, 294.0, 584.0, 58.0, 49.0)

In [28]:
# Print the metrics of every model, separated by a rule. The last model is
# handled outside the loop so that no rule follows it and its evaluate(...)
# call remains the final expression of the cell.
named_predictions = [
    ("SVM Linear", y_pred1),
    ("SVM RBF", y_pred2),
    ("SVM Poly", y_pred3),
    ("Logistic Regression", y_pred4),
    ("KNN", y_pred5),
    ("Decision Tree", y_pred6),
    ("Random Forest", y_pred7),
    ("AdaBoost", y_pred8),
]
*leading_models, (last_title, last_predicted) = named_predictions

for title, predicted in leading_models:
    print(title)
    evaluate(y_test, predicted)
    print("=====================================")

print(last_title)
evaluate(y_test, last_predicted)

SVM Linear
Accuracy:  0.8700440528634361
Precision:  0.0
Recall:  0.0
F1 Score:  0.0
ROC AUC Score:  0.5
SVM RBF
Accuracy:  0.8707782672540382
Precision:  1.0
Recall:  0.005649717514124294
F1 Score:  0.011235955056179775
ROC AUC Score:  0.5028248587570622
SVM Poly
Accuracy:  0.8707782672540382
Precision:  0.6153846153846154
Recall:  0.015065913370998116
F1 Score:  0.029411764705882346
ROC AUC Score:  0.506829721805049
Logistic Regression
Accuracy:  0.8688203622124328
Precision:  0.0
Recall:  0.0
F1 Score:  0.0
ROC AUC Score:  0.49929676511954996
KNN
Accuracy:  0.8710230053842388
Precision:  0.5068027210884354
Recall:  0.2806026365348399
F1 Score:  0.3612121212121212
ROC AUC Score:  0.6199075067343679
Decision Tree
Accuracy:  0.7980910425844346
Precision:  0.2482876712328767
Recall:  0.2730696798493409
F1 Score:  0.26008968609865474
ROC AUC Score:  0.5747908174211542
Random Forest
Accuracy:  0.8764072442486539
Precision:  0.7241379310344828
Recall:  0.07909604519774012
F1 Score:  0.1426

  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Collect the metrics of every model into one pandas table:
# one row per model, one column per metric returned by evaluate().
metric_names = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score"]
predictions_by_model = {
    "SVM Linear": y_pred1,
    "SVM RBF": y_pred2,
    "SVM Poly": y_pred3,
    "Logistic Regression": y_pred4,
    "KNN": y_pred5,
    "Decision Tree": y_pred6,
    "Random Forest": y_pred7,
    "AdaBoost": y_pred8,
}

results = pd.DataFrame(columns=metric_names, index=list(predictions_by_model))
for model_name, predicted in predictions_by_model.items():
    results.loc[model_name] = evaluate(y_test, predicted)
results

Accuracy:  0.8700440528634361
Precision:  0.0
Recall:  0.0
F1 Score:  0.0
ROC AUC Score:  0.5
Accuracy:  0.8707782672540382
Precision:  1.0
Recall:  0.005649717514124294
F1 Score:  0.011235955056179775
ROC AUC Score:  0.5028248587570622
Accuracy:  0.8707782672540382
Precision:  0.6153846153846154
Recall:  0.015065913370998116
F1 Score:  0.029411764705882346
ROC AUC Score:  0.506829721805049
Accuracy:  0.8688203622124328
Precision:  0.0
Recall:  0.0
F1 Score:  0.0
ROC AUC Score:  0.49929676511954996
Accuracy:  0.8710230053842388
Precision:  0.5068027210884354
Recall:  0.2806026365348399
F1 Score:  0.3612121212121212
ROC AUC Score:  0.6199075067343679
Accuracy:  0.7980910425844346
Precision:  0.2482876712328767
Recall:  0.2730696798493409
F1 Score:  0.26008968609865474
ROC AUC Score:  0.5747908174211542
Accuracy:  0.8764072442486539
Precision:  0.7241379310344828
Recall:  0.07909604519774012
F1 Score:  0.14261460101867574
ROC AUC Score:  0.5372976709814298
Accuracy:  0.8717572197748409
P

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
SVM Linear,0.870044,0.0,0.0,0.0,0.5
SVM RBF,0.870778,1.0,0.00565,0.011236,0.502825
SVM Poly,0.870778,0.615385,0.015066,0.029412,0.50683
Logistic Regression,0.86882,0.0,0.0,0.0,0.499297
KNN,0.871023,0.506803,0.280603,0.361212,0.619908
Decision Tree,0.798091,0.248288,0.27307,0.26009,0.574791
Random Forest,0.876407,0.724138,0.079096,0.142615,0.537298
AdaBoost,0.871757,0.571429,0.052731,0.096552,0.523412
