In [13]:
from preproc_utils import *
from Get_PSSM import *
from Get_dataset import *
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score,f1_score

chebi = pd.read_table('/Users/suhancho/data/Uniprot_metalbinding_challenge/POS_TRAIN_FULL.tsv')
inpath = '/Users/suhancho/data/Uniprot_metalbinding_challenge/chebi/'
bind_tsv_list = [inpath + f for f in os.listdir(inpath)]
pssm_path = '/Users/suhancho/data/Uniprot_metalbinding_challenge/PSSM/'
pssm_files = [pssm_path+f for f in os.listdir(pssm_path)]
bindlist = pd.concat([pd.read_table(f) for f in bind_tsv_list])
low_labels = [l.replace(' ','') for l in bindlist.Name.value_counts().index[bindlist.Name.value_counts()<1000]]

In [2]:
def calculate_window(num_inspections,bs_idx):
    train_dat=[]
    for i,pssm in tqdm(enumerate(pssm_files[0:num_inspections])):
        ion_file =bind_tsv_list[bs_idx] # bs_idx : 0~29
        ion_name = ion_file.split('/')[-1].split('.')[0]
        bs = get_binding_site(ion_file,pssm.split('/')[-1].split('.')[0])
        try : 
            if len(bs)!=0:
                # gt,fs = get_dataset(get_processed_pssm(pssm),bs)
                gt,fs = get_dataset_padded(get_processed_pssm(pssm),bs)
                # print("Number of positive windows:"+str(len(gt))+'\n'+
                #       "Number of negative windows:"+str(len(fs)))
                for g in gt : 
                    train_dat.append([g.values.tolist(),1])
                for f in fs : 
                    train_dat.append([f.values.tolist(),0])
        except:
            print(pssm)

    return(train_dat,ion_name)

In [3]:
def preproc_data(windowdata):
    train_X = [dat[0] for dat in windowdata]
    train_Y = [dat[1] for dat in windowdata]
    print('Size of dataset : '+str(len(train_X)))
    return(train_X,train_Y)

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
def check_windowdata(traindata):
    sns.histplot([len(t) for t in traindata])
    plt.show()



In [5]:
def filter_traindata(Xdata,Ydata):
    train_X_filtered = [Xdata[i] for i in range(len(Xdata)) if len(Xdata[i])==9]
    train_Y_filtered = [Ydata[i] for i in range(len(Xdata)) if len(Xdata[i])==9]
    return(train_X_filtered,train_Y_filtered)  

In [6]:
from itertools import chain
def flatten_Xdata(filtered_X):
    flatten_trainX = [list(chain.from_iterable(lst)) for lst in filtered_X]
    return(flatten_trainX)

In [41]:
def get_MLmetrics(testset_y,testset_X,classifier,ion_name):
    auc = roc_auc_score(testset_y,list(classifier.predict(testset_X)))
    acc = accuracy_score(testset_y,list(classifier.predict(testset_X)))
    recall = recall_score(testset_y,list(classifier.predict(testset_X)))
    f1 = f1_score(testset_y,list(classifier.predict(testset_X)))
    prec = precision_score(testset_y,list(classifier.predict(testset_X)))
    print('ION = '+ion_name)
    print('\nAUC = '+str(round(auc,2)))
    print('\nAccuracy = '+str(round(acc,2)))
    print('\nRecall = '+str(round(recall,2)))
    print('\nF1 = '+str(round(f1,2)))
    print('Precision = '+str(round(prec,2)))
    return(auc,acc,recall,f1,prec)

In [42]:
def balance_classes(traindata,fold):
    label1 = [traindata[i] for i in range(len(traindata)) if traindata[i][1]==1]
    label0 = [traindata[i] for i in range(len(traindata)) if traindata[i][1]==0]
    balanced0 = random.sample(label0,len(label1)*fold)
    return(balanced0+label1)

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [64]:
result=[]
for i,b in enumerate(bind_tsv_list):
    b_tmp = open(b,'r').readlines()

    if len(b_tmp)<1000:
        iter = 90000
    else : 
        iter = 5000

    train_dat,ion = calculate_window(iter,i)
    train_dat = balance_classes(train_dat,3)
    train_X,train_Y = preproc_data(train_dat)
    # check_windowdata(train_X)
    train_X,train_Y = filter_traindata(train_X,train_Y)
    flatten_trainX = flatten_Xdata(train_X)
    trainX, testX, trainy, testy = train_test_split(flatten_trainX,train_Y,test_size=0.4,shuffle = True,stratify=train_Y)

    X = trainX ; y = trainy

    pipe_svc = make_pipeline(StandardScaler(),SVC(random_state=9510))

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    param_grid = [{'svc__C': param_range, 
                'svc__kernel': ['linear']},
                {'svc__C': param_range, 
                'svc__gamma': param_range, 
                'svc__kernel': ['rbf']}]

    gs = GridSearchCV(estimator=pipe_svc, 
                    param_grid=param_grid, 
                    scoring='accuracy', 
                    cv=3,
                    n_jobs=-1)
    gs = gs.fit(X, y)
    auc,acc,f1,recall,prec = get_MLmetrics(testy,testX,gs,ion)
    result.append([auc,acc,f1,recall,prec,ion])

90000it [01:40, 895.90it/s] 


Size of dataset : 248
ION = Cu(+)

AUC = 0.93

Accuracy = 0.96

Recall = 0.88

F1 = 0.92
Precision = 0.96


90000it [01:00, 1490.77it/s]


Size of dataset : 104
ION = Hg(2+)

AUC = 0.91

Accuracy = 0.95

Recall = 0.82

F1 = 0.9
Precision = 1.0


90000it [01:13, 1228.67it/s]


Size of dataset : 208
ION = [8Fe-7S]cluster

AUC = 0.99

Accuracy = 0.99

Recall = 1.0

F1 = 0.98
Precision = 0.95


90000it [00:52, 1720.22it/s]


Size of dataset : 16
ION = [8Fe-9S-C-homocitryl]cluster

AUC = 0.5

Accuracy = 0.71

Recall = 0.0

F1 = 0.0
Precision = 0.0


90000it [01:23, 1072.76it/s]


Size of dataset : 284
ION = Co(2+)

AUC = 0.85

Accuracy = 0.88

Recall = 0.79

F1 = 0.76
Precision = 0.73


5000it [00:07, 646.73it/s]


Size of dataset : 64
ION = iron-sulfurcluster

AUC = 0.86

Accuracy = 0.92

Recall = 0.71

F1 = 0.83
Precision = 1.0


5000it [03:16, 25.50it/s]


Size of dataset : 2520
ION = [4Fe-4S]cluster

AUC = 0.99

Accuracy = 0.99

Recall = 0.98

F1 = 0.97
Precision = 0.96


90000it [01:33, 962.63it/s] 


Size of dataset : 128
ION = Cd(2+)

AUC = 0.74

Accuracy = 0.85

Recall = 0.54

F1 = 0.64
Precision = 0.78


5000it [01:13, 68.36it/s] 


Size of dataset : 988
ION = adivalentmetalcation

AUC = 0.88

Accuracy = 0.92

Recall = 0.8

F1 = 0.83
Precision = 0.87


5000it [00:27, 180.34it/s]


Size of dataset : 248
ION = Cucation

AUC = 0.83

Accuracy = 0.9

Recall = 0.71

F1 = 0.77
Precision = 0.85


90000it [01:03, 1420.90it/s]


Size of dataset : 32
ION = [Ni-4Fe-5S]cluster

AUC = 0.67

Accuracy = 0.85

Recall = 0.33

F1 = 0.5
Precision = 1.0


5000it [09:27,  8.81it/s]


Size of dataset : 6080
ION = Zn(2+)

AUC = 0.94

Accuracy = 0.96

Recall = 0.92

F1 = 0.91
Precision = 0.91


90000it [00:52, 1705.18it/s]


Size of dataset : 16
ION = Co(3+)

AUC = 0.5

Accuracy = 0.71

Recall = 0.0

F1 = 0.0
Precision = 0.0


90000it [01:27, 1028.77it/s]


Size of dataset : 64
ION = [Ni-4Fe-4S]cluster

AUC = 0.98

Accuracy = 0.96

Recall = 1.0

F1 = 0.92
Precision = 0.86


5000it [01:04, 77.53it/s] 


Size of dataset : 908
ION = Fecation

AUC = 0.93

Accuracy = 0.95

Recall = 0.9

F1 = 0.9
Precision = 0.9


23842it [00:41, 614.85it/s]