In [2]:
import os
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, roc_auc_score, classification_report

In [3]:
def predict_stacked_model(model, inputX):
    # prepare input data
    X = inputX
    # make prediction
    return model.predict(X, verbose=0)

# Ankh Nucleocytoplasmic Training Dataset

In [4]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")
df_positive = pd.read_csv("Ankh_Feature_5724_Positive_Taining_Site_Intracellular_from_1638_Proteins.txt",header=None)
df_negative = pd.read_csv("Ankh_Feature_232286_Negative_Taining_Site_Intracellular_from_1638_Proteins.txt",header = None)

df_positive_positive = df_positive
df_negative_negative = df_negative

Header_name = ["Position","PID","S_or_T"]+[i for i in range(1,1537)]
df_positive_positive.columns = Header_name
df_negative_negative.columns = Header_name


df_positive_only_feature_vector = df_positive_positive.drop(["Position","PID","S_or_T"],axis=1)
df_positive_only_fv = np.array(df_positive_only_feature_vector)

df_negative_only_feature_vector = df_negative_negative.drop(["Position","PID","S_or_T"],axis=1)
df_negative_only_fv = np.array(df_negative_only_feature_vector)

X_train_full_Ankh = np.vstack((df_positive_only_fv,df_negative_only_fv))

y_train_full = [1]*df_positive_only_fv.shape[0]+[0]*df_negative_only_fv.shape[0]

y_train_full_Ankh = np.array(y_train_full)

print(X_train_full_Ankh.shape, y_train_full_Ankh.shape)

(238010, 1536) (238010,)


# ProtT5 Nucleocytoplasmic Training Dataset

In [5]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")

df_positive = pd.read_csv("Nucleocytoplasmic_5724_with_1638_Proteins.csv")
df_negative = pd.read_csv("Nucleocytoplasmic_232286_with_1638_Proteins.csv")

df_positive_positive = df_positive
df_negative_negative = df_negative

Header_name = ["Unnamed: 0","Position","PID","S_or_T"]+[i for i in range(1,1025)]
df_positive_positive.columns = Header_name
df_negative_negative.columns = Header_name


df_positive_only_feature_vector = df_positive_positive.drop(["Unnamed: 0","Position","PID","S_or_T"],axis=1)
df_positive_only_fv = np.array(df_positive_only_feature_vector)

df_negative_only_feature_vector = df_negative_negative.drop(["Unnamed: 0","Position","PID","S_or_T"],axis=1)
df_negative_only_fv = np.array(df_negative_only_feature_vector)

X_train_full_ProtT5 = np.vstack((df_positive_only_fv,df_negative_only_fv))

y_train_full = [1]*df_positive_only_fv.shape[0]+[0]*df_negative_only_fv.shape[0]

y_train_full_ProtT5 = np.array(y_train_full)

print(X_train_full_ProtT5.shape,y_train_full_ProtT5.shape)

(238010, 1024) (238010,)


# Ankh Nucleocytoplasmic Independent Test Dataset

In [6]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")

df_positive = pd.read_csv("Ankh_Feature_1062_Positive_Independent_Testing_Site_Intracellular_from_183_Proteins.txt",header = None)
df_negative = pd.read_csv("Ankh_Feature_27031_Negative_Independent_Testing_Site_Intracellular_from_183_Proteins.txt",header = None)

df_positive_positive = df_positive
df_negative_negative = df_negative

Header_name = ["Position","PID","S_or_T"]+[i for i in range(1,1537)]
df_positive_positive.columns = Header_name
df_negative_negative.columns = Header_name

df_positive_only_feature_vector = df_positive_positive.drop(["Position","PID","S_or_T"],axis=1)
df_positive_only_fv = np.array(df_positive_only_feature_vector)

df_negative_only_feature_vector = df_negative_negative.drop(["Position","PID","S_or_T"],axis=1)
df_negative_only_fv = np.array(df_negative_only_feature_vector)

X_test_full = np.vstack((df_positive_only_fv,df_negative_only_fv))

y_test_full = [1]*df_positive_only_fv.shape[0]+[0]*df_negative_only_fv.shape[0]

y_test_full = np.array(y_test_full)

X_independent_Ankh = X_test_full

y_independent_Ankh = y_test_full

print(X_independent_Ankh.shape, y_independent_Ankh.shape)

(28093, 1536) (28093,)


# ProtT5 Nucleocytoplasmic Independent Test Dataset

In [7]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")

df_positive_testing = pd.read_csv("Nucleocytoplasmic_Independent_testing_Positive_1062_with_183_Proteins.csv")
df_negative_testing = pd.read_csv("Nucleocytoplasmic_Independent_testing_Negative_27031_with_183_Proteins.csv")

df_positive_only_test_feature_vector = df_positive_testing.drop(["Unnamed: 0","Position","PID","S or T"],axis=1)
df_positive_only_fv_test = np.array(df_positive_only_test_feature_vector)

df_negative_only_test_feature_vector = df_negative_testing.drop(["Unnamed: 0","Position","PID","S or T"],axis=1)
df_negative_only_fv_test = np.array(df_negative_only_test_feature_vector)

X_test_full = np.vstack((df_positive_only_fv_test,df_negative_only_fv_test))
y_test_full = np.array([1]*df_positive_only_fv_test.shape[0]+[0]*df_negative_only_fv_test.shape[0])

X_independent_ProtT5 = X_test_full

y_independent_ProtT5 = y_test_full

print(X_independent_ProtT5.shape, y_independent_ProtT5.shape)

(28093, 1024) (28093,)


# Use the Ankh and ProtT5 features for prediction 

In [10]:
filename = "Model_Nucleus_and_Cytoplasm_Stacked_2687032Model.h5"

stacked_model = tf.keras.models.load_model(filename)

y_independent = y_independent_Ankh

inputX = [X_independent_Ankh,X_independent_ProtT5]

Y_pred = predict_stacked_model(stacked_model, inputX)

Y_pred = (Y_pred > 0.5)
y_pred = [np.argmax(y, axis=None, out=None) for y in Y_pred]
y_pred = np.array(y_pred)

confusion = confusion_matrix(y_independent,y_pred)

print("Matthews Correlation : ",matthews_corrcoef(y_independent, y_pred))
print("Confusion Matrix : \n",confusion_matrix(y_independent, y_pred))
print("Accuracy on test set:   ",accuracy_score(y_independent, y_pred))

cm = confusion_matrix(y_independent, y_pred)

TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]

mcc = matthews_corrcoef(y_independent, y_pred)

Sensitivity = TP/(TP+FN)

Specificity = TN/(TN+FP)

print("Sensitivity:   ",Sensitivity,"\t","Specificity:   ",Specificity)

print(classification_report(y_independent, y_pred))

ANN_model_Prob_auc = roc_auc_score(y_independent, y_pred)

print("Area Under Curve:   ",ANN_model_Prob_auc)

Matthews Correlation :  0.28241943745359493
Confusion Matrix : 
 [[22320  4711]
 [  252   810]]
Accuracy on test set:    0.8233367742854092
Sensitivity:    0.7627118644067796 	 Specificity:    0.8257186193629537
              precision    recall  f1-score   support

           0       0.99      0.83      0.90     27031
           1       0.15      0.76      0.25      1062

    accuracy                           0.82     28093
   macro avg       0.57      0.79      0.57     28093
weighted avg       0.96      0.82      0.88     28093

Area Under Curve:    0.7942152418848667


# Thank You