In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import *
import tensorflow as tf
from tensorflow import keras
from imblearn.under_sampling import RandomUnderSampler


os.chdir("/home/t326h379/OGP")

In [5]:
df_negative_test = pd.read_csv('Feature_Extraction_O_linked_Testing_Negative_11466_Sites_less.txt',header=None)
df_positive_test = pd.read_csv('Feature_Extraction_O_linked_Testing_Positive_375_Sites_less.txt',header=None)

Header_name = ["Position","PID","Position_redundant","81 Window sequence","S or T"]

col_of_feature = [i for i in range(1,1025)]

Header_name = Header_name + col_of_feature

df_positive_test.columns = Header_name

df_negative_test.columns = Header_name


frames_test = [df_positive_test, df_negative_test]

O_linked_testing = pd.concat(frames_test,ignore_index = True)

df_Test_array = O_linked_testing.drop(["Position","PID","Position_redundant","81 Window sequence","S or T"],axis=1)
df_Test_array = np.array(df_Test_array)

X_test_full = df_Test_array

y_test_full = np.array([1]*374+[0]*11466)

# Undersampled Independent Test Dataset

In [9]:
seed = 370381
print("Seed    :",seed)
rus = RandomUnderSampler(random_state = seed)
X_independent, y_independent = rus.fit_resample(X_test_full,y_test_full)

Seed    : 370381


# Load the Trained Model

In [6]:
model = tf.keras.models.load_model("Prot_T5_my_model_O_linked_Glycosylation370381Prot_T5_Subash_Salman_Neha.h5")

# Predict whether the site is O-linked Glycosylated or Not

In [16]:
Y_pred = model.predict(X_independent)
Y_pred = (Y_pred > 0.5)
y_pred = [np.argmax(y, axis=None, out=None) for y in Y_pred]
y_pred = np.array(y_pred)
print("Independent Testing Matthews Correlation: ",matthews_corrcoef(y_independent, y_pred))
print()
print()
print("Confusion Matrix : \n",confusion_matrix(y_independent, y_pred))
print()
print()
print("Accuracy on test set:   ",accuracy_score(y_independent, y_pred))
print()
cm = confusion_matrix(y_independent, y_pred)

TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]

mcc = matthews_corrcoef(y_independent, y_pred)



Sensitivity = TP/(TP+FN)

Specificity = TN/(TN+FP)

print()
print("Sensitivity:   ",Sensitivity,"\t","Specificity:   ",Specificity)
print()

fpr, tpr, _ = roc_curve(y_independent, y_pred)

roc_auc_test = auc(fpr,tpr)

pre = TP / (TP + FP)
print()
print("Precision :  ",pre)
print()
print()

print("Area Under Curve:   ",roc_auc_test)

Independent Testing Matthews Correlation:  0.6126696948450258


Confusion Matrix : 
 [[308  66]
 [ 79 295]]


Accuracy on test set:    0.8061497326203209


Sensitivity:    0.7887700534759359 	 Specificity:    0.8235294117647058


Precision :   0.817174515235457


Area Under Curve:    0.8061497326203209


In [12]:
model.summary()

Model: "sequential_44"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_88 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_132 (Dropout)        (None, 512)               0         
_________________________________________________________________
dense_89 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_133 (Dropout)        (None, 256)               0         
_________________________________________________________________
Dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_134 (Dropout)        (None, 32)                0         
_________________________________________________________________
Dense_2 (Dense)              (None, 2)               

# Without Undersampling the independent Test Dataset

In [17]:
model_without_undersampling = tf.keras.models.load_model("Without_Undersampling_Prot_T5_my_model_O_linked_Glycosylation361035Prot_T5_Subash_Salman_Neha.h5")

In [None]:
Y_pred = model.predict(X_independent)
Y_pred = (Y_pred > 0.5)
y_pred = [np.argmax(y, axis=None, out=None) for y in Y_pred]
y_pred = np.array(y_pred)
print("Independent Testing Matthews Correlation: ",matthews_corrcoef(y_independent, y_pred))
print()
print()
print("Confusion Matrix : \n",confusion_matrix(y_independent, y_pred))
print()
print()
print("Accuracy on test set:   ",accuracy_score(y_independent, y_pred))
print()
cm = confusion_matrix(y_independent, y_pred)

TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]

mcc = matthews_corrcoef(y_independent, y_pred)



Sensitivity = TP/(TP+FN)

Specificity = TN/(TN+FP)

print()
print("Sensitivity:   ",Sensitivity,"\t","Specificity:   ",Specificity)
print()

fpr, tpr, _ = roc_curve(y_independent, y_pred)

roc_auc_test = auc(fpr,tpr)

pre = TP / (TP + FP)
print()
print("Precision :  ",pre)
print()
print()

print("Area Under Curve:   ",roc_auc_test)

# Thank You