In [4]:
import os
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, roc_auc_score, classification_report

In [5]:
def predict_stacked_model(model, inputX):
    # prepare input data
    X = inputX
    # make prediction
    return model.predict(X, verbose=0)

# Ankh Nucleocytoplasmic Training Dataset

In [96]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")
df_positive = pd.read_csv("Ankh_Feature_5724_Positive_Taining_Site_Intracellular_from_1638_Proteins.txt",header=None)
df_negative = pd.read_csv("Ankh_Feature_232286_Negative_Taining_Site_Intracellular_from_1638_Proteins.txt",header = None)

df_positive_positive = df_positive
df_negative_negative = df_negative

Header_name = ["Position","PID","S_or_T"]+[i for i in range(1,1537)]
df_positive_positive.columns = Header_name
df_negative_negative.columns = Header_name

df_positive_only_feature_vector = df_positive_positive.drop(["Position","PID","S_or_T"],axis=1)
df_positive_only_fv = np.array(df_positive_only_feature_vector)

df_negative_only_feature_vector = df_negative_negative.drop(["Position","PID","S_or_T"],axis=1)
df_negative_only_fv = np.array(df_negative_only_feature_vector)

X_train_full_Ankh = np.vstack((df_positive_only_fv,df_negative_only_fv))

y_train_full = [1]*df_positive_only_fv.shape[0]+[0]*df_negative_only_fv.shape[0]

y_train_full_Ankh = np.array(y_train_full)

print(X_train_full_Ankh.shape, y_train_full_Ankh.shape)

(238010, 1536) (238010,)


In [97]:
Training_PID_positive = list(df_positive_positive["PID"])
Training_Position_position = list(df_positive_positive["Position"])

In [98]:
Positive_Training = set()
for i in range(len(Training_PID_positive)):
    Tuple_value = (Training_PID_positive[i],Training_Position_position[i])
    Positive_Training.add(Tuple_value)

In [99]:
len(Positive_Training)

5724

In [100]:
list(Positive_Training)[:5]

[('Q9UPA5', 1652),
 ('Q9NRM2', 379),
 ('Q5T6F2', 20),
 ('O95487', 255),
 ('Q14980', 1847)]

In [101]:
Training_PID_Negative = list(df_negative_negative["PID"])
Training_Position_Negative = list(df_negative_negative["Position"])

Negative_Training = set()
for i in range(len(Training_PID_Negative)):
    Umpa_Lumpa_value = (Training_PID_Negative[i],Training_Position_Negative[i])
    Negative_Training.add(Umpa_Lumpa_value)

In [102]:
len(Negative_Training)

232286

In [103]:
list(Negative_Training)[:5]

[('O14686', 3830),
 ('Q7Z2Y8', 2334),
 ('Q86XN7', 798),
 ('Q9Y4B6', 95),
 ('Q8NEN0', 572)]

# Lets Check if there is any redundancy within the training sites and protein id

In [13]:
len(Negative_Training.intersection(Positive_Training))

0

# Hurray no redundancy within the training sits and protein id

In [15]:
Total_Training = Negative_Training.union(Positive_Training)

In [17]:
list(Total_Training)[:5]

[('O14686', 3830),
 ('Q7Z2Y8', 2334),
 ('Q86XN7', 798),
 ('Q9Y4B6', 95),
 ('Q8NEN0', 572)]

In [18]:
len(Total_Training)

238010

# Ankh Nucleocytoplasmic Independent Test Dataset

In [46]:
os.chdir("/homes/t326h379/Ankh_nucleocytoplasmic_dbptm_2231")

df_positive_Test = pd.read_csv("Ankh_Feature_1062_Positive_Independent_Testing_Site_Intracellular_from_183_Proteins.txt",header = None)
df_negative_Test = pd.read_csv("Ankh_Feature_27031_Negative_Independent_Testing_Site_Intracellular_from_183_Proteins.txt",header = None)

df_positive_positive_Test = df_positive_Test
df_negative_negative_Test = df_negative_Test

Header_name = ["Position","PID","S_or_T"]+[i for i in range(1,1537)]
df_positive_positive_Test.columns = Header_name
df_negative_negative_Test.columns = Header_name

df_positive_only_feature_vector = df_positive_positive_Test.drop(["Position","PID","S_or_T"],axis=1)
df_positive_only_fv = np.array(df_positive_only_feature_vector)

df_negative_only_feature_vector = df_negative_negative_Test.drop(["Position","PID","S_or_T"],axis=1)
df_negative_only_fv = np.array(df_negative_only_feature_vector)

X_test_full = np.vstack((df_positive_only_fv,df_negative_only_fv))

y_test_full = [1]*df_positive_only_fv.shape[0]+[0]*df_negative_only_fv.shape[0]

y_test_full = np.array(y_test_full)

X_independent_Ankh = X_test_full

y_independent_Ankh = y_test_full

print(X_independent_Ankh.shape, y_independent_Ankh.shape)

(28093, 1536) (28093,)


In [47]:
Test_PID_positive = list(df_positive_positive_Test["PID"])
Test_Position_positive = list(df_positive_positive_Test["Position"])

Positive_Test = set()
for i in range(len(Test_PID_positive)):
    Tuple_value = (Test_PID_positive[i],Test_Position_positive[i])
    Positive_Test.add(Tuple_value)

In [48]:
list(Positive_Test)[:5]

[('P49790', 550),
 ('Q6PJT7', 331),
 ('Q8IX12', 131),
 ('Q14157', 967),
 ('P04792', 187)]

In [49]:
len(Positive_Test)

1062

In [50]:
Test_PID_Negative = list(df_negative_negative_Test["PID"])
Test_Position_Negative = list(df_negative_negative_Test["Position"])

Negative_Test = set()
for i in range(len(Test_PID_Negative)):
    Umpa_Lumpa_value = (Test_PID_Negative[i],Test_Position_Negative[i])
    Negative_Test.add(Umpa_Lumpa_value)

In [51]:
list(Negative_Test)[:5]

[('Q03164', 2027),
 ('Q9NRA8', 792),
 ('Q76NI1', 274),
 ('O14974', 520),
 ('Q03001', 3583)]

In [52]:
len(Negative_Test)

27031

# Lets Check if there is any redundancy within the independent test sites and protein id

In [53]:
Positive_Test.intersection(Negative_Test)

set()

In [54]:
total_testing = Positive_Test.union(Negative_Test)

# Lets Check if there is any redundancy across the independent test (Protein ID and Sites) and training (Protein ID and Sites)

In [55]:
len(Total_Training)

238010

In [56]:
list(Total_Training)[:5]

[('O14686', 3830),
 ('Q7Z2Y8', 2334),
 ('Q86XN7', 798),
 ('Q9Y4B6', 95),
 ('Q8NEN0', 572)]

In [57]:
list(Total_Training)[-5:]

[('Q8NDV7', 1603),
 ('Q15020', 642),
 ('Q9HCD6', 233),
 ('P51610', 961),
 ('Q8N3P4', 313)]

In [58]:
len(total_testing)

28093

In [59]:
list(total_testing)[:5]

[('P07197', 904),
 ('Q03164', 2027),
 ('Q9NRA8', 792),
 ('P49790', 1215),
 ('Q76NI1', 274)]

In [60]:
list(total_testing)[-5:]

[('P49790', 276),
 ('Q3YEC7', 535),
 ('Q5TGY3', 11),
 ('O95817', 348),
 ('Q9H2D6', 146)]

In [61]:
Total_Training.intersection(total_testing)

set()

# Hurray There is no redundancy across training and independent testing

# Download training Protein

In [66]:
len(set(df_positive_positive_Test["PID"]))

183

In [67]:
len(set(df_negative_negative["PID"]))

183

In [68]:
Total_Test_Protein = set(df_positive_positive_Test["PID"]).union(set(df_negative_negative["PID"]))

In [72]:
import os
os.chdir("/homes/t326h379/Nucleocytoplasmic_183_Test_Proteins")
list_Total_Test_Protein = list(Total_Test_Protein)
with open("Nucleocytoplasmic_183_Test_Proteins.txt","a+") as Subash:
    for value in list_Total_Test_Protein:
        download_link = "https://rest.uniprot.org/uniprotkb/"+value+".fasta"
        Subash.write(download_link+"\n")    

# Extract the 81 Positive Window of the Independent Test Dataset from Testing Proteins

In [73]:
len(Test_PID_positive), len(Test_Position_positive)

(1062, 1062)

In [112]:
import os 
os.chdir("/homes/t326h379/Nucleocytoplasmic_183_Test_Proteins")

import re
from Bio import SeqIO

import os 
window_size = 81

def open_file_extract_window(pid,position):
    position = int(position)-1
    fasta_file = pid+".fasta"
    with open("GlacNAC_81_window_INDEPENDENT_TEST_Positive.fasta","a+") as fp:
        for seq_record in SeqIO.parse(fasta_file,"fasta"):
            placeholder = seq_record.id.split("|")[1]  
            seq = str(seq_record.seq)
            if placeholder == pid:
                position = int(position)
                if (seq[position] == "S" or seq[position] == "T"):
                    half_window = window_size // 2

                    C_terminal_calculation = len(seq) - position
                   
                   
                    if len(seq) > 41 and len(seq) < 81:
                        check_length_of_t_terminal = seq[position+1:]
                        check_length_of_N_terminal = seq[:position]

                        
                        if len(check_length_of_t_terminal) < 41 and len(check_length_of_N_terminal) < 41:
                            half_window_size = 40
                            n_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_N_terminal)
                            fillerr=[]            
                            for x in range(half_window_size):
                                dummy="-"
                                fillerr.append(dummy)
                            select = fillerr[0:int(n_terminal_dummy_to_be_filled)]
                            nTer_adjust = ''.join(select)
                            N_Terminal_needed_sequence = nTer_adjust+check_length_of_N_terminal
                            
                            t_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_t_terminal)
                            t_fill = []
                            for x in range(half_window_size):
                                dummy="-"
                                t_fill.append(dummy)
                            t_select = t_fill[0:int(t_terminal_dummy_to_be_filled)]
                            tTer_adjust = ''.join(t_select)
                            TT_Terminal_needed_sequence = check_length_of_t_terminal+tTer_adjust
                            
                            final_window = N_Terminal_needed_sequence + seq[position] + TT_Terminal_needed_sequence
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(final_window)
                            fp.write("\n")                           

                        else:
                            if position < 41:
                                n_terminal_sequence = seq[:position+1]
                                p_terminal_sequence = seq[position+1:41+position]
                                needed_sequence = n_terminal_sequence+p_terminal_sequence
                                dummy_to_be_filled = window_size - len(needed_sequence)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                nTer_adjust = ''.join(select)
                                motif = nTer_adjust +needed_sequence
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(motif)
                                fp.write("\n")  

                            if position > 41 or position == 41:
                                index_of_half_window_sequence = position-40
                                n_terminal = seq[index_of_half_window_sequence:position]
                                needed_sequence_we_need = n_terminal+seq[position:]

                                dummy_to_be_filled = window_size - len(needed_sequence_we_need)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                tTer_adjust = ''.join(select)
                                total_length = needed_sequence_we_need+tTer_adjust
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(total_length)  
                                fp.write("\n")  


                    if len(seq) > 82 or len(seq) == 82:
                        if position < half_window:
                            # N_terminal_pad
                            dummy_to_be_filled = half_window - position
                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            N_terminal_truncated_motif = seq[0:position+half_window+1]
                            nTer_adjust = ''.join(select)
                            motif = nTer_adjust+N_terminal_truncated_motif
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")

                        if C_terminal_calculation < half_window:
                            # C_terminal_pad
                            C_terminal_motif = seq[position-40:]
                            dummy_to_be_filled = window_size - len(C_terminal_motif)

                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            cTer_adjust = ''.join(select)
                            motif = C_terminal_motif+cTer_adjust
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")


                        if len(seq[int(position)-half_window:int(position)+half_window+1]) == window_size:
                            motif = seq[int(position)-40:int(position)+41]
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")
                            
for i in range(len(Test_PID_positive)):
    pid = Test_PID_positive[i]
    position = Test_Position_positive[i]
    try:
        open_file_extract_window(pid,position)
    except:
        pass

In [113]:
Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins = set()
with open("GlacNAC_81_window_INDEPENDENT_TEST_Positive.fasta") as fp:
    for line in fp:
        if line.startswith(">"):
            pass
        else:
            x = line.strip("\n")
            Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins.add(x)

In [114]:
len(Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)

1055

In [115]:
list(Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)[:5]

['GFGTALGAGQASLFGNNQPKIGGPLGTGAFGAPGFNTTTATLGFGAPQAPVALTDPNASAAQQAVLQQHINSLTYSPFGDS',
 'EAGAEWAGDKGGGWAPHHGHPGGQAGRNCGFQGTEARAFASTGLESGASGRGSYYSTGAPSGQTELSQERQNLFTGYFRSL',
 '----MFQVPDSEGGRAGSRAMKPPGGESSNLFGSPEEATPSSRPNRMASNIFGPTEEPQNIPKRTNPPGGKGSGIFDESTP',
 'EDSFLGQTSIHTSAPQTFSYFSQVSSSSDPFGNIGQSPLTTAATSVGQSGFPKPLTALPFTTGSQDVSNAFSPSISKAQPG',
 'FVFGQSSNPVSSSAFGNTAESSTSQSLLFSQDSKLATTSSTGTAVTPFVFGPGASSNNTTTSGFGFGATTTSSSAGSSFVF']

# Extract the 81 Negative Window of the Independent Test Dataset from Testing Proteins

In [81]:
len(Test_PID_Negative), len(Test_Position_Negative)

(27031, 27031)

In [83]:
Test_PID_Negative[:5]

['Q8TC56', 'Q9Y4B4', 'O94880', 'Q01813', 'Q14157']

In [85]:
Test_PID_Negative[-5:]

['Q8TC56', 'O00507', 'O15417', 'Q9UGU0', 'Q8IVE3']

In [86]:
Test_Position_Negative[:5]

[77, 49, 905, 91, 553]

In [87]:
Test_Position_Negative[-5:]

[326, 615, 2561, 1830, 318]

In [116]:
import os 
os.chdir("/homes/t326h379/Nucleocytoplasmic_183_Test_Proteins")

import re
from Bio import SeqIO

import os 
window_size = 81

def open_file_extract_window(pid,position):
    position = int(position)-1
    fasta_file = pid+".fasta"
    with open("GlacNAC_81_window_INDEPENDENT_TEST_Negative.fasta","a+") as fp:
        for seq_record in SeqIO.parse(fasta_file,"fasta"):
            placeholder = seq_record.id.split("|")[1]  
            seq = str(seq_record.seq)
            if placeholder == pid:
                position = int(position)
                if (seq[position] == "S" or seq[position] == "T"):
                    half_window = window_size // 2

                    C_terminal_calculation = len(seq) - position
                   
                   
                    if len(seq) > 41 and len(seq) < 81:
                        check_length_of_t_terminal = seq[position+1:]
                        check_length_of_N_terminal = seq[:position]

                        
                        if len(check_length_of_t_terminal) < 41 and len(check_length_of_N_terminal) < 41:
                            half_window_size = 40
                            n_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_N_terminal)
                            fillerr=[]            
                            for x in range(half_window_size):
                                dummy="-"
                                fillerr.append(dummy)
                            select = fillerr[0:int(n_terminal_dummy_to_be_filled)]
                            nTer_adjust = ''.join(select)
                            N_Terminal_needed_sequence = nTer_adjust+check_length_of_N_terminal
                            
                            t_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_t_terminal)
                            t_fill = []
                            for x in range(half_window_size):
                                dummy="-"
                                t_fill.append(dummy)
                            t_select = t_fill[0:int(t_terminal_dummy_to_be_filled)]
                            tTer_adjust = ''.join(t_select)
                            TT_Terminal_needed_sequence = check_length_of_t_terminal+tTer_adjust
                            
                            final_window = N_Terminal_needed_sequence + seq[position] + TT_Terminal_needed_sequence
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(final_window)
                            fp.write("\n")                           

                        else:
                            if position < 41:
                                n_terminal_sequence = seq[:position+1]
                                p_terminal_sequence = seq[position+1:41+position]
                                needed_sequence = n_terminal_sequence+p_terminal_sequence
                                dummy_to_be_filled = window_size - len(needed_sequence)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                nTer_adjust = ''.join(select)
                                motif = nTer_adjust +needed_sequence
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(motif)
                                fp.write("\n")  

                            if position > 41 or position == 41:
                                index_of_half_window_sequence = position-40
                                n_terminal = seq[index_of_half_window_sequence:position]
                                needed_sequence_we_need = n_terminal+seq[position:]

                                dummy_to_be_filled = window_size - len(needed_sequence_we_need)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                tTer_adjust = ''.join(select)
                                total_length = needed_sequence_we_need+tTer_adjust
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(total_length)  
                                fp.write("\n")  


                    if len(seq) > 82 or len(seq) == 82:
                        if position < half_window:
                            # N_terminal_pad
                            dummy_to_be_filled = half_window - position
                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            N_terminal_truncated_motif = seq[0:position+half_window+1]
                            nTer_adjust = ''.join(select)
                            motif = nTer_adjust+N_terminal_truncated_motif
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")

                        if C_terminal_calculation < half_window:
                            # C_terminal_pad
                            C_terminal_motif = seq[position-40:]
                            dummy_to_be_filled = window_size - len(C_terminal_motif)

                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            cTer_adjust = ''.join(select)
                            motif = C_terminal_motif+cTer_adjust
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")


                        if len(seq[int(position)-half_window:int(position)+half_window+1]) == window_size:
                            motif = seq[int(position)-40:int(position)+41]
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")
                            
for i in range(len(Test_PID_Negative)):
    pid = Test_PID_Negative[i]
    position = Test_Position_Negative[i]
    try:
        open_file_extract_window(pid,position)
    except:
        pass

In [117]:
Testing_Independent_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins = set()
with open("GlacNAC_81_window_INDEPENDENT_TEST_Negative.fasta") as fp:
    for line in fp:
        if line.startswith(">"):
            pass
        else:
            x = line.strip("\n")
            Testing_Independent_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins.add(x)

In [118]:
list(Testing_Independent_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins)[:5]

['SGGEERLASHNLFREEEQCDLPKISQLDGVDDGTESDTSVTATTRKSSQIPKRNGKENGTENLKIDRPEDAGEKEHVTKSS',
 'TTLTAPQPPQVPPTQQVPPSQSQQQAQTLVVQPMLQSSPLSLPPDAAPKPPIPIQSKPPVAPIKPPQLGAAKMSAAQQPPP',
 'GGDGATKYITKSVTVTQKVEEHEETFEEKLVSTKKVEKVTSHAIVKEVTQSD-----------------------------',
 'LTVVDKPDPPAGTPCASDIRSSSLTLSWYGSSYDGGSAVQSYSIEIWDSANKTWKELATCRSTSFNVQDLLPDHEYKFRVR',
 'KHKQLNESIIVALFQGQFKSTVQCLTCHKKSRTFEAFMYLSLPLASTSKCTLQDCLRLFSKEEKLTDNNRFYCSHCRARRD']

# Lets check the peptide similarity within the test postive and test negative dataset

In [119]:
Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins.intersection(Testing_Independent_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins)

set()

# Hurray there is no peptide similarity with the test dataset

In [106]:
len(set(Training_PID_positive))

1638

In [107]:
len(set(Training_PID_Negative))

1638

In [110]:
total_1638_Training_Protein = list(set(Training_PID_positive).union(set(Training_PID_Negative)))

In [111]:
import os
os.chdir("/homes/t326h379/Nucleocytoplasmic_1638_Training_Protein")
with open("Nucleocytoplasmic_1638_Training_Protein.txt","a+") as Subash:
    for value in total_1638_Training_Protein:
        download_link = "https://rest.uniprot.org/uniprotkb/"+value+".fasta"
        Subash.write(download_link+"\n")    

# Extract the 81 Positive Window of the Training Dataset from Training Proteins

In [124]:
import os 
os.chdir("/homes/t326h379/Nucleocytoplasmic_1638_Training_Protein")

import re
from Bio import SeqIO

import os 
window_size = 81

def open_file_extract_window(pid,position):
    position = int(position)-1
    fasta_file = pid+".fasta"
    with open("GlacNAC_81_window_Training_Positive.fasta","a+") as fp:
        for seq_record in SeqIO.parse(fasta_file,"fasta"):
            placeholder = seq_record.id.split("|")[1]  
            seq = str(seq_record.seq)
            if placeholder == pid:
                position = int(position)
                if (seq[position] == "S" or seq[position] == "T"):
                    half_window = window_size // 2

                    C_terminal_calculation = len(seq) - position
                   
                   
                    if len(seq) > 41 and len(seq) < 81:
                        check_length_of_t_terminal = seq[position+1:]
                        check_length_of_N_terminal = seq[:position]

                        
                        if len(check_length_of_t_terminal) < 41 and len(check_length_of_N_terminal) < 41:
                            half_window_size = 40
                            n_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_N_terminal)
                            fillerr=[]            
                            for x in range(half_window_size):
                                dummy="-"
                                fillerr.append(dummy)
                            select = fillerr[0:int(n_terminal_dummy_to_be_filled)]
                            nTer_adjust = ''.join(select)
                            N_Terminal_needed_sequence = nTer_adjust+check_length_of_N_terminal
                            
                            t_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_t_terminal)
                            t_fill = []
                            for x in range(half_window_size):
                                dummy="-"
                                t_fill.append(dummy)
                            t_select = t_fill[0:int(t_terminal_dummy_to_be_filled)]
                            tTer_adjust = ''.join(t_select)
                            TT_Terminal_needed_sequence = check_length_of_t_terminal+tTer_adjust
                            
                            final_window = N_Terminal_needed_sequence + seq[position] + TT_Terminal_needed_sequence
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(final_window)
                            fp.write("\n")                           

                        else:
                            if position < 41:
                                n_terminal_sequence = seq[:position+1]
                                p_terminal_sequence = seq[position+1:41+position]
                                needed_sequence = n_terminal_sequence+p_terminal_sequence
                                dummy_to_be_filled = window_size - len(needed_sequence)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                nTer_adjust = ''.join(select)
                                motif = nTer_adjust +needed_sequence
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(motif)
                                fp.write("\n")  

                            if position > 41 or position == 41:
                                index_of_half_window_sequence = position-40
                                n_terminal = seq[index_of_half_window_sequence:position]
                                needed_sequence_we_need = n_terminal+seq[position:]

                                dummy_to_be_filled = window_size - len(needed_sequence_we_need)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                tTer_adjust = ''.join(select)
                                total_length = needed_sequence_we_need+tTer_adjust
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(total_length)  
                                fp.write("\n")  


                    if len(seq) > 82 or len(seq) == 82:
                        if position < half_window:
                            # N_terminal_pad
                            dummy_to_be_filled = half_window - position
                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            N_terminal_truncated_motif = seq[0:position+half_window+1]
                            nTer_adjust = ''.join(select)
                            motif = nTer_adjust+N_terminal_truncated_motif
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")

                        if C_terminal_calculation < half_window:
                            # C_terminal_pad
                            C_terminal_motif = seq[position-40:]
                            dummy_to_be_filled = window_size - len(C_terminal_motif)

                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            cTer_adjust = ''.join(select)
                            motif = C_terminal_motif+cTer_adjust
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")


                        if len(seq[int(position)-half_window:int(position)+half_window+1]) == window_size:
                            motif = seq[int(position)-40:int(position)+41]
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")
                            
for i in range(len(Training_PID_positive)):
    pid = Training_PID_positive[i]
    position = Training_Position_position[i]
    try:
        open_file_extract_window(pid,position)
    except:
        pass

In [120]:
len(Training_PID_positive), len(Training_Position_position)

(5724, 5724)

In [122]:
Training_PID_positive[:5]

['Q9UQE7', 'Q8IXF0', 'Q8N684', 'Q9Y6V0', 'Q14980']

In [123]:
Training_Position_position[:5]

[387, 817, 348, 2682, 2093]

In [125]:
Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins = set()
with open("GlacNAC_81_window_Training_Positive.fasta") as fp:
    for line in fp:
        if line.startswith(">"):
            pass
        else:
            x = line.strip("\n")
            Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins.add(x)

In [127]:
len(Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)

5707

In [129]:
list(Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)[:5]

['SCSPQKAQEANKARPSAWEPAAGNSPARASVPAAPNPAATSATSVHVRSPARPSESRLAPTPTEGKVRPRVTNSSPMGWSS',
 'NLKAFVPAKRPIAARPSPGGVFTQFVMSKVGALQQKIPGVSTPQTLAGTQKFSIRPSPVMVVTPVVSSEPVQVCSPVTAAV',
 'PLVNDDIHAPGTSKSSLSDSLVCISEKNLPGHSKNTPLAMSDVGKVHKKDNEINIGKIELIPSMLETGKTNKKDAELNILK',
 'HLAYRMNMCSAGVARKAAEEVTLQTGIKRFVAGALGPTNKTLSVSPSVERPDYRNITFDELVEAYQEQAKGLLDGGVDILL',
 'PDDTYRLSRLLLRLPALRLMNATITEELFFKGLIGNIRIDSVIPHILKMEPADYNSQIIGHSI------------------']

# Extract the 81 Negative Window of the Training Dataset from Training Proteins

In [130]:
len(Training_PID_Negative), len(Training_Position_Negative)

(232286, 232286)

In [131]:
Training_PID_Negative[:5]

['Q6ZUM4', 'Q9ULT8', 'Q15648', 'Q96JG9', 'P31629']

In [132]:
Training_Position_Negative[:5]

[273, 2572, 1440, 3055, 2298]

In [133]:
import os 
os.chdir("/homes/t326h379/Nucleocytoplasmic_1638_Training_Protein")

import re
from Bio import SeqIO

import os 
window_size = 81

def open_file_extract_window(pid,position):
    position = int(position)-1
    fasta_file = pid+".fasta"
    with open("GlacNAC_81_window_Training_Negative.fasta","a+") as fp:
        for seq_record in SeqIO.parse(fasta_file,"fasta"):
            placeholder = seq_record.id.split("|")[1]  
            seq = str(seq_record.seq)
            if placeholder == pid:
                position = int(position)
                if (seq[position] == "S" or seq[position] == "T"):
                    half_window = window_size // 2

                    C_terminal_calculation = len(seq) - position
                   
                   
                    if len(seq) > 41 and len(seq) < 81:
                        check_length_of_t_terminal = seq[position+1:]
                        check_length_of_N_terminal = seq[:position]

                        
                        if len(check_length_of_t_terminal) < 41 and len(check_length_of_N_terminal) < 41:
                            half_window_size = 40
                            n_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_N_terminal)
                            fillerr=[]            
                            for x in range(half_window_size):
                                dummy="-"
                                fillerr.append(dummy)
                            select = fillerr[0:int(n_terminal_dummy_to_be_filled)]
                            nTer_adjust = ''.join(select)
                            N_Terminal_needed_sequence = nTer_adjust+check_length_of_N_terminal
                            
                            t_terminal_dummy_to_be_filled = half_window_size-len(check_length_of_t_terminal)
                            t_fill = []
                            for x in range(half_window_size):
                                dummy="-"
                                t_fill.append(dummy)
                            t_select = t_fill[0:int(t_terminal_dummy_to_be_filled)]
                            tTer_adjust = ''.join(t_select)
                            TT_Terminal_needed_sequence = check_length_of_t_terminal+tTer_adjust
                            
                            final_window = N_Terminal_needed_sequence + seq[position] + TT_Terminal_needed_sequence
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(final_window)
                            fp.write("\n")                           

                        else:
                            if position < 41:
                                n_terminal_sequence = seq[:position+1]
                                p_terminal_sequence = seq[position+1:41+position]
                                needed_sequence = n_terminal_sequence+p_terminal_sequence
                                dummy_to_be_filled = window_size - len(needed_sequence)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                nTer_adjust = ''.join(select)
                                motif = nTer_adjust +needed_sequence
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(motif)
                                fp.write("\n")  

                            if position > 41 or position == 41:
                                index_of_half_window_sequence = position-40
                                n_terminal = seq[index_of_half_window_sequence:position]
                                needed_sequence_we_need = n_terminal+seq[position:]

                                dummy_to_be_filled = window_size - len(needed_sequence_we_need)

                                fill=[]            
                                for x in range(window_size):
                                    dummy="-"
                                    fill.append(dummy)
                                select = fill[0:int(dummy_to_be_filled)]
                                tTer_adjust = ''.join(select)
                                total_length = needed_sequence_we_need+tTer_adjust
                                fp.write(">")
                                fp.write(pid)
                                fp.write("|")
                                fp.write(str(position))
                                fp.write("\n")
                                fp.write(total_length)  
                                fp.write("\n")  


                    if len(seq) > 82 or len(seq) == 82:
                        if position < half_window:
                            # N_terminal_pad
                            dummy_to_be_filled = half_window - position
                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            N_terminal_truncated_motif = seq[0:position+half_window+1]
                            nTer_adjust = ''.join(select)
                            motif = nTer_adjust+N_terminal_truncated_motif
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")

                        if C_terminal_calculation < half_window:
                            # C_terminal_pad
                            C_terminal_motif = seq[position-40:]
                            dummy_to_be_filled = window_size - len(C_terminal_motif)

                            fill=[]            
                            for x in range(window_size):
                                dummy="-"
                                fill.append(dummy)
                            select = fill[0:int(dummy_to_be_filled)]
                            cTer_adjust = ''.join(select)
                            motif = C_terminal_motif+cTer_adjust
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")


                        if len(seq[int(position)-half_window:int(position)+half_window+1]) == window_size:
                            motif = seq[int(position)-40:int(position)+41]
                            fp.write(">")
                            fp.write(pid)
                            fp.write("|")
                            fp.write(str(position))
                            fp.write("\n")
                            fp.write(motif)
                            fp.write("\n")
                            
for i in range(len(Training_PID_Negative)):
    pid = Training_PID_Negative[i]
    position = Training_Position_Negative[i]
    try:
        open_file_extract_window(pid,position)
    except:
        pass

In [142]:
Training_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins = set()
with open("GlacNAC_81_window_Training_Negative.fasta") as fp:
    for line in fp:
        if line.startswith(">"):
            pass
        else:
            x = line.strip("\n")
            Training_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins.add(x)

In [143]:
len(Training_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins)

230275

# Lets check the peptide identity between the independent test positive and negative dataset

In [148]:
Training_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins.intersection(Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)

set()

# Hurray (Oompa-Loompa) there are no peptide similarity withing test dataset

In [145]:
Total_Training_Site = Training_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins.union(Training_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins)

In [146]:
Total_Independent_Testing_Site = Testing_Independent_Positive_81_window_Peptide_of_Nucleocytoplasmic_proteins.union(Testing_Independent_Negative_81_window_Peptide_of_Nucleocytoplasmic_proteins)

# Lets check the peptide similarity between the independent test and training peptides

In [149]:
Total_Training_Site.intersection(Total_Independent_Testing_Site)

set()

# No peptides are similar between the independent and training dataset