# Pre-process .seq file
The protein library is stored in a .seq file containing names and sequences. This code converts it into a CSV file suitable for our project.

In [15]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import pandas as pd

In [16]:
# Parse every FASTA record once, pairing the ID (the first whitespace-delimited
# word of the header line) with its sequence, then split the pairs into two
# index-aligned lists.
with open('uniprot_human.seq') as seq_file:
    records = [
        (header.split(None, 1)[0], residues)
        for header, residues in SimpleFastaParser(seq_file)
    ]
identifiers = [record_id for record_id, _ in records]
sequences = [residues for _, residues in records]

In [17]:
# Assemble the parsed records into a two-column table: ID and Sequence.
# (The table is written to disk in a later cell.)
df = pd.DataFrame({'ID': identifiers, 'Sequence': sequences})

In [18]:
# Sanity check: print the table dimensions as (rows, columns), then preview
# the first rows. df.head() must stay the last expression so that the
# notebook renders it.
print(df.shape)
df.head()

(20117, 2)


Unnamed: 0,ID,Sequence
0,P00441,MATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGF...
1,Q13046,MGPLSAPPCTQHITWKGLLLTASLLNFWNPPTTAQVTIEAQPPKVS...
2,P0C868,MVVDLKNLLFNPSKPVSRGSQPADVDLMIDCLVSCLRVSPHNNQQF...
3,Q96G97,MVNDPPVPALLWAQEVGQVLAGRARRLLLQFGVLFCTILLLLWVSV...
4,Q9UBN7,MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN...


In [19]:
# Persist the ID/Sequence table. With pandas' default index=True the row
# index is also written, as an unnamed first column.
df.to_csv("uniprot_human.csv")

# Pre-process .txt file
Convert the protein interaction .txt files into CSV files suitable for the analysis pipeline.

In [26]:
# Each data line is whitespace-separated: <label> <ID1> <ID2>.
with open('human_balanced_test.txt') as txt_file:
    # str.split() with no arguments already ignores leading/trailing
    # whitespace. Blank lines (e.g. a trailing newline at end of file) split
    # to an empty list and are skipped, so the column extraction below does
    # not raise IndexError on them.
    content = [fields for fields in (line.split() for line in txt_file) if fields]

ID1 = [fields[1] for fields in content]
ID2 = [fields[2] for fields in content]
interactions = [fields[0] for fields in content]

In [27]:
# Build the interaction table, then recode the textual labels as integers
# (Positive -> 1, Negative -> 0). Any other label value is left untouched.
df = pd.DataFrame({'ID1': ID1, 'ID2': ID2, 'Interaction': interactions})
for label, code in (('Positive', 1), ('Negative', 0)):
    matches_label = df['Interaction'] == label
    df.loc[matches_label, 'Interaction'] = code

In [28]:
# Sanity check: print the table dimensions as (rows, columns), then preview
# the first rows. df.head() must stay the last expression so that the
# notebook renders it.
print(df.shape)
df.head()

(2776, 3)


Unnamed: 0,ID1,ID2,Interaction
0,Q9Y574,Q13617,1
1,O95999,P55211,1
2,P30305,O14965,1
3,Q9BXS5,P01833,1
4,P08581,P21554,1


In [29]:
# Save the test split. The row index is written as the first column; the
# later pd.read_csv("test_balanced.csv", index_col = 0) call reads it back
# as the index.
df.to_csv("test_balanced.csv")

In [30]:
# Each data line is whitespace-separated: <label> <ID1> <ID2>.
with open('human_balanced_train.txt') as txt_file:
    # str.split() with no arguments already ignores leading/trailing
    # whitespace. Blank lines (e.g. a trailing newline at end of file) split
    # to an empty list and are skipped, so the column extraction below does
    # not raise IndexError on them.
    content = [fields for fields in (line.split() for line in txt_file) if fields]

ID1 = [fields[1] for fields in content]
ID2 = [fields[2] for fields in content]
interactions = [fields[0] for fields in content]

In [31]:
# Build the interaction table, then recode the textual labels as integers
# (Positive -> 1, Negative -> 0). Any other label value is left untouched.
df = pd.DataFrame({'ID1': ID1, 'ID2': ID2, 'Interaction': interactions})
for label, code in (('Positive', 1), ('Negative', 0)):
    matches_label = df['Interaction'] == label
    df.loc[matches_label, 'Interaction'] = code

In [32]:
# Sanity check: print the table dimensions as (rows, columns), then preview
# the first rows. df.head() must stay the last expression so that the
# notebook renders it.
print(df.shape)
df.head()

(24998, 3)


Unnamed: 0,ID1,ID2,Interaction
0,O43172,O43290,1
1,Q9NWS9,P17028,1
2,Q9Y230,Q86XP3,1
3,Q96RL1,Q13564,1
4,O75400,P10586,1


In [33]:
# Save the training split. The row index is written as the first column; the
# later pd.read_csv("train_balanced.csv", index_col = 0) call reads it back
# as the index.
df.to_csv("train_balanced.csv")

In [37]:
# Join the two data splits into one table for convenience.
df1 = pd.read_csv("train_balanced.csv", index_col=0)
df2 = pd.read_csv("test_balanced.csv", index_col=0)
# ignore_index=True renumbers the rows 0..n-1. Without it each split keeps
# its own 0-based labels, so the combined frame (and any CSV written from it)
# would carry duplicate index values.
concat = pd.concat([df1, df2], ignore_index=True)
# Last expression of the cell: the notebook displays the combined dimensions.
concat.shape

(27774, 3)

In [38]:
# Write the combined train + test interaction table to a single CSV file.
concat.to_csv("balanced_interactions.csv")