In [1]:
import pandas as pd
from Bio import SeqIO

In [2]:
df_nesg = pd.read_csv("../../raw_data/DSRESsol/nesg.csv")
df_nesg

Unnamed: 0,id,exp,sol
0,AR3338B,5.0,0
1,AR3347A,0.0,0
2,AR3353C,4.0,3
3,AR3354C,0.0,0
4,AR3358A,5.0,2
...,...,...,...
9698,ZR94,0.0,3
9699,ZR95,4.0,0
9700,ZR96,5.0,1
9701,ZR97,2.0,1


In [4]:
matrix_data = []

for record in SeqIO.parse("../../raw_data/DSRESsol/nesg.fasta", "fasta"):
    id_seq = record.id
    sequence = str(record.seq)
    
    row = [
        id_seq,
        sequence    
    ]

    matrix_data.append(row)

fasta_nesg = pd.DataFrame(data=matrix_data, columns=["id_seq", "sequence"])

In [5]:
fasta_nesg.head(5)

Unnamed: 0,id_seq,sequence
0,AR3338B,MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...
1,AR3347A,IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...
2,AR3353C,MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...
3,AR3354C,ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...
4,AR3358A,MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...


In [6]:
fasta_nesg = fasta_nesg.merge(right=df_nesg, left_on="id_seq", right_on="id")
fasta_nesg.head(5)

Unnamed: 0,id_seq,sequence,id,exp,sol
0,AR3338B,MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...,AR3338B,5.0,0
1,AR3347A,IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...,AR3347A,0.0,0
2,AR3353C,MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...,AR3353C,4.0,3
3,AR3354C,ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...,AR3354C,0.0,0
4,AR3358A,MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...,AR3358A,5.0,2


In [7]:
fasta_nesg["sol"].value_counts()

0    3806
5    2407
3    1183
4     926
2     857
1     524
Name: sol, dtype: int64

In [13]:
fasta_nesg["sol"] = fasta_nesg["sol"].replace({1:0, 2:0, 3:1, 4:1, 5:1})
fasta_nesg["sol"].value_counts()

0    5187
1    4516
Name: sol, dtype: int64

In [14]:
fasta_nesg.head(5)

Unnamed: 0,id_seq,sequence,id,exp,sol
0,AR3338B,MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...,AR3338B,5.0,0
1,AR3347A,IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...,AR3347A,0.0,0
2,AR3353C,MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...,AR3353C,4.0,1
3,AR3354C,ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...,AR3354C,0.0,0
4,AR3358A,MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...,AR3358A,5.0,0


In [15]:
fasta_nesg = fasta_nesg[["sequence", "sol"]]
fasta_nesg.columns = ["sequence", "response"]

In [9]:
df_nsseq = pd.read_csv("../../raw_data/DSRESsol/nsseq.csv")
df_nsseq.head(5)

Unnamed: 0,Seq,solubility
0,MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...,0
1,IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...,0
2,MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...,1
3,ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...,0
4,MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...,1


In [10]:
df_nsseq["solubility"].value_counts()

0    1865
1    1864
Name: solubility, dtype: int64

In [16]:
df_nsseq.columns = ["sequence", "response"]

In [17]:
df_concat = pd.concat([fasta_nesg, df_nsseq], axis=0)
df_concat.head(5)

Unnamed: 0,sequence,response
0,MSVHKLTDLRDNSTNWKINVKILSIWNHPPNSHGEITTMILHDDKN...,0
1,IPFDYIVEKTVSTGVLVDVIGALLEVGNLTEDYRGLKLPFKIMDQY...,0
2,MEEERRDDYKFLRIQDAFKALHLHVNLIGVIVELGFSNGSDCSCTL...,1
3,ALLRRFIGQKVRTVIQVTGSEIGSVVGKSTDDLQIVVRGSSPPSPL...,0
4,MAASFAFLRDVRPYKTSWRVQVKVLHSWCQYTNMTGETLKLVLVNS...,0


In [18]:
df_concat["response"].value_counts()

0    7052
1    6380
Name: response, dtype: int64

In [35]:
df_concat.shape

(13432, 3)

In [19]:
df_concat["id_seq"] = [i for i in range(len(df_concat))]

In [20]:
checking_duplicated = df_concat.groupby(by=["sequence", "response"]).count().sort_values(by="id_seq", ascending=False)

In [21]:
sequences_no_duplicated = checking_duplicated[checking_duplicated["id_seq"] == 1]
sequences_duplicated = checking_duplicated[checking_duplicated["id_seq"] > 1]

In [22]:
sequences_duplicated.shape

(3329, 1)

In [23]:
sequences_duplicated

Unnamed: 0_level_0,Unnamed: 1_level_0,id_seq
sequence,response,Unnamed: 2_level_1
MIVLILVFRLVIGEQMIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQLFLWRKQYQEGSLTAVAAGEQVVPASELAAAMKQIKELQRLLGKKTMENELLKEAVEYGRAKKWIAHAPLLPGDGE,0,8
MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQLFLWRKQYQEGSLTAVAAGEQVVPASELAAAMKQIKELQRLLGKKTMENELLKEAVEYGRAKKWIAHAPLLPGDGE,1,6
MPVIKVRENEPFDVALRRFKRSCEKAGVLAEVRRREFYEKPTTERKRAKASAVKRHAKKLARENARRTRLY,1,6
MNTLELSARVLECGAMRHTPAGLPALELLLVHESEVVEAGHPRRVELTISAVALGDLALLLADTPLGTEMQVQGFLAPARKDSVKVKLHLQQARRIAGSMGRDPLVG,1,6
MFIEERDYRIKPGKAGLFVATYEAHGLALQKKYLGRFLGYFMSEIGELNHVVAWWAYDSLDERDAARSRMLADPQWQAYLERVTDLIDLQQTRILKPVSFSPIQ,1,6
...,...,...
EDATVRSGVFATNRGGALVRIEGLRGFIPGSHISTRKPKEELVGEELPLKFLEVDEERNRLVLSHRRA,0,2
GHCFGCGEEQSHGLHLAARAGQGVSITAEFTVQPAHQGAPGLAHGGVLATALDETLGSLNWLLRTIAVTGRLETDFVRPVPVGTVLYLEAEVTAVAGRKIYSTATGRIGGPEGPVAVRADALFVEVKVDHFTDNGRQEEIRAAMNDPDQLRRARAFEVNP,0,2
EDRRRLALIICNTKFDHLPARNGAHYDIVGMKRLLQGLGYTVVDEKNLTARDMESVLRAFAARPEHKSSDSTFLVLMSHGILEGICGTAHKKKKPDVLLYDTIFQIFNNRNCLSLKDKPKVIIVQACRGEKHGELWVRDSPASLAVISSQSSENLEADSVCKIHEEKDFIAFCSSTPHNVSWRDRTRGSIFITELITCFQKYSCCCHLMEIFRKVQKSFEVPQAKAQMPTIERATLTRDFYLFPGN,1,2
MFKLKINNFQLGFKLVQWPVTNHLLKHFYVFPINNKGGLAIKRIISLALFKKRLNKDKINNCHVWEEELPDGSYDMGFNGNFNHMEKRKSGYVTQKQFSEFKDANNQRLIKIETTLAIQGEQINKLTQTVEKQGEQINQLVQVVLLQGASKLENFKWSKKHKDKSLMPAWIVWKIFWWKV,0,2


In [25]:
duplicated_values = []

for index in sequences_duplicated.index:
    duplicated_values.append(index[0])

df_duplicated = pd.DataFrame()
df_duplicated["sequence"] = duplicated_values
df_duplicated["index"] = [i for i in range(len(df_duplicated))]

In [26]:
grouped_seq_duplicated = df_duplicated.groupby(by="sequence").count()

In [32]:
ignored_sequences = grouped_seq_duplicated[grouped_seq_duplicated["index"]>1]
sequences_to_ignore = []

for index in ignored_sequences.index:
    sequences_to_ignore.append(index)

sequences_to_ignore
    

['MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQLFLWRKQYQEGSLTAVAAGEQVVPASELAAAMKQIKELQRLLGKKTMENELLKEAVEYGRAKKWIAHAPLLPGDGE',
 'MITDEEIRKVIAPLLLSGAKMLDKHCPKCGSPLFEKDGRVFCPVCEYREKQKKEMVKGVEERLMEKLTQLANSLPDDIDELEKHLRVMEKIIEVLEKYKKLEGRR',
 'MPDMSQLKELPFSTLQFYATAPYPCSYLPGRQARSQVAAPGHLINAGTYSQLVEQGFRRSGLFTYRPHCDNCHACVPVRVDAARFEPNRTQRRAWRSHQALRAFVAELAWSPEHYDLYTRYQQGRHPGGGMDEDSRTQYAQFLLTTRVNTRLVEFRAPQGQLAMISIIDVLDDGLSSVYTFYDPDMAGSLGTYSILWQIEQCRTLDLPWLYLGYWIADSRKMAYKANFRPLQMHVDGAWRETPP',
 'MRFSIVSLLHRLFLAGALALAAGAQAAPDPMGPPDKFVLDAATQTIDILKSDQNVKAGNLAHINQVVDAHILPFVNFQKTTRLAAGRYWRQATDTQKAELAKAFRGTLVRTYSGALTKVDNGTTIKLLPFRGDPNADDVVVRSLISQSNSQPVQVDYRLEKTPQGWRIYDMNVEGIWLIENYRNQFAQQINQNGIDGLIQALNQRNQ',
 'MTAVVNIAAYKFVSIANPADLREPMLEQAGQRQLKGTVLLAPEGINLFLAGAADAIEGFLRWLRADARFADLQAKYSESARMPFRKLLVKVKREIIRMDHPAIRPEAGRAPAVDAATLRRWLAQGRDDQGRELVMLDTRNAFEVEVGTFRGALDWRIERFTQFPQAVRDNQAALAGKTVVSFCTGGIRCEKAAIYMAEAGIEHVYQLEGGILKYFEETDGAGFDGACFVFDERVALDAALAPQA']

In [30]:
df_concat[df_concat["sequence"] == "MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQLFLWRKQYQEGSLTAVAAGEQVVPASELAAAMKQIKELQRLLGKKTMENELLKEAVEYGRAKKWIAHAPLLPGDGE"]

Unnamed: 0,sequence,response,id_seq
2539,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2539
2540,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2540
2545,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2545
2547,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2547
2549,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2549
2551,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,0,2551
1298,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,1,11001
1299,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,1,11002
1304,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,1,11007
1306,MIDVLGPEKRRRRTTQEKIAIVQQSFEPGMTVSLVARQHGVAASQL...,1,11009


In [33]:
matrix_data_sequences = []

for index in sequences_no_duplicated.index:
    row = [
        index[0],
        index[1]
    ]
    matrix_data_sequences.append(row)

for index in sequences_duplicated.index:

    if index[0] not in sequences_to_ignore:
        row = [
            index[0],
            index[1]
        ]
        matrix_data_sequences.append(row)

df_to_export = pd.DataFrame(data=matrix_data_sequences, columns=["sequence", "response"])

In [34]:
df_to_export.shape

(9941, 2)

In [36]:
df_to_export["response"].value_counts()

0    5104
1    4837
Name: response, dtype: int64

In [37]:
df_to_export.to_csv("../../process_data/DSRESsol_export_data.csv", index=False)