In [1]:
import pandas as pd
from Bio import SeqIO

In [2]:
def fasta_to_csv(name_doc):
    matrix_data = []

    for record in SeqIO.parse(name_doc, "fasta"):
        id_seq = record.id
        class_seq = int(record.description.split(" ")[-1][-1])
        sequence = str(record.seq)
        
        row = [
            id_seq,
            sequence,
            class_seq
        ]

        matrix_data.append(row)

    df_process = pd.DataFrame(data=matrix_data, columns=["id_seq", "sequence", "response"])
    return df_process

In [3]:
name_train = "../../raw_data/PLM_sol/Train_dataset.fasta"
name_validation = "../../raw_data/PLM_sol/validation_dataset.fasta"
name_test = "../../raw_data/PLM_sol/test_dataset.fasta"

df_train = fasta_to_csv(name_train)
df_validation = fasta_to_csv(name_validation)
df_test = fasta_to_csv(name_validation)

In [4]:
df_train["response"].value_counts()

1    43291
0    28053
Name: response, dtype: int64

In [5]:
df_validation["response"].value_counts()

0    2000
1    2000
Name: response, dtype: int64

In [6]:
df_test["response"].value_counts()

0    2000
1    2000
Name: response, dtype: int64

In [7]:
df_concat = pd.concat([df_train, df_validation, df_test], axis=0)
df_concat

Unnamed: 0,id_seq,sequence,response
0,JCSG-368743,MFAQVHYPESLSGEELDEYLFKGWFRNGQRIFTTNFLNFDQKIYSA...,0
1,SECSG-T14G10.4,MRAPVTFLILCSLALLCSSANNCAWFVGQLQCTDTSKLENVVVEIW...,0
2,NESG-StR174,MQRNLSHIISQATSAPLLLEPAYARVFFCALGRESGINSLHIPGNN...,0
3,NESG-RpR95,MNMLRSLSIAAAACCLFVALPARAQDVPPPGIVVRGEAQKSVAPDT...,0
4,SGPP-Pfal007925AAA,MKYSELFYCFSIWEEAYKYDDMVKGKNKTVGRTSNKFEKVEYNNNN...,0
...,...,...,...
3995,MCSG-APC67223.1,AGSVLEIKDTEILKELGSTIGVEIANLHQALNSVNNNNELVKRDLY...,1
3996,PDB_230301_9167,EVHLVESGGGLVQPGRSLRLSCAASGFTFDDYTMHWVRQAPGKGLE...,1
3997,EFI-507412,SEITLNYAGPADLDTYGRYRPGDRPGPRYPFQGRIADGTGDFPAES...,1
3998,SSGCID-TrcrA.01530.a,MFRRCLRLCDFSVLLATFEPFELYGSKDTGICEMVVNNKKNSLNLL...,1


In [8]:
df_concat["response"].value_counts()

1    47291
0    32053
Name: response, dtype: int64

In [9]:
checking_duplicated = df_concat.groupby(by=["sequence", "response"]).count().sort_values(by="id_seq", ascending=False)

In [10]:
checking_duplicated[checking_duplicated["id_seq"] == 1].shape

(69014, 1)

In [11]:
sequences_no_duplicated = checking_duplicated[checking_duplicated["id_seq"] == 1]
sequences_duplicated = checking_duplicated[checking_duplicated["id_seq"] > 1]

In [13]:
sequences_duplicated.shape

(4000, 1)

In [14]:
sequences_duplicated

Unnamed: 0_level_0,Unnamed: 1_level_0,id_seq
sequence,response,Unnamed: 2_level_1
GSDSEVNQEAKPEVKPEVKPETHINLKVSDGSSEIFFKIKKTTPLRRLMEAFAKRQGKEMDSLTFLYDGIEIQADQTPEDLDMEDNDIIEAHREQIGGENLYFQSVAHGLAWSYYIGYLRLILPELQARIRTYNQHYNNLLRGAVSQRLYILLPLDCGVPDNLSMADPNIRFLDKLPQQTADRAGIKDRVYSNSIYELLENGQRAGTCVLEYATPLQTLFAMSQYSQAGFSREDRLEQAKLFCQTLEDILADAPESQNNCRLIAYQEPADDSSFSLSQEVLRHLRQEEKEEV,1,3
MHRGYALVVCSPGVTRTMIDIDDDLLARAAKELGTTTKKDTVHAALRAALRASAARSLMNRMAENATGTQDEALVNAMWRDGHPENTA,0,3
MHRALIASGLMSRCTLAHRYCSTKPAGDNAKLDDLATAYSQLTLREVSDLQRLIFKKLGHSDEFYEKALLRGLGGGGGAVMMAPAAAAAVAPAADAPAADALKTEKKKVEKLTYDVKLEKFAPEIKIKLIKELRTVTNLSIADAKKAVEKCPGLVATNMSKDDAEKLKELYEKLGAKVELL,0,3
GPGSMPPTRPEFEAPPDILYNADGAQRYTRSGRVQMVQRAMTRRALELLALPSGCPALLLDVGCGSGLSGEVLRECGHTWIGIDISTDMLGMAKASEELMREDDEKEDEGPIEGVDLRDVRWGPIHDENENENENDDHDSTDGDGNEAEAVLPPGPSIVEVMRGDMGEGLPFRPGAFDGAVSISAVQWLCQSDRRGQVPQRRLRALFQCLYNALRRGAKAAIQFYPANVEQLHMITRAAMLCGFSGGVVVDFPHSTKAKKHYLVVQAGQVAGGFMPPRPLSVCNEGGEEVEEEEECEEEDGEEEGEEENDEGQRFGARVGGRVAHKRSRSHGGRQAGARRRKDNRPVTGTKEWVLLKKAERRKFGLKTSEDTKYTMRRRRPRF,0,3
GPGSMPQDKRLIVTKAVVHKIQRLHGADTEAAAAEGIFIEFVVPPVVVSQCPDLLTEHVFQLPNRTYPRNGRTTCLIVPRVISRDAFKINKRHGYYDAVVTAEAICKRDAAEAVRRAALVSKTFSHFVVDERIVSKLPPCILAAVTASLPATARERDVVSSSAPGPITQTEKKVGVGKRKGAKLAEPLTTTTTTHVPALPKTSRKCITPLRNLEDRTSLTFRLSQGVLGGVVRVKNMGLLTFRVGHAGMTAGDVCENAKCFIFALKREFPTVWKYIHEFTMTSGVTEPIRFMEVHIRK,0,3
...,...,...
MTKRIMFTGGSGKAGRHVVQYLVEQGCQVLNIDTKPLDNPKVRTLITDITDSGQVFNALSSYMGLHEFDPSLRPQPVDAVVHFAAIPRIMITPDNEVFRINAMGTYNVIEAAVKLGIRKVIVASSETTYGLVFANEPRDPKYFPLDEEYDVDPMDSYALSKVVNEKTARAFAQRNGTDIYALRIGNVIEPHEYSLFPKWFADPGFRKRIAWSYIDARDLGQITLRAIEKDGLGYQVFNAANDDTSSDLPTAELLKRFYPNVPVKAALGEYETLLSNRKARDMLGFRPEHSWRKYVK,0,2
MPVYYNADKKTWYAMFYAKDYKGVNKKYKKTGFKKKKEAQEYEYEFKKKISKSVNMSFNSLYELYFEDYKKRHKPTAINTVETFFRLHILPFFDNIEISKINSYMIREWQNEMLEKKNENGKLFSENSKANIYAALKSMFNWAAKYQGLNENPCKNLGAFGSKKNRSEMKIWSVDDFNKFINLLEIKNKEKNGKYTDAIIIFKILFWTGLRIGEVLALTFDDINLKEKFIDVNKTISHINKKDYITTPKTLGSVRKVILPENLILDLKLYFSKFEKQKISKSERIFNLKKSQLRYILEKCSIQAEVEKIRLHDFRHSHASYLLFIQADITAISKRLGHDNLQTTINTYSHLYKDANKQLMKKLNTNN,0,2
HHHHHHSSGLVPRGSHMASMTGGQQMGRGSMMTNLQKEFFKRLKIPAKEITFNDLDEILLNMGMILPFENLDIMAGTIKNISKNNLVEKLLIQKRGGLCYELNSLLYYFLMDCGFQVYKVAGTVYDLYDNKWKPDDGHVIIILHHNKKDYVIDAGFASHLPLHPVPFSGEVISSQTGEYRIRKRTTQKGTHILEMRKGANGESTNFLQSEPSDEWKIGYAFTLDPIDEQKVNNIQKVIVEHKESPFNKGAITCKLTNYGHISLTNKNYTETFKGTKNKRPIESKDYARILRESFGITQVKYVGKTLERG,1,2
MGSMEFVNKQFNYKDPVNGVDIAYIKIPNAGQMQPVKAFKIHNKIWVIPERDTFTNPEEGDLNPPPEAKQVPVSYYDSTYLSTDNEKDNYLKGVTKLFERIYSTDLGRMLLTSIVRGIPFWGGSTIDTELKVIDTNCINVIQPDGSYRSEELNLVIIGPSADIIQFECKSFGHEVLNLTRNGYGSTQYIRFSPDFTFGFEESLEVDTNPLLGAGKFATDPAVTLAHQLIYAGHRLYGIAINPNRVFKVNTNAYYEMSGLEVSFEELRTFGGHDAKFIDSLQENEFRLYYYNKFKDIASTLNKAKSIVGTTASLQYMKNVFKEKYLLSEDTSGKFSVDKLKFDKLYKMLTEIYTEDNFVKFFKVLNRKTYLNFDKAVFKINIVPKVNYTIYDGFNLRNTNLAANFNGQNTEINNMNFTKLKNFTGLFEFYKLLCVDGGGGSGGGGSGGGGSEMDENLEQVSGIIGNLRHMALDMGNEIDTQNRQIDRIMEKADSNKTRIDEANQRATKMLGSGANSALAGGGGSGGGGSGGGGSLQCIKVNNWDLFFSPSEDNFTNDLNKGEEITSDTNIEAAEENISLDLIQQYYLTFNFDNEPENISIENLSSDIIGQLELMPNIERFPNGKKYELDKYTMFHYLRAQEFEHGKSRIALTNSVNEALLNPSRVYTFFSSDYVKKVNKATEAAMFLGWVEQLVYDFTDETSEVSTTDKIADITIIIPYIGPALNIGNMLYKDDFVGALIFSGAVILLEFIPEIAIPVLGTFALVSYIANKVLTVQTIDNALSKRNEKWDEVYKYIVTNWLAKVNTQIDLIRKKMKEALENQAEATKAIINYQYNQYTEEEKNNINFNIDDLSSKLNESINKAMININKFLNQCSVSYLMNSMIPYGVKRLEDFDASLKDALLKYIYDNRGTLIGQVDRLKDKVNNTLSTDIPFQLSKYVDNQRLLSTLEAHHHH,1,2


In [15]:
unique_duplicated = []

for index in sequences_duplicated.index:
    unique_duplicated.append(index[0])

unique_duplicated = list(set(unique_duplicated))
len(unique_duplicated)

4000

In [16]:
matrix_data_sequences = []

for index in sequences_no_duplicated.index:
    row = [
        index[0],
        index[1]
    ]
    matrix_data_sequences.append(row)

for index in sequences_duplicated.index:
    row = [
        index[0],
        index[1]
    ]
    matrix_data_sequences.append(row)

df_to_export = pd.DataFrame(data=matrix_data_sequences, columns=["sequence", "response"])

In [17]:
df_to_export["response"].value_counts()

1    44159
0    28855
Name: response, dtype: int64

In [18]:
df_to_export.to_csv("../../process_data/PLM_sol_export_data.csv", index=False)