In [75]:
import pandas as pd
import os
import numpy as np

In [76]:
canonical_residues = ['A','C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K', 
    'L', 'M', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [77]:
def check_canonical_residues(sequence):
    is_canon=True

    for residue in sequence:
        if residue not in canonical_residues:
            is_canon=False
            break
    
    return is_canon

In [78]:
processed_files = os.listdir("../../process_data/categorical/")

In [79]:
list_df = []

for element in processed_files:
    df_data = pd.read_csv(f"../../process_data/categorical/{element}")
    df_data["source"] = element
    
    if "id_seq" not in df_data.columns.tolist():
        df_data["id_seq"] = [i for i in range(len(df_data))]
    
    df_data = df_data[["id_seq", "sequence", "response", "source"]]
    list_df.append(df_data)

full_df = pd.concat(list_df, axis=0)
full_df.shape

(356088, 4)

In [80]:
full_df.reset_index(inplace=True)

In [81]:
unique_sequences = full_df["sequence"].unique()
len(unique_sequences)

164262

In [82]:
grouped_sequences = full_df.groupby(by=["sequence"]).count()

In [83]:
grouped_sequences.shape

(164262, 4)

In [84]:
grouped_sequences_only_one_response = grouped_sequences[grouped_sequences["id_seq"] == 1]
grouped_sequences_more_than_one_response = grouped_sequences[grouped_sequences["id_seq"] != 1]

In [85]:
grouped_sequences_more_than_one_response.shape

(91075, 4)

In [86]:
df_sequences_unique_response = pd.DataFrame()
df_sequences_unique_response["sequence"] = grouped_sequences_only_one_response.index.tolist()
df_sequences_unique_response = df_sequences_unique_response.merge(right=full_df, on="sequence")
df_sequences_unique_response = df_sequences_unique_response[["sequence", "response"]]
df_sequences_unique_response["response"].value_counts()

1    39348
0    33839
Name: response, dtype: int64

In [87]:
df_sequences_unique_response.head(5)

Unnamed: 0,sequence,response
0,AAAAAARAGKVRGQTPKVAKQEKKKKKTGRAKRRMQYNRRFVNVKG...,1
1,AAAAANHSTQESGFDYEGLIDSELQKKRLDKSYRYFNNINRLAKEF...,1
2,AAADGDDSLYPIAVLIDELRNEDVQLRLNSIKKLSTIALALGVERT...,1
3,AAAEIETIVRESEANRIQAQTWFSHPEKSKVSFRYDERETSSIRSI...,1
4,AAAFQEEFTVREDLMGLAIGTHGANIQQARKVPGVTAIELGEETCT...,1


In [88]:
process_sequences = []
sequences_ignored = []

for index in grouped_sequences_more_than_one_response.index:

    filter_doc = full_df[full_df["sequence"] == index]
    responses = filter_doc["response"].unique()

    if len(responses) == 1:
        row = [
            index,
            responses[0]
        ]
        process_sequences.append(row)
    else:
        sequences_ignored.append(index)

df_valid_sequences = pd.DataFrame(data=process_sequences, columns=["sequence", "response"])
df_sequences_to_check = pd.DataFrame()
df_sequences_to_check["sequence"] = sequences_ignored

In [89]:
df_valid_sequences.shape

(79566, 2)

In [90]:
df_sequences_to_check.shape

(11509, 1)

In [91]:
df_processed = pd.concat([df_valid_sequences, df_sequences_unique_response], axis=0)
df_processed.reset_index(inplace=True)
df_processed = df_processed.drop(columns=["index"])
df_processed.head(5)

Unnamed: 0,sequence,response
0,AAAAAAMIVQRVVLNSRPGKNGNPVAENFRMEEVYLPDNINEGQVQ...,1
1,AAAAAAMTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAI...,1
2,AAADLAVGQVVAQREIRLDAQAFAQFAALTGDAHPIHYDAGYVQRQ...,0
3,AAADSTSMSYSVTLTGPGPWGFRLQGGKDFNMPLTISRITPGSKAA...,1
4,AAAIAGAKLRKVSRMEDTSFPSGGNAIGVNSASSKTDTGRGNGPLP...,1


In [92]:
df_processed["response"].value_counts()

1    77344
0    75409
Name: response, dtype: int64

In [93]:
df_processed["sequence"].unique().shape

(152753,)

In [94]:
df_sequences_to_check.head(5)

Unnamed: 0,sequence
0,AAAKPADKALQAQLPEMALPREAAPRLDGLAAVLPQIHPAEAGKPA...
1,AAAPAVENKPVNPAKAKKEDRRNRDDEGQGRNAKGKGGKGGRDRNN...
2,AACQDEDHTVPTLPSGNDPEINDPVVEFYDWEKNRTELLTSTDMVL...
3,AACQGKPLPNTAMHSDAPTEWSFNFFTPKALPAVVTFAAILDVDGN...
4,AACSSEDDSSPEVNPENAAITFELSAVNGLTDGIGTRMPVYSQEAT...


In [95]:
full_df[full_df["sequence"] == df_sequences_to_check["sequence"][4569]]

Unnamed: 0,index,id_seq,sequence,response,source
23780,23780,23780,MISVDTMTNASSTEGISLHHISVSDLDNNCYLLTTESDEGTQGLLI...,0,deep_sol_export_data.csv
124541,30724,Psol_train_21781,MISVDTMTNASSTEGISLHHISVSDLDNNCYLLTTESDEGTQGLLI...,0,EPSOL_exported_data.csv
232230,52456,52456,MISVDTMTNASSTEGISLHHISVSDLDNNCYLLTTESDEGTQGLLI...,1,PLM_sol_export_data.csv
302889,18222,18222,MISVDTMTNASSTEGISLHHISVSDLDNNCYLLTTESDEGTQGLLI...,0,PaRSnIP_export_data.csv


In [96]:
length_sequences = []

for index in df_processed.index:
    length_sequences.append(len(df_processed["sequence"][index]))

df_processed["length_sequence"] = length_sequences

In [97]:
length_sequences = []

for index in df_sequences_to_check.index:
    length_sequences.append(len(df_sequences_to_check["sequence"][index]))

df_sequences_to_check["length_sequence"] = length_sequences

In [98]:
df_sequences_to_check.head(5)

Unnamed: 0,sequence,length_sequence
0,AAAKPADKALQAQLPEMALPREAAPRLDGLAAVLPQIHPAEAGKPA...,149
1,AAAPAVENKPVNPAKAKKEDRRNRDDEGQGRNAKGKGGKGGRDRNN...,163
2,AACQDEDHTVPTLPSGNDPEINDPVVEFYDWEKNRTELLTSTDMVL...,376
3,AACQGKPLPNTAMHSDAPTEWSFNFFTPKALPAVVTFAAILDVDGN...,204
4,AACSSEDDSSPEVNPENAAITFELSAVNGLTDGIGTRMPVYSQEAT...,346


In [100]:
df_processed.head(5)

Unnamed: 0,sequence,response,length_sequence
0,AAAAAAMIVQRVVLNSRPGKNGNPVAENFRMEEVYLPDNINEGQVQ...,1,357
1,AAAAAAMTMMDMNFKYCHKIMKKHSKSFSYAFDLLPEDQRKAVWAI...,1,293
2,AAADLAVGQVVAQREIRLDAQAFAQFAALTGDAHPIHYDAGYVQRQ...,0,151
3,AAADSTSMSYSVTLTGPGPWGFRLQGGKDFNMPLTISRITPGSKAA...,1,734
4,AAAIAGAKLRKVSRMEDTSFPSGGNAIGVNSASSKTDTGRGNGPLP...,1,178


In [101]:
df_processed_filter = df_processed[df_processed["length_sequence"]<1200]
df_processed_filter.reset_index(inplace=True)
df_processed_filter = df_processed_filter.drop(columns=["index"])
df_processed_filter.shape

(151864, 3)

In [102]:
df_processed_filter_to_check = df_sequences_to_check[df_sequences_to_check["length_sequence"]<1200]
df_processed_filter_to_check.reset_index(inplace=True)
df_processed_filter_to_check = df_processed_filter_to_check.drop(columns=["index"])
df_processed_filter_to_check.shape

(11450, 2)

In [103]:
df_processed_filter["is_canon_seq"] = [check_canonical_residues(sequence) for sequence in df_processed_filter["sequence"]]

In [104]:
df_processed_filter["is_canon_seq"].value_counts()

True     151355
False       509
Name: is_canon_seq, dtype: int64

In [110]:
df_processed_filter = df_processed_filter[df_processed_filter["is_canon_seq"]==True]
df_processed_filter.to_csv("../../datasets_for_training/binary_processed_dataset.csv", index=False)

In [105]:
df_processed_filter_to_check["is_canon_seq"] = [check_canonical_residues(sequence) for sequence in df_processed_filter_to_check["sequence"]]

In [106]:
df_processed_filter_to_check["is_canon_seq"].value_counts()

True    11450
Name: is_canon_seq, dtype: int64

In [107]:
df_processed_filter_to_check.head(5)

Unnamed: 0,sequence,length_sequence,is_canon_seq
0,AAAKPADKALQAQLPEMALPREAAPRLDGLAAVLPQIHPAEAGKPA...,149,True
1,AAAPAVENKPVNPAKAKKEDRRNRDDEGQGRNAKGKGGKGGRDRNN...,163,True
2,AACQDEDHTVPTLPSGNDPEINDPVVEFYDWEKNRTELLTSTDMVL...,376,True
3,AACQGKPLPNTAMHSDAPTEWSFNFFTPKALPAVVTFAAILDVDGN...,204,True
4,AACSSEDDSSPEVNPENAAITFELSAVNGLTDGIGTRMPVYSQEAT...,346,True


In [108]:
df_processed_filter_to_check.to_csv("../../datasets_for_training/sequences_with_multiple_categories.csv", index=False)