In [1]:
import pandas as pd
import ast
import numpy as np

### Train/valid/test split and encoding for physicochemical features

In [2]:
# Load the physicochemical feature table and show it.
fe_path = "../dataset/physiochemical/output_results.csv"
fe_df = pd.read_csv(fe_path)
fe_df

Unnamed: 0,Original,AAC,DC,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index
0,MQAKILRIATRKSPLAICQACYVCNKLKHYHPHIQTELIPIITTGD...,"[0.0707395498392283, 0.02572347266881029, 0.04...","[0.0, 0.0032258064516129032, 0.003225806451612...",35011.6433,9.364049,-0.041479,0.051447,38.157235,116.720257,0.359100
1,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,"[0.05517241379310345, 0.02413793103448276, 0.0...","[0.0034602076124567475, 0.0, 0.003460207612456...",32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586
2,MKKELIIGTRSSPLALWQAEFTKAELSRHFPELNITLKLVKTTGDV...,"[0.07051282051282051, 0.01282051282051282, 0.0...","[0.0, 0.0, 0.003215434083601286, 0.02250803858...",34393.4749,6.042860,-0.122436,0.044872,40.237821,101.923077,0.262981
3,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,"[0.09324104234527687, 0.023615635179153095, 0....","[0.010792099368764, 0.0030543677458766036, 0.0...",529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615
4,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,"[0.07840297889946214, 0.02399669011170873, 0.0...","[0.008069522036002483, 0.0028967515001034555, ...",527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119
...,...,...,...,...,...,...,...,...,...,...
110217,MVRTRLAISVVLVSTLLLLNVKAKSVDPYKVLGVSKDAKQREIQKA...,"[0.050699300699300696, 0.008741258741258742, 0...","[0.0035026269702276708, 0.0, 0.0, 0.0070052539...",62569.2650,9.518709,-0.490559,0.089161,38.246521,72.237762,0.166154
110218,MPKAPKQQPPEPEWIGDGESTSPSDKVVKKGKKDKKIKKTFFEELA...,"[0.07558859975216853, 0.009913258983890954, 0....","[0.009925558312655087, 0.0012406947890818859, ...",91651.1398,7.208401,-0.883643,0.055762,47.106072,74.361834,0.030607
110219,MTDPHTARTIVGIVGNVISFGLFCAPIPTMVKIWKMKSVSEFKPDP...,"[0.05, 0.020833333333333332, 0.033333333333333...","[0.0041841004184100415, 0.0041841004184100415,...",27209.3824,8.883244,0.681667,0.137500,34.603792,111.583333,0.752250
110220,MAVPASPQHPRGYGILLLTLLLKALATTASACNHLRPQDATFSHDS...,"[0.07772020725388601, 0.031088082901554404, 0....","[0.0, 0.010416666666666666, 0.0052083333333333...",22115.7405,9.055116,-0.532124,0.067358,65.206218,81.502591,0.197876


In [3]:
# Read the three ec40 splits; each one is enriched with features in the cells below.
train_path = "../dataset/ec40/train.csv"
valid_path = "../dataset/ec40/valid.csv"
test_path = "../dataset/ec40/test.csv"

train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

In [4]:
# Attach the physicochemical descriptors to each split, matching the split's
# 'sequence' column against the feature table's 'Original' column.
#
# A left merge emits one output row per matching right-hand row, so a sequence that
# appears more than once in fe_df would silently duplicate rows of the split.
# De-duplicate the feature table on its key first.
# NOTE(review): this presumes repeated sequences carry identical descriptors
# (i.e. the features depend on the sequence only) — confirm against the feature script.
fe_unique = fe_df.drop_duplicates(subset='Original', keep='first')

fe_train = fe_unique[fe_unique['Original'].isin(train_df['sequence'])]
fe_test  = fe_unique[fe_unique['Original'].isin(test_df['sequence'])]
fe_valid = fe_unique[fe_unique['Original'].isin(valid_df['sequence'])]

# validate='many_to_one' makes pandas raise MergeError if the right-hand key is ever
# non-unique, so the row count of each split is guaranteed to be preserved.
train_df = train_df.merge(fe_train, left_on='sequence', right_on='Original',
                          how='left', validate='many_to_one')
test_df  = test_df.merge(fe_test, left_on='sequence', right_on='Original',
                         how='left', validate='many_to_one')
valid_df = valid_df.merge(fe_valid, left_on='sequence', right_on='Original',
                          how='left', validate='many_to_one')

In [5]:
def encode_ec_vector(x):
    """Turn an EC-number annotation into a list of numeric components.

    Handled inputs:

    * a plain EC string such as ``"2.7.11.1"`` or ``"3.4.-.-"``;
    * the text of a Python list, e.g. ``"['2.7.11.1', '3.1.3.16']"`` — only the
      first entry is encoded;
    * the text of a quoted string, e.g. ``"'2.7.11.1'"``.

    Each dot-separated component becomes an ``int`` when possible, otherwise a
    ``float``; ``'-'``, empty, and unparseable components (e.g. ``'n1'``) become ``-1``.

    Parameters
    ----------
    x : object
        The raw annotation. Non-string values (e.g. ``None`` or NaN) are not
        encoded.

    Returns
    -------
    list
        The encoded components, or ``[]`` when ``x`` is not (and does not
        unwrap to) a string. The text of an empty list (``"[]"``) yields
        ``[-1]``, as it always has.
    """
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (the normal case for "1.2.3.4"): use the text as is.
            parsed = x

        if isinstance(parsed, list):
            # Only the first annotation of a list is encoded; an empty list leaves
            # the raw text in place.
            if parsed:
                x = parsed[0]
        elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
            # A short EC such as "3.4" or "7" is also a valid numeric literal.
            # Keep the original text so it is split into components below instead
            # of being discarded as a non-string.
            pass
        else:
            x = parsed

    parts = x.split('.') if isinstance(x, str) else []

    vec = []
    for part in parts:
        part = part.strip()
        if part in ('-', ''):
            vec.append(-1)
            continue
        try:
            vec.append(int(part))
        except ValueError:
            try:
                vec.append(float(part))
            except ValueError:
                # Non-numeric component (e.g. a preliminary "n1" serial number).
                vec.append(-1)
    return vec

def convert_str_to_list(x):
    """Parse the text of a Python literal (e.g. ``"[0.1, 0.2]"``) into its value.

    Non-string inputs are returned untouched. Strings that cannot be parsed by
    ``ast.literal_eval`` are returned unchanged as well.
    """
    if not isinstance(x, str):
        return x
    try:
        value = ast.literal_eval(x)
    except Exception:
        value = x
    return value

def expand_column(df, column_names, drop_original=False):
    """Spread list-valued columns into one scalar column per list position.

    For every name in ``column_names`` the cell values are expanded with
    ``pd.Series`` and appended to the right of the frame as ``<name>_<i>``
    columns, where ``i`` is the label produced by the expansion.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_names : iterable of str
        Columns to expand, processed in order.
    drop_original : bool, default False
        When true, each source column is removed after it has been expanded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the expanded columns appended.
    """
    result = df
    for name in column_names:
        wide = result[name].apply(pd.Series)
        wide = wide.rename(columns={label: f"{name}_{label}" for label in wide.columns})
        result = pd.concat([result, wide], axis=1)
        result = result.drop(columns=[name]) if drop_original else result
    return result

In [6]:
# Replace the raw 'ec' annotation of every split with its encoded component list.
for frame in (train_df, valid_df, test_df):
    frame['ec'] = frame['ec'].apply(encode_ec_vector)

In [7]:
# AAC and DC are read from CSV as text; parse them back into Python lists.
cols_to_convert = ['AAC', 'DC']
for col in cols_to_convert:
    for frame in (train_df, valid_df, test_df):
        frame[col] = frame[col].apply(convert_str_to_list)

In [8]:
# Spread the list-valued columns into scalar columns and drop the list originals.
list_valued_columns = ['ec', 'AAC', 'DC']
train_df = expand_column(train_df, list_valued_columns, drop_original=True)
valid_df = expand_column(valid_df, list_valued_columns, drop_original=True)
test_df = expand_column(test_df, list_valued_columns, drop_original=True)

### Add HMM Features

In [9]:
def merge_hmm(df, flag="train"):
    """Left-join the HMM feature file of one split onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to enrich; it is joined through its 'accession' column.
    flag : str, default "train"
        Split name used to build the file name ``<flag>_features.csv``.

    Returns
    -------
    pandas.DataFrame
        The merged frame with every missing value replaced by 0.

    Notes
    -----
    ``fillna(0)`` is applied to the whole merged frame, so NaNs in columns that
    do not come from the HMM file are zero-filled too.
    """
    hmm_features = pd.read_csv(f"../dataset/HMM/{flag}_features.csv")
    merged = df.merge(hmm_features, how="left", left_on='accession', right_on='query_name')
    return merged.fillna(0)

# Join each split with the HMM feature file that carries the same split name.
train_df = merge_hmm(train_df, flag="train")
valid_df = merge_hmm(valid_df, flag="valid")
test_df = merge_hmm(test_df, flag="test")

In [10]:
# Display the training frame after the HMM features were merged in.
train_df

Unnamed: 0,accession,sequence,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative,Original,...,DC_395,DC_396,DC_397,DC_398,DC_399,query_name,E-value,score,coverage,num_domains
0,A4XK06,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_171115,False,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,...,0.000000,0.000000,0.000000,0.000000,0.000000,A4XK06,2.900000e-71,27.0,0.920690,2.0
1,Q9VR91,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_134383,True,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,...,0.001425,0.001833,0.001629,0.000204,0.000407,Q9VR91,3.000000e-03,717.7,0.476181,57.0
2,O95714,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_42431,False,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,...,0.001862,0.002069,0.000621,0.000207,0.000207,O95714,4.200000e-02,9.5,0.650393,89.0
3,O66129,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_79215,True,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,...,0.000000,0.003086,0.000000,0.000000,0.000000,O66129,1.400000e-04,227.6,1.064615,4.0
4,Q68CP4,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_263145,True,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,...,0.001511,0.000000,0.003021,0.003021,0.001511,Q68CP4,8.400000e-03,30.3,0.506787,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,Q5UPX0,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_55008,True,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,...,0.000000,0.000000,0.003656,0.000000,0.003656,Q5UPX0,1.200000e-05,81.3,1.312044,9.0
45756,Q5UR49,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_514766,True,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,...,0.001441,0.000000,0.001441,0.000000,0.002882,Q5UR49,6.900000e-04,92.2,2.138129,21.0
45757,P64636,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_36634,True,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,...,0.000000,0.000000,0.000000,0.004525,0.000000,P64636,8.100000e-04,77.9,2.729730,6.0
45758,P94559,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_533178,True,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,...,0.000000,0.000000,0.000000,0.000000,0.005952,P94559,1.100000e-01,37.6,1.928994,3.0


### Split and Add peptides features

In [11]:
# Load the peptide descriptor table and discard the 'Unnamed: 0' column.
pp_df = pd.read_csv("../dataset/peptides/seq_feats.csv").drop(columns=['Unnamed: 0'])

In [12]:
# Turn the text in 'data' into one column per descriptor.
#
# SECURITY NOTE(review): eval() executes arbitrary Python found in the CSV. It is kept
# because the stored text is evaluated with `np` in scope; only run this on a trusted
# file, or re-export the features in a format that does not need eval.
parsed_data = pp_df['data'].apply(lambda s: eval(s, {"np": np}))

# Each parsed value is expected to be a list; its first element is taken as the
# descriptor dict. Anything else (or an empty list) contributes no descriptors.
first_entries = parsed_data.apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else {})
data_expanded = first_entries.apply(pd.Series)

pp_df = (
    pd.concat([pp_df.assign(data=parsed_data, data_dict=first_entries), data_expanded], axis=1)
    .drop(columns=['data', 'data_dict'])
    # Keep a single row per accession.
    .drop_duplicates(subset='accession', keep='first')
)
pp_df

Unnamed: 0,accession,AF1,AF2,AF3,AF4,AF5,BLOSUM1,BLOSUM2,BLOSUM3,BLOSUM4,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,Q7VRM4,-0.029281,-0.096226,-0.116617,0.355304,0.013183,0.046045,-0.311190,-0.251190,0.042540,...,0.136302,0.420418,-0.278006,0.169357,-0.133891,-0.090675,-0.511543,-0.505498,-0.362797,0.305659
1,A4XK06,0.031880,-0.077574,-0.360039,0.377882,-0.194147,0.144586,-0.305724,-0.199793,0.021276,...,0.198414,0.413690,-0.335655,0.229759,-0.193793,0.191931,-0.526621,-0.395103,-0.531103,0.289207
2,Q8KCJ4,0.000600,-0.117652,-0.161189,0.459098,-0.060263,0.138526,-0.330513,-0.174231,-0.037244,...,0.197628,0.442051,-0.355417,0.176282,-0.223397,0.107724,-0.648397,-0.419199,-0.496090,0.283910
3,Q9VR91,-0.069875,0.008929,-0.553377,0.458163,-0.248112,0.162258,-0.265145,-0.061735,-0.036521,...,0.087494,0.466179,-0.437199,0.225794,-0.309963,0.170827,-0.700737,-0.225900,-0.406464,0.306325
4,O95714,-0.041783,-0.000918,-0.451391,0.421223,-0.238567,0.159524,-0.267429,-0.080556,-0.024731,...,0.096173,0.443808,-0.421132,0.215689,-0.279313,0.188678,-0.649019,-0.242199,-0.456235,0.270306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110203,P40228,0.089087,0.136184,-0.281678,0.276177,-0.164250,0.240139,-0.140592,-0.078293,0.070732,...,0.076341,0.334077,-0.452683,0.289652,-0.232997,0.455889,-0.314216,-0.060453,-0.533693,0.187631
110205,Q8L9X2,0.072175,-0.008675,-0.057239,0.289849,0.056188,0.135665,-0.173498,-0.205961,-0.010049,...,0.164532,0.336847,-0.316897,0.236305,-0.160148,0.185961,-0.255172,-0.447783,-0.264335,0.182217
110206,Q54AX5,0.011130,-0.030923,-0.312671,0.409070,-0.216800,0.128902,-0.324157,-0.250333,-0.015706,...,0.170039,0.402863,-0.299765,0.151412,-0.159196,-0.161608,-0.485118,-0.383902,-0.661588,0.472235
110218,Q7YR37,0.277529,-0.177994,-0.164657,0.327573,-0.084777,0.372825,-0.186022,-0.241475,-0.139554,...,0.232639,0.335440,-0.380099,0.272131,-0.207881,0.557708,-0.319343,-0.476357,-0.481772,0.315167


In [None]:
# Select the peptide descriptors belonging to each split, then left-join them
# onto the split through the shared 'accession' column.
in_train = pp_df['accession'].isin(train_df['accession'])
in_test = pp_df['accession'].isin(test_df['accession'])
in_valid = pp_df['accession'].isin(valid_df['accession'])
pp_train, pp_test, pp_valid = pp_df[in_train], pp_df[in_test], pp_df[in_valid]

train_df = train_df.merge(pp_train, how='left', left_on='accession', right_on='accession')
test_df = test_df.merge(pp_test, how='left', left_on='accession', right_on='accession')
valid_df = valid_df.merge(pp_valid, how='left', left_on='accession', right_on='accession')

In [14]:
# Display the training frame after the peptide descriptors were merged in.
train_df

Unnamed: 0,accession,sequence,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative,Original,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,A4XK06,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_171115,False,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,...,0.198414,0.413690,-0.335655,0.229759,-0.193793,0.191931,-0.526621,-0.395103,-0.531103,0.289207
1,Q9VR91,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_134383,True,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,...,0.087494,0.466179,-0.437199,0.225794,-0.309963,0.170827,-0.700737,-0.225900,-0.406464,0.306325
2,O95714,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_42431,False,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,...,0.096173,0.443808,-0.421132,0.215689,-0.279313,0.188678,-0.649019,-0.242199,-0.456235,0.270306
3,O66129,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_79215,True,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,...,0.117631,0.360800,-0.354215,0.253908,-0.135877,0.168492,-0.332492,-0.314738,-0.447969,0.251938
4,Q68CP4,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_263145,True,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,...,0.070995,0.430256,-0.347949,0.126863,-0.318069,-0.412036,-0.599759,-0.247285,-0.336094,0.256787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,Q5UPX0,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_55008,True,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,...,0.168960,0.359106,-0.272701,0.183175,-0.148960,-0.005420,-0.330237,-0.383011,-0.402555,0.230420
45756,Q5UR49,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_514766,True,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,...,0.129986,0.275353,-0.283281,0.204619,-0.114173,-0.096835,-0.121640,-0.361914,-0.448230,0.190950
45757,P64636,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_36634,True,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,...,0.076937,0.366396,-0.373874,0.182883,-0.177703,-0.019144,-0.322297,-0.230586,-0.340180,0.257568
45758,P94559,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_533178,True,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,...,0.130237,0.415325,-0.423136,0.232604,-0.183432,0.242249,-0.566036,-0.371479,-0.476272,0.235207


In [17]:
# Display the validation frame after all feature merges.
# NOTE(review): the recorded output lists accession C1A4M2 in rows 0 and 1 with
# identical values — check whether valid.csv itself or one of the merges
# introduces duplicated rows.
valid_df

Unnamed: 0,accession,sequence,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative,Original,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,C1A4M2,MLRIALPNKGRLSEDTRGLFNDAGLEVRSSGERALTASLGGEFEAI...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_502236,False,MLRIALPNKGRLSEDTRGLFNDAGLEVRSSGERALTASLGGEFEAI...,...,0.116918,0.479007,-0.552808,0.095582,-0.325000,0.130890,-0.852603,-0.435274,-0.500411,0.331062
1,C1A4M2,MLRIALPNKGRLSEDTRGLFNDAGLEVRSSGERALTASLGGEFEAI...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_502236,False,MLRIALPNKGRLSEDTRGLFNDAGLEVRSSGERALTASLGGEFEAI...,...,0.116918,0.479007,-0.552808,0.095582,-0.325000,0.130890,-0.852603,-0.435274,-0.500411,0.331062
2,Q8TYD5,MITVAVPNKGRLHEPALKLLERAGIGVEEPLGRRLKARTTDPDIEV...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_502236,False,MITVAVPNKGRLHEPALKLLERAGIGVEEPLGRRLKARTTDPDIEV...,...,0.179470,0.451625,-0.449682,0.053251,-0.240389,0.025512,-0.820141,-0.472120,-0.679011,0.272792
3,Q47XB7,MIDKLAREELVDMVPYQSARRLFASGDNEQANSRTWLNANEAPGQG...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_281669,False,MIDKLAREELVDMVPYQSARRLFASGDNEQANSRTWLNANEAPGQG...,...,0.121848,0.411875,-0.420707,0.168533,-0.171549,0.052582,-0.478940,-0.260299,-0.609891,0.362745
4,Q9PBC6,MNTQTPTVLDLVRQELRNFAGYSSARSVALTGDLWLNANESAWPNP...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_281669,False,MNTQTPTVLDLVRQELRNFAGYSSARSVALTGDLWLNANESAWPNP...,...,0.058438,0.483288,-0.496329,-0.013260,-0.312575,-0.180521,-0.785178,-0.285836,-0.434164,0.445918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,P34248,MDKYLNSFVDHLSEWSSRAFRNNSSSANQSASNKELEQVFEQINAI...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_514601,True,MDKYLNSFVDHLSEWSSRAFRNNSSSANQSASNKELEQVFEQINAI...,...,0.054855,0.331857,-0.318348,0.171891,-0.159046,-0.183237,-0.222692,-0.175656,-0.458416,0.287683
7307,P54954,MITVKNIRKAFKDLVVLDGIDLEVKRGEVVAIIGPSGSGKSTLLRC...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_467526,False,MITVKNIRKAFKDLVVLDGIDLEVKRGEVVAIIGPSGSGKSTLLRC...,...,0.203012,0.423655,-0.343976,0.164297,-0.188273,0.102008,-0.601365,-0.577430,-0.459920,0.329398
7308,O34349,MYKPVSLFLFFLILAAAIHTNAVQSADEAISKAAVLIRQPWLNEVM...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_257221,True,MYKPVSLFLFFLILAAAIHTNAVQSADEAISKAAVLIRQPWLNEVM...,...,0.121133,0.414926,-0.212217,0.053054,-0.322857,-0.824433,-0.586946,-0.400985,-0.163596,0.231872
7309,Q1MR19,MKYVAIDYGTKYTGIAVSDSMGVFAFPKQSIIMTTQKEFFIKLVEL...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_18204,False,MKYVAIDYGTKYTGIAVSDSMGVFAFPKQSIIMTTQKEFFIKLVEL...,...,0.214437,0.361761,-0.248662,0.107887,-0.150352,-0.306408,-0.462746,-0.580563,-0.485423,0.244366


In [18]:
# Display the test frame after all feature merges.
test_df

Unnamed: 0,accession,sequence,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative,Original,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,Q8EB91,MAHYFVGDVQGCFAELQRLLAKVDFNPSRDELWAVGDLVARGPDSL...,1,0.0,Hydrolases,2017_03,1,cdhit40.fasta_599,False,MAHYFVGDVQGCFAELQRLLAKVDFNPSRDELWAVGDLVARGPDSL...,...,0.102920,0.368759,-0.313102,0.202117,-0.203796,-0.022920,-0.276241,-0.239307,-0.242518,0.316460
1,Q9Y2R2,MDQREILQKFLDEAQSKKITKEEFANEFLKLKRQSTKYKADKTYPT...,1,0.0,Hydrolases,2017_03,1,cdhit40.fasta_194622,True,MDQREILQKFLDEAQSKKITKEEFANEFLKLKRQSTKYKADKTYPT...,...,0.063408,0.319343,-0.317175,0.249963,-0.212317,0.268625,-0.227323,-0.147893,-0.443395,0.297014
2,Q4P9E5,MSAEPSVQTTSSSGPTELRSAPSYAGSWTKLTPPLTPWVVSLLSDL...,1,0.0,Hydrolases,2017_03,1,cdhit40.fasta_437483,True,MSAEPSVQTTSSSGPTELRSAPSYAGSWTKLTPPLTPWVVSLLSDL...,...,0.123859,0.376480,-0.463116,0.228175,-0.214016,0.467106,-0.379492,-0.339961,-0.427940,0.385724
3,Q588V7,MDSDSSKSRIDQFYVSKKRKHQSPNLKSGRNEKNVKVTGERSPGDK...,1,0.0,Hydrolases,2017_03,1,cdhit40.fasta_346428,True,MDSDSSKSRIDQFYVSKKRKHQSPNLKSGRNEKNVKVTGERSPGDK...,...,0.134485,0.415506,-0.382841,0.253542,-0.222396,0.314410,-0.512233,-0.296551,-0.444916,0.276314
4,E1QCT1,MKNKKDKTQKPKVIDEKFVAFFKSLNIEPQNWQFYEDAFVHSSYVN...,1,0.0,Hydrolases,2017_03,1,cdhit40.fasta_196657,True,MKNKKDKTQKPKVIDEKFVAFFKSLNIEPQNWQFYEDAFVHSSYVN...,...,0.182128,0.293227,-0.300319,0.229468,-0.207092,0.093582,-0.132660,-0.386525,-0.403404,0.329645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8176,Q54944,MANIVNFTDKQFENRLNDNLEELIQGKKAVESPTAFLLGGQPGSGK...,1,0.0,Transferases,2017_03,1,cdhit40.fasta_18384,False,MANIVNFTDKQFENRLNDNLEELIQGKKAVESPTAFLLGGQPGSGK...,...,0.142404,0.337909,-0.354948,0.150523,-0.198990,0.193275,-0.420557,-0.446620,-0.538467,0.331672
8177,Q9T080,MGGLKFHVLMYPWFATGHMTPFLFLANKLAEKGHTVTFLIPKKALK...,1,0.0,Transferases,2017_03,1,cdhit40.fasta_405900,False,MGGLKFHVLMYPWFATGHMTPFLFLANKLAEKGHTVTFLIPKKALK...,...,0.088176,0.395890,-0.341055,0.128967,-0.271890,-0.118242,-0.585451,-0.290110,-0.442681,0.228242
8178,Q03VR7,MAQTIDIANPTRTQAILNEYGLRAKKKFGQNFLTDLNVLHNIVEAA...,1,0.0,Transferases,2017_03,1,cdhit40.fasta_32179,False,MAQTIDIANPTRTQAILNEYGLRAKKKFGQNFLTDLNVLHNIVEAA...,...,0.132136,0.413119,-0.429593,0.063322,-0.198339,-0.094915,-0.586983,-0.475424,-0.561797,0.392169
8179,A0LH38,MSKLVPPHGKEKKLKPLLLEGAALAAEKEKAKTLKVVPMTSREASD...,1,0.0,Transferases,2017_03,1,cdhit40.fasta_437049,False,MSKLVPPHGKEKKLKPLLLEGAALAAEKEKAKTLKVVPMTSREASD...,...,0.160098,0.413195,-0.332390,0.176780,-0.311854,0.073000,-0.556951,-0.317585,-0.295122,0.225878


### Drop unnecessary columns and save all features

In [19]:
# Identifier / bookkeeping columns that are removed before the features are saved.
drop_columns = [
    'Original',
    'traintest',
    'negative_for',
    'mainclass_set',
    'sprot_version',
    'len',
    'cluster_ID',
    'representative',
    'sequence',
    'accession',
    'query_name',
]
train_df = train_df.drop(labels=drop_columns, axis="columns")
valid_df = valid_df.drop(labels=drop_columns, axis="columns")
test_df = test_df.drop(labels=drop_columns, axis="columns")

In [20]:
# Write the final feature tables, one CSV per split, without the index column.
for frame, out_path in (
    (train_df, "../dataset/all_features/train.csv"),
    (test_df, "../dataset/all_features/test.csv"),
    (valid_df, "../dataset/all_features/valid.csv"),
):
    frame.to_csv(out_path, index=False)