In [1]:
import pandas as pd
import ast
import numpy as np

### Train/validation/test split and encoding for physicochemical features

In [2]:
# Read the per-sequence physicochemical feature table; the bare name on the
# last line renders the frame as this cell's output.
physio_features_path = "../dataset/physiochemical/output_results.csv"
fe_df = pd.read_csv(physio_features_path)
fe_df

Unnamed: 0,Original,AAC,DC,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index
0,MQAKILRIATRKSPLAICQACYVCNKLKHYHPHIQTELIPIITTGD...,"[0.0707395498392283, 0.02572347266881029, 0.04...","[0.0, 0.0032258064516129032, 0.003225806451612...",35011.6433,9.364049,-0.041479,0.051447,38.157235,116.720257,0.359100
1,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,"[0.05517241379310345, 0.02413793103448276, 0.0...","[0.0034602076124567475, 0.0, 0.003460207612456...",32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586
2,MKKELIIGTRSSPLALWQAEFTKAELSRHFPELNITLKLVKTTGDV...,"[0.07051282051282051, 0.01282051282051282, 0.0...","[0.0, 0.0, 0.003215434083601286, 0.02250803858...",34393.4749,6.042860,-0.122436,0.044872,40.237821,101.923077,0.262981
3,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,"[0.09324104234527687, 0.023615635179153095, 0....","[0.010792099368764, 0.0030543677458766036, 0.0...",529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615
4,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,"[0.07840297889946214, 0.02399669011170873, 0.0...","[0.008069522036002483, 0.0028967515001034555, ...",527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119
...,...,...,...,...,...,...,...,...,...,...
110217,MVRTRLAISVVLVSTLLLLNVKAKSVDPYKVLGVSKDAKQREIQKA...,"[0.050699300699300696, 0.008741258741258742, 0...","[0.0035026269702276708, 0.0, 0.0, 0.0070052539...",62569.2650,9.518709,-0.490559,0.089161,38.246521,72.237762,0.166154
110218,MPKAPKQQPPEPEWIGDGESTSPSDKVVKKGKKDKKIKKTFFEELA...,"[0.07558859975216853, 0.009913258983890954, 0....","[0.009925558312655087, 0.0012406947890818859, ...",91651.1398,7.208401,-0.883643,0.055762,47.106072,74.361834,0.030607
110219,MTDPHTARTIVGIVGNVISFGLFCAPIPTMVKIWKMKSVSEFKPDP...,"[0.05, 0.020833333333333332, 0.033333333333333...","[0.0041841004184100415, 0.0041841004184100415,...",27209.3824,8.883244,0.681667,0.137500,34.603792,111.583333,0.752250
110220,MAVPASPQHPRGYGILLLTLLLKALATTASACNHLRPQDATFSHDS...,"[0.07772020725388601, 0.031088082901554404, 0....","[0.0, 0.010416666666666666, 0.0052083333333333...",22115.7405,9.055116,-0.532124,0.067358,65.206218,81.502591,0.197876


In [3]:
# Load the three EC40 partitions, in train / valid / test order.
train_df, valid_df, test_df = (
    pd.read_csv(split_csv)
    for split_csv in (
        "../dataset/ec40/train.csv",
        "../dataset/ec40/valid.csv",
        "../dataset/ec40/test.csv",
    )
)

In [4]:
# A left merge emits one output row per matching right-hand row, so any
# sequence that appears more than once in fe_df would silently duplicate the
# corresponding split rows. Keep a single feature row per sequence first.
# NOTE(review): assumes repeated sequences carry identical features - confirm.
fe_unique = fe_df.drop_duplicates(subset='Original', keep='first')

fe_train = fe_unique[fe_unique['Original'].isin(train_df['sequence'])]
fe_test  = fe_unique[fe_unique['Original'].isin(test_df['sequence'])]
fe_valid = fe_unique[fe_unique['Original'].isin(valid_df['sequence'])]

# validate='many_to_one' makes pandas raise if the feature side ever stops
# being unique on its key, instead of quietly growing the split.
train_df = train_df.merge(fe_train, left_on='sequence', right_on='Original', how='left', validate='many_to_one')
test_df  = test_df.merge(fe_test, left_on='sequence', right_on='Original', how='left', validate='many_to_one')
valid_df = valid_df.merge(fe_valid, left_on='sequence', right_on='Original', how='left', validate='many_to_one')

In [5]:
def encode_ec_vector(x):
    """Convert an EC annotation into a list of numeric levels.

    Accepted inputs are a plain EC string (``"3.1.-.-"``), the string repr of
    a list of EC strings (``"['3.1.1.1', '2.7.1.1']"``), or an already parsed
    list/tuple. When several EC numbers are present only the first one is
    encoded.

    Args:
        x: EC annotation. Any value that is neither a string nor a
            list/tuple (for example NaN) is treated as "no annotation".

    Returns:
        list: one entry per dot-separated level, in order. ``'-'`` and empty
        levels, and levels that parse as neither int nor float, become
        ``-1``. An empty list is returned when there is nothing to encode.
        The result is not padded to a fixed length.
    """
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Typical EC strings such as "3.1.1.1" are not Python literals;
            # fall back to the raw text.
            parsed = x
        # Short codes such as "3" or "3.1" ARE valid numeric literals. Only
        # adopt the parsed value when it is a container or a string, so those
        # codes keep their raw text and are split on '.' below.
        if isinstance(parsed, (list, tuple, str)):
            x = parsed

    if isinstance(x, (list, tuple)):
        # Multi-label annotation: encode the first EC number only.
        x = x[0] if x else None

    if not isinstance(x, str):
        return []

    vec = []
    for part in x.split('.'):
        part = part.strip()
        if part in ('', '-'):
            vec.append(-1)
            continue
        try:
            vec.append(int(part))
        except ValueError:
            try:
                vec.append(float(part))
            except ValueError:
                # e.g. preliminary levels such as "n1"
                vec.append(-1)
    return vec

def convert_str_to_list(x):
    """Parse a string containing a Python literal; pass other values through.

    Args:
        x: any value. Only ``str`` inputs are parsed.

    Returns:
        The object produced by ``ast.literal_eval`` when ``x`` is a string
        that parses; otherwise ``x`` itself, unchanged.
    """
    if not isinstance(x, str):
        return x
    try:
        value = ast.literal_eval(x)
    except Exception:
        # Not a literal: hand the original text back untouched.
        value = x
    return value

def expand_column(df, column_names, drop_original=False):
    """Spread list-valued columns into one scalar column per position.

    Each cell of a listed column is turned into a row via ``pd.Series`` and
    the resulting columns are named ``"<column>_<position>"``.

    Args:
        df: source frame; it is not modified.
        column_names: iterable of column labels to expand, processed in order.
        drop_original: when true, the expanded source column is removed.

    Returns:
        A new DataFrame with the expanded columns appended on the right.
    """
    result = df
    for column_name in column_names:
        spread = result[column_name].apply(pd.Series)
        labels = [f"{column_name}_{position}" for position in spread.columns]
        spread.columns = labels
        result = pd.concat([result, spread], axis=1)
        if not drop_original:
            continue
        result = result.drop(columns=[column_name])
    return result

In [6]:
# Replace the raw `ec` annotation of every split with its numeric vector form.
for split_df in (train_df, valid_df, test_df):
    split_df['ec'] = split_df['ec'].apply(encode_ec_vector)

In [7]:
# AAC and DC are read from CSV as text; turn them back into Python lists.
cols_to_convert = ['AAC', 'DC']
for split_df in (train_df, valid_df, test_df):
    for col in cols_to_convert:
        split_df[col] = split_df[col].apply(convert_str_to_list)

In [8]:
# Spread the list-valued columns into per-position columns and drop the originals.
list_columns = ['ec', 'AAC', 'DC']
train_df = expand_column(train_df, list_columns, drop_original=True)
valid_df = expand_column(valid_df, list_columns, drop_original=True)
test_df = expand_column(test_df, list_columns, drop_original=True)

### Add HMM Features

In [9]:
def merge_hmm(df, flag="train"):
    """Left-join the HMM feature table of one split onto ``df``.

    Reads ``../dataset/HMM/{flag}_features.csv`` and matches its
    ``query_name`` column against ``df['accession']``. Every missing value in
    the joined frame is then replaced with 0.

    Args:
        df: frame with an ``accession`` column.
        flag: split name used to build the CSV file name.

    Returns:
        The merged, zero-filled DataFrame.
    """
    hmm_path = f"../dataset/HMM/{flag}_features.csv"
    hmm_features = pd.read_csv(hmm_path)
    joined = df.merge(hmm_features, how="left", left_on='accession', right_on='query_name')
    # NOTE(review): the fill applies to the whole frame, so gaps that were
    # already present in `df` and unmatched `query_name` values become 0 as
    # well - confirm this is intended rather than filling only the HMM columns.
    return joined.fillna(0)

# Attach the split-specific HMM feature file to each partition.
train_df = merge_hmm(train_df, flag="train")
valid_df = merge_hmm(valid_df, flag="valid")
test_df = merge_hmm(test_df, flag="test")

In [10]:
# Render train_df as the cell output to inspect it after the HMM merge.
train_df

Unnamed: 0,accession,sequence,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative,Original,...,DC_395,DC_396,DC_397,DC_398,DC_399,query_name,E-value,score,coverage,num_domains
0,A4XK06,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_171115,False,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,...,0.000000,0.000000,0.000000,0.000000,0.000000,A4XK06,2.900000e-71,27.0,0.920690,2.0
1,Q9VR91,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_134383,True,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,...,0.001425,0.001833,0.001629,0.000204,0.000407,Q9VR91,3.000000e-03,717.7,0.476181,57.0
2,O95714,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_42431,False,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,...,0.001862,0.002069,0.000621,0.000207,0.000207,O95714,4.200000e-02,9.5,0.650393,89.0
3,O66129,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_79215,True,MIALSYKAFLNPYIIEVEKRLYECIQSDSETINKAAHHILSSGGKR...,...,0.000000,0.003086,0.000000,0.000000,0.000000,O66129,1.400000e-04,227.6,1.064615,4.0
4,Q68CP4,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,0,0.0,Transferases,2017_03,1,cdhit40.fasta_263145,True,MTGARASAAEQRRAGRSGQARAAERAAGMSGAGRALAALLLAASVL...,...,0.001511,0.000000,0.003021,0.003021,0.001511,Q68CP4,8.400000e-03,30.3,0.506787,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,Q5UPX0,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_55008,True,MDDYEKSKVLTKRYKKLKKLLKMVYGYDNFRPRQYEIINKVINGED...,...,0.000000,0.000000,0.003656,0.000000,0.003656,Q5UPX0,1.200000e-05,81.3,1.312044,9.0
45756,Q5UR49,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_514766,True,MSLVPKAGYPFIIYFLDNINKYISDKTIQTYLTAFQINVDNLKVIN...,...,0.001441,0.000000,0.001441,0.000000,0.002882,Q5UR49,6.900000e-04,92.2,2.138129,21.0
45757,P64636,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_36634,True,MHINIAWQDVDTVLLDMDGTLLDLAFDNYFWQKLVPETWGAKNGVT...,...,0.000000,0.000000,0.000000,0.004525,0.000000,P64636,8.100000e-04,77.9,2.729730,6.0
45758,P94559,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,0,0.0,Hydrolases,2017_03,1,cdhit40.fasta_533178,True,MNVLIISDSHGLEEELQTIAKRHEAEVDLMIHCGDSELETRHPALE...,...,0.000000,0.000000,0.000000,0.000000,0.005952,P94559,1.100000e-01,37.6,1.928994,3.0


### Split and add peptide features

In [11]:
# Read the peptide descriptor table and discard the 'Unnamed: 0' column in one step.
pp_df = pd.read_csv("../dataset/peptides/seq_feats.csv").drop(columns=['Unnamed: 0'])

In [12]:
# NOTE(review): eval() executes whatever text is stored in the `data` column.
# Only run this on a seq_feats.csv from a trusted source.
pp_df['data'] = pp_df['data'].apply(lambda raw_text: eval(raw_text, {"np": np}))


def _first_record(parsed):
    """Return the first element of a non-empty list, else an empty dict."""
    if isinstance(parsed, list) and len(parsed) > 0:
        return parsed[0]
    return {}


pp_df['data_dict'] = pp_df['data'].apply(_first_record)
# One column per dictionary key.
data_expanded = pp_df['data_dict'].apply(pd.Series)
pp_df = (
    pd.concat([pp_df, data_expanded], axis=1)
    .drop(columns=['data', 'data_dict'])
    .drop_duplicates(subset='accession', keep='first')
)
pp_df

Unnamed: 0,accession,AF1,AF2,AF3,AF4,AF5,BLOSUM1,BLOSUM2,BLOSUM3,BLOSUM4,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,Q7VRM4,-0.029281,-0.096226,-0.116617,0.355304,0.013183,0.046045,-0.311190,-0.251190,0.042540,...,0.136302,0.420418,-0.278006,0.169357,-0.133891,-0.090675,-0.511543,-0.505498,-0.362797,0.305659
1,A4XK06,0.031880,-0.077574,-0.360039,0.377882,-0.194147,0.144586,-0.305724,-0.199793,0.021276,...,0.198414,0.413690,-0.335655,0.229759,-0.193793,0.191931,-0.526621,-0.395103,-0.531103,0.289207
2,Q8KCJ4,0.000600,-0.117652,-0.161189,0.459098,-0.060263,0.138526,-0.330513,-0.174231,-0.037244,...,0.197628,0.442051,-0.355417,0.176282,-0.223397,0.107724,-0.648397,-0.419199,-0.496090,0.283910
3,Q9VR91,-0.069875,0.008929,-0.553377,0.458163,-0.248112,0.162258,-0.265145,-0.061735,-0.036521,...,0.087494,0.466179,-0.437199,0.225794,-0.309963,0.170827,-0.700737,-0.225900,-0.406464,0.306325
4,O95714,-0.041783,-0.000918,-0.451391,0.421223,-0.238567,0.159524,-0.267429,-0.080556,-0.024731,...,0.096173,0.443808,-0.421132,0.215689,-0.279313,0.188678,-0.649019,-0.242199,-0.456235,0.270306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110203,P40228,0.089087,0.136184,-0.281678,0.276177,-0.164250,0.240139,-0.140592,-0.078293,0.070732,...,0.076341,0.334077,-0.452683,0.289652,-0.232997,0.455889,-0.314216,-0.060453,-0.533693,0.187631
110205,Q8L9X2,0.072175,-0.008675,-0.057239,0.289849,0.056188,0.135665,-0.173498,-0.205961,-0.010049,...,0.164532,0.336847,-0.316897,0.236305,-0.160148,0.185961,-0.255172,-0.447783,-0.264335,0.182217
110206,Q54AX5,0.011130,-0.030923,-0.312671,0.409070,-0.216800,0.128902,-0.324157,-0.250333,-0.015706,...,0.170039,0.402863,-0.299765,0.151412,-0.159196,-0.161608,-0.485118,-0.383902,-0.661588,0.472235
110218,Q7YR37,0.277529,-0.177994,-0.164657,0.327573,-0.084777,0.372825,-0.186022,-0.241475,-0.139554,...,0.232639,0.335440,-0.380099,0.272131,-0.207881,0.557708,-0.319343,-0.476357,-0.481772,0.315167


In [13]:
# Restrict the peptide table to the accessions of each split, then left-join
# it on the shared `accession` key.
pp_train = pp_df.loc[pp_df['accession'].isin(train_df['accession'])]
train_df = pd.merge(train_df, pp_train, how='left', on='accession')

pp_test = pp_df.loc[pp_df['accession'].isin(test_df['accession'])]
test_df = pd.merge(test_df, pp_test, how='left', on='accession')

pp_valid = pp_df.loc[pp_df['accession'].isin(valid_df['accession'])]
valid_df = pd.merge(valid_df, pp_valid, how='left', on='accession')

### Truncate and flatten PSSM matrices (no PCA is applied in this section)

In [14]:
# Load the pickled PSSM tables, in train / valid / test order.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
train_pssm, valid_pssm, test_pssm = map(
    pd.read_pickle,
    (
        '../dataset/pssm/train_pssms.pkl',
        '../dataset/pssm/valid_pssms.pkl',
        '../dataset/pssm/test_pssms.pkl',
    ),
)

In [15]:
# Render test_pssm as the cell output to inspect the loaded table.
test_pssm

Unnamed: 0,accession_id,pssm_matrix
0,P61802,"[[-1.0, -1.0, -2.0, -3.0, -1.0, 0.0, -2.0, -3...."
1,Q96EG1,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -1.0, -2.0, -3..."
2,Q5LEQ9,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -1.0, -2.0, -3..."
3,P07684,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -1.0, -2.0, -3..."
4,Q0BZU8,"[[-2.0, -2.0, -3.0, -4.0, -2.0, -1.0, -3.0, -4..."
...,...,...
5454,P34152,"[[-1.0, -1.0, -2.0, -3.0, -1.0, 0.0, -2.0, -3...."
5455,F4JXF9,"[[-1.0, -2.0, -3.0, -3.0, -2.0, -1.0, -2.0, -3..."
5456,Q8T1C6,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -1.0, -2.0, -3..."
5457,Q86HW6,"[[-1.0, -2.0, -2.0, -3.0, -2.0, -1.0, -2.0, -3..."


In [16]:
def get_min_m(df, matrix_column='pssm_matrix'):
    """Return the smallest and largest first-dimension size among the matrices.

    Args:
        df: frame whose ``matrix_column`` cells expose a ``.shape`` attribute.
        matrix_column: name of the column holding the matrices.

    Returns:
        tuple: ``(min_rows, max_rows)`` over all cells. Despite the function
        name, both extremes are returned.
    """
    row_counts = df[matrix_column].apply(lambda cell: cell.shape[0])
    return row_counts.min(), row_counts.max()

# Row-count extremes of the PSSM matrices in each split ('pssm_matrix' is the
# default column of get_min_m).
train_min_m, train_max_m = get_min_m(train_pssm)
valid_min_m, valid_max_m = get_min_m(valid_pssm)
test_min_m, test_max_m = get_min_m(test_pssm)

for label, smallest, largest in (
    ("train_df:pssm_matrix range: ", train_min_m, train_max_m),
    ("valid_df:pssm_matrix range: ", valid_min_m, valid_max_m),
    ("test_df:pssm_matrix range: ", test_min_m, test_max_m),
):
    print(label, smallest, largest)

train_df:pssm_matrix range:  32 34350
valid_df:pssm_matrix range:  44 8515
test_df:pssm_matrix range:  53 5488


In [17]:
def truncate_and_flatten(matrix, target_rows=32, n_cols=42):
    """Force a matrix to ``target_rows`` rows and return it as a flat vector.

    Matrices with fewer rows are zero-padded at the bottom; longer ones keep
    only their first ``target_rows`` rows. An empty input yields an all-zero
    vector instead of failing.

    Args:
        matrix: array-like, expected to be 2-D.
        target_rows: number of rows kept in the output.
        n_cols: column count used for zero padding and for empty input. A
            matrix that needs padding must have exactly this many columns.
            The truncation path does not check the width.

    Returns:
        numpy.ndarray: 1-D float array in row-major order; its length is
        ``target_rows * n_cols`` whenever the input width equals ``n_cols``.

    Raises:
        ValueError: if a matrix that needs padding is not 2-D or its width
            differs from ``n_cols``.
    """
    matrix = np.asarray(matrix, dtype=float)

    if matrix.size == 0:
        # No data at all (e.g. an empty list): nothing to copy, return padding only.
        return np.zeros(target_rows * n_cols)

    n_rows = matrix.shape[0]
    if n_rows < target_rows:
        if matrix.ndim != 2 or matrix.shape[1] != n_cols:
            raise ValueError(
                f"cannot pad matrix of shape {matrix.shape} to "
                f"({target_rows}, {n_cols}): expected a 2-D array with {n_cols} columns"
            )
        fixed = np.zeros((target_rows, n_cols))
        fixed[:n_rows] = matrix
    else:
        fixed = matrix[:target_rows, :]
    return fixed.flatten()

In [18]:
# Flatten every training PSSM to a fixed-length vector (first 32 rows, 42 columns).
features = train_pssm['pssm_matrix'].apply(lambda x: truncate_and_flatten(x, target_rows=32, n_cols=42))
features_matrix = np.stack(features.values)
new_train_pssm = pd.DataFrame(features_matrix, columns=[f'pssm_{i}' for i in range(features_matrix.shape[1])])
# Attach accessions by position: inserting the Series itself would align on
# index labels and misassign or blank out accessions whenever train_pssm does
# not carry a default 0..n-1 index.
new_train_pssm.insert(0, 'accession', train_pssm['accession_id'].values)
new_train_pssm

Unnamed: 0,accession,pssm_0,pssm_1,pssm_2,pssm_3,pssm_4,pssm_5,pssm_6,pssm_7,pssm_8,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,U3NEE3,-1.0,-2.0,-2.0,-3.0,-1.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,76.0,24.0,0.61,0.01
1,Q8ZY40,-4.0,-5.0,-6.0,-7.0,-5.0,-4.0,-5.0,-6.0,-5.0,...,0.0,0.0,1.0,10.0,5.0,0.0,0.0,1.0,0.93,0.44
2,A7GDQ1,-4.0,-5.0,-5.0,-6.0,-5.0,-4.0,-5.0,-6.0,-5.0,...,0.0,4.0,3.0,12.0,5.0,0.0,1.0,2.0,0.13,0.11
3,Q68CP4,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.01
4,Q4JW55,-1.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-3.0,-2.0,...,0.0,2.0,0.0,0.0,3.0,0.0,0.0,14.0,0.85,0.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30962,Q12697,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,6.0,0.0,0.0,15.0,17.0,0.0,0.0,0.0,0.25,0.04
30963,Q09449,-1.0,-1.0,-2.0,-3.0,-1.0,0.0,-2.0,-3.0,-2.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,2.03,0.00
30964,P0C7L7,-2.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-4.0,-2.0,...,0.0,1.0,50.0,1.0,1.0,0.0,1.0,1.0,0.87,0.11
30965,Q97T80,-1.0,-2.0,-3.0,-4.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,30.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49,0.08


In [19]:
# Flatten every validation PSSM to a fixed-length vector (first 32 rows, 42 columns).
features = valid_pssm['pssm_matrix'].apply(lambda x: truncate_and_flatten(x, target_rows=32, n_cols=42))
features_matrix = np.stack(features.values)
new_valid_pssm = pd.DataFrame(features_matrix, columns=[f'pssm_{i}' for i in range(features_matrix.shape[1])])
# Attach accessions by position: inserting the Series itself would align on
# index labels and misassign or blank out accessions whenever valid_pssm does
# not carry a default 0..n-1 index.
new_valid_pssm.insert(0, 'accession', valid_pssm['accession_id'].values)
new_valid_pssm

Unnamed: 0,accession,pssm_0,pssm_1,pssm_2,pssm_3,pssm_4,pssm_5,pssm_6,pssm_7,pssm_8,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,O27550,-2.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-3.0,-2.0,...,0.0,0.0,5.0,7.0,2.0,0.0,0.0,9.0,0.28,0.10
1,Q18EV8,-3.0,-3.0,-4.0,-5.0,-3.0,-3.0,-4.0,-5.0,-4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.80,0.33
2,Q58601,-2.0,-2.0,-3.0,-4.0,-3.0,2.0,-2.0,-4.0,-2.0,...,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.29,0.28
3,Q8TYD5,-2.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-4.0,-3.0,...,0.0,0.0,4.0,10.0,2.0,0.0,0.0,0.0,0.51,0.13
4,C1A4M2,-2.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-4.0,-3.0,...,0.0,0.0,4.0,6.0,5.0,0.0,0.0,0.0,0.57,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,A2A288,-1.0,-2.0,-3.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37,0.04
4917,Q5HYM0,-1.0,-1.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.53,0.02
4918,P54954,-4.0,-5.0,-6.0,-7.0,-5.0,-4.0,-6.0,-6.0,-5.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,23.0,1.56,0.54
4919,Q4WRC2,-1.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-3.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.05,0.02


In [20]:
# Flatten every test PSSM to a fixed-length vector (first 32 rows, 42 columns).
features = test_pssm['pssm_matrix'].apply(lambda x: truncate_and_flatten(x, target_rows=32, n_cols=42))
features_matrix = np.stack(features.values)
new_test_pssm = pd.DataFrame(features_matrix, columns=[f'pssm_{i}' for i in range(features_matrix.shape[1])])
# Attach accessions by position: inserting the Series itself would align on
# index labels and misassign or blank out accessions whenever test_pssm does
# not carry a default 0..n-1 index.
new_test_pssm.insert(0, 'accession', test_pssm['accession_id'].values)
new_test_pssm

Unnamed: 0,accession,pssm_0,pssm_1,pssm_2,pssm_3,pssm_4,pssm_5,pssm_6,pssm_7,pssm_8,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,P61802,-1.0,-1.0,-2.0,-3.0,-1.0,0.0,-2.0,-3.0,-2.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,2.29,0.11
1,Q96EG1,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,3.0,28.0,9.0,0.0,0.0,0.0,0.38,0.03
2,Q5LEQ9,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,4.0,6.0,0.0,0.0,2.0,0.46,0.16
3,P07684,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.44,0.07
4,Q0BZU8,-2.0,-2.0,-3.0,-4.0,-2.0,-1.0,-3.0,-4.0,-2.0,...,2.0,0.0,16.0,3.0,17.0,0.0,0.0,8.0,0.25,0.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5454,P34152,-1.0,-1.0,-2.0,-3.0,-1.0,0.0,-2.0,-3.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40,0.00
5455,F4JXF9,-1.0,-2.0,-3.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,1.0,65.0,18.0,0.0,0.0,0.0,1.08,0.46
5456,Q8T1C6,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,0.0,71.0,8.0,0.0,0.0,0.0,0.0,1.37,0.10
5457,Q86HW6,-1.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,-2.0,...,0.0,8.0,0.0,0.0,4.0,0.0,0.0,7.0,0.41,0.09


In [21]:
# Left-join the flattened PSSM vectors onto each split by accession.
train_df = train_df.merge(new_train_pssm, how='left', on='accession')
valid_df = valid_df.merge(new_valid_pssm, how='left', on='accession')
test_df = test_df.merge(new_test_pssm, how='left', on='accession')

### Drop unnecessary columns

In [22]:
# Identifier and bookkeeping columns removed from every split before saving.
drop_columns = [
    'Original',
    'traintest',
    'negative_for',
    'mainclass_set',
    'sprot_version',
    'len',
    'cluster_ID',
    'representative',
    'sequence',
    'accession',
    'query_name',
]
train_df = train_df.drop(drop_columns, axis=1)
valid_df = valid_df.drop(drop_columns, axis=1)
test_df = test_df.drop(drop_columns, axis=1)

### Preview data

In [23]:
# Render the training frame as the cell output (state after the column drop).
train_df

Unnamed: 0,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index,ec_0,ec_1,ec_2,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586,2,5,1,...,0.0,0.0,7.0,7.0,3.0,4.0,0.0,0.0,0.55,0.27
1,529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615,2,3,2,...,1.0,2.0,3.0,4.0,3.0,1.0,2.0,35.0,0.04,0.01
2,527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119,2,3,2,...,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.19,0.02
3,37082.7180,5.953111,-0.295385,0.089231,50.063692,94.553846,0.349231,2,5,1,...,4.0,1.0,0.0,0.0,0.0,0.0,1.0,11.0,1.05,0.35
4,73292.5257,8.690096,0.295475,0.110106,38.304540,110.030166,0.535460,2,3,1,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,62736.3791,9.224539,-0.222993,0.094891,37.310420,96.733577,0.360274,3,6,4,...,0.0,1.0,65.0,6.0,1.0,0.0,1.0,0.0,1.73,0.74
45756,81911.2453,8.032326,-0.263022,0.133813,38.216604,95.251799,0.419914,3,6,4,...,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43,0.01
45757,25398.5248,5.070572,-0.309459,0.094595,31.677928,91.036036,0.396667,3,1,3,...,0.0,0.0,1.0,17.0,1.0,0.0,0.0,1.0,0.25,0.07
45758,18860.1921,5.420075,-0.202367,0.053254,45.992367,102.721893,0.323846,3,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.05,0.15


In [24]:
# Render the validation frame as the cell output (state after the column drop).
valid_df

Unnamed: 0,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index,ec_0,ec_1,ec_2,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,31190.2300,5.807604,0.129110,0.044521,34.019178,106.849315,0.276849,2,4,2,...,0.0,0.0,4.0,6.0,5.0,0.0,0.0,0.0,0.57,0.15
1,31190.2300,5.807604,0.129110,0.044521,34.019178,106.849315,0.276849,2,4,2,...,0.0,0.0,4.0,6.0,5.0,0.0,0.0,0.0,0.57,0.15
2,30874.1651,4.520314,0.083039,0.035336,35.536749,114.946996,0.350530,2,4,2,...,0.0,0.0,4.0,10.0,2.0,0.0,0.0,0.0,0.51,0.13
3,40919.2341,5.070800,-0.127717,0.073370,43.825815,101.548913,0.318641,2,6,1,...,1.0,1.0,12.0,13.0,9.0,0.0,2.0,3.0,0.15,0.07
4,38965.5379,6.088900,0.230685,0.049315,46.790137,110.438356,0.350685,2,6,1,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.86,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,67524.3177,5.310035,-0.039864,0.127768,35.901925,99.454855,0.515588,3,4,23,...,0.0,0.0,0.0,45.0,24.0,0.0,0.0,0.0,0.39,0.08
7307,27741.9071,7.076193,-0.202410,0.040161,40.854618,103.815261,0.272008,3,6,3,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,23.0,1.56,0.54
7308,22789.0384,9.775615,0.689163,0.133005,30.181330,122.413793,0.729852,3,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.09
7309,16269.9465,9.272117,0.092254,0.098592,47.411268,109.718310,0.460282,3,1,-1,...,1.0,5.0,0.0,6.0,2.0,0.0,7.0,13.0,0.05,0.05


In [25]:
# Render the test frame as the cell output (state after the column drop).
test_df

Unnamed: 0,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index,ec_0,ec_1,ec_2,...,pssm_1334,pssm_1335,pssm_1336,pssm_1337,pssm_1338,pssm_1339,pssm_1340,pssm_1341,pssm_1342,pssm_1343
0,31244.5963,6.589935,-0.292701,0.083942,35.858029,94.708029,0.351971,3,6,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.82,0.14
1,91703.7386,7.501406,-0.608922,0.095415,48.009170,68.859975,0.175068,3,1,3,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.43,0.04
2,85429.3523,8.992518,-0.547197,0.059974,42.901838,81.186441,0.096988,3,6,4,...,0.0,0.0,16.0,14.0,3.0,0.0,1.0,1.0,0.34,0.16
3,238520.6175,8.151592,-0.347029,0.058032,46.601862,87.400186,0.206356,3,6,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.01
4,32652.1900,9.253421,-0.484752,0.117021,27.952163,86.099291,0.292021,3,1,26,...,7.0,1.0,5.0,2.0,4.0,2.0,11.0,6.0,0.15,0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8176,32403.6261,6.557480,-0.588850,0.062718,37.762718,82.857143,0.186585,2,7,1,...,0.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.38,0.02
8177,50614.7558,5.565185,-0.030110,0.083516,34.673407,95.186813,0.413341,2,4,1,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.22,0.52
8178,32683.2839,6.855999,-0.028136,0.067797,30.228136,109.084746,0.374508,2,1,1,...,0.0,98.0,0.0,0.0,0.0,0.0,2.0,0.0,3.40,0.98
8179,45490.2139,7.631908,-0.233171,0.082927,24.039537,87.292683,0.338293,2,7,7,...,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.41,0.04


### Save all features

In [26]:
# Persist each split as a pickle, in train / test / valid order.
for split_frame, pickle_path in (
    (train_df, "../dataset/all_features/train.pkl"),
    (test_df, "../dataset/all_features/test.pkl"),
    (valid_df, "../dataset/all_features/valid.pkl"),
):
    split_frame.to_pickle(pickle_path)