### Numerical representation strategies demonstration

In [1]:
import pandas as pd
import sys
sys.path.insert(0, "../")
from src.numerical_representation_strategy.embedding_representations import BioEmbeddings
from src.numerical_representation_strategy.fft_encoder import FFTTransform
from src.numerical_representation_strategy.one_hot_encoding import OneHotEncoder
from src.numerical_representation_strategy.physicochemical_properties import PhysicochemicalEncoder

### Loading dataset

In [2]:
# Read the training split from the demo results folder and preview the first
# five rows.
df_data = pd.read_csv(filepath_or_buffer="../results_demo/train_df.csv")
df_data.head(n=5)

Unnamed: 0,sequence,activity
0,ERANSVTWNPHKMMGVPLQC,1
1,MNILLEYVVKSFD,1
2,LLYGDAEKPAESGGSQPPRA,1
3,NEEAGDGTTTATVLARSIAK,1
4,ASAGIAEVATSIKVGAQARVDWLPQETILFDRAALFRRL,0


### Numerical representation strategies explored in this work

#### One-hot encoding

In [3]:
# One-hot encode the "sequence" column of the training frame; max_length=150 is
# forwarded to the encoder unchanged.
# NOTE(review): the displayed result has feature columns p_0..p_2999, presumably
# 20 residues x 150 positions -- confirm against OneHotEncoder.
one_hot_settings = {
    "dataset": df_data,
    "column_sequence": "sequence",
    "max_length": 150,
}
one_hot_instance = OneHotEncoder(**one_hot_settings)

# Run the encoder, then attach the response column so features and label sit in
# the same frame before previewing it.
data_coded = one_hot_instance.run_process()
data_coded["activity"] = df_data["activity"]
data_coded.head(5)

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_2991,p_2992,p_2993,p_2994,p_2995,p_2996,p_2997,p_2998,p_2999,activity
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Physicochemical properties

In [4]:
# Load the per-residue property table and index it by residue letter while
# keeping the "residue" column itself in the frame.
dataset_encoder = pd.read_csv(
    "../input_data_for_coding/cluster_encoders.csv"
).set_index("residue", drop=False)
dataset_encoder.head(5)

Unnamed: 0_level_0,residue,Group_0,Group_1,Group_2,Group_3,Group_4,Group_5,Group_6,Group_7
residue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,A,290.40675,71.850787,6.250299,44.65141,-107.792042,15.33599,56.16028,92.925289
R,R,172.577375,-6.96389,84.091653,200.152218,51.157141,172.36012,1.448105,-37.39311
N,N,-38.377385,-90.145475,-21.731374,-191.180531,73.940581,-259.135737,-54.69043,-77.746565
D,D,159.436015,-56.585499,-28.963699,-232.261465,55.369736,-216.012067,-29.383132,-7.421269
C,C,-4.241925,15.678516,-34.886819,-156.2126,-54.192823,-242.000209,10.074813,40.041394


In [5]:
# Encode the "sequence" column using the "Group_0" property of the residue
# table loaded above; "activity" is passed through columns_to_ignore.
physicochemical_settings = dict(
    dataset=df_data,
    property_encoder="Group_0",
    dataset_encoder=dataset_encoder,
    name_column_seq="sequence",
    columns_to_ignore=["activity"],
)
physicochemical_instance = PhysicochemicalEncoder(**physicochemical_settings)

# run_process() is called for its effect; the encoded frame is then read back
# from the instance's df_data_encoded attribute.
physicochemical_instance.run_process()
physicochemical_instance.df_data_encoded.head(5)

Encoding and Processing results
Creating dataset
Export dataset


Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_141,p_142,p_143,p_144,p_145,p_146,p_147,p_148,p_149,activity
0,-0.028483,172.577375,290.40675,-38.377385,-314.201739,150.752932,-252.509397,-118.154599,-38.377385,317.102424,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,21.944601,-38.377385,-34.080828,-91.117252,-91.117252,-0.028483,-10.20771,150.752932,150.752932,195.599646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,-91.117252,-91.117252,-10.20771,-104.495222,159.436015,290.40675,-0.028483,195.599646,317.102424,290.40675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,-38.377385,-0.028483,-0.028483,290.40675,-104.495222,159.436015,-104.495222,-252.509397,-252.509397,-252.509397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,290.40675,-314.201739,290.40675,-104.495222,-34.080828,290.40675,-0.028483,150.752932,290.40675,-252.509397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


#### FFT transform

In [6]:
# Apply the FFT encoder to the physicochemical encoding produced by
# physicochemical_instance.
#
# The non-feature columns are named once and the feature count handed to
# size_data is derived from that same list, so the two can no longer drift
# apart (the previous form subtracted a hard-coded 1). Index.drop raises a
# KeyError if a listed column is absent, which surfaces a mismatch immediately
# instead of silently passing a wrong size.
fft_columns_to_ignore = ["activity"]
fft_input = physicochemical_instance.df_data_encoded
fft_feature_columns = fft_input.columns.drop(fft_columns_to_ignore)

fft_instance = FFTTransform(
    dataset=fft_input,
    size_data=len(fft_feature_columns),
    columns_to_ignore=fft_columns_to_ignore
)

# Run the transform and preview the first five rows of the result.
response_fft = fft_instance.encoding_dataset()
response_fft.head(5)

Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset


Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_119,p_120,p_121,p_122,p_123,p_124,p_125,p_126,p_127,activity
0,248.24569,248.330815,247.701935,244.035037,234.345403,215.853071,186.837086,148.035163,107.563734,97.380905,...,962.828612,1011.614852,1060.533851,1107.821076,1151.652339,1190.272913,1222.105479,1245.836423,1260.481472,1
1,187.384882,191.647319,203.722661,221.859426,244.007096,268.322693,293.331717,317.90508,341.187478,362.532443,...,489.317788,479.154683,468.660224,458.288879,448.500007,439.741154,432.42759,426.918881,423.49464,1
2,947.195506,937.928697,909.652018,861.053842,790.315554,695.600511,575.68924,431.010166,267.22043,130.738608,...,151.019681,124.616558,102.651858,84.973127,71.608625,62.590595,57.550814,55.462893,54.951729,1
3,143.130764,162.809623,211.824468,276.380219,349.453017,427.760267,509.182367,591.762517,673.379688,751.686402,...,1069.417961,968.613928,863.379343,755.922731,648.574341,543.997328,445.777109,359.804486,296.766057,1
4,2009.00598,1911.582043,1634.018106,1218.817304,734.365023,311.882595,387.248853,681.720847,866.942599,916.438798,...,1170.291493,1351.387894,1605.208991,1865.353585,2045.76533,2079.94419,1941.345389,1658.260844,1336.388964,0


#### Embeddings through the bio-embeddings tool (ProtTrans T5 UniRef)

In [8]:
# Embed the training sequences through the BioEmbeddings wrapper and export the
# result under "../results_demo/" with the name "df_training".
# NOTE(review): device is fixed to "cuda" -- presumably this needs a
# CUDA-capable GPU; confirm whether BioEmbeddings supports a CPU fallback.
train_embedding_settings = dict(
    dataset=df_data,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="activity",
    path_export="../results_demo/",
)
bioembedding_instance = BioEmbeddings(**train_embedding_settings)

bioembedding_instance.apply_prottrans_t5_uniref(name_export="df_training")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Load the independent split and embed it with the same settings used for the
# training data. This rebinds bioembedding_instance to the new dataset.
# NOTE(review): path_export and name_export match the folder and stem of the
# CSV read here -- confirm the export cannot overwrite independent_df.csv.
# NOTE(review): device is fixed to "cuda" -- presumably this needs a
# CUDA-capable GPU; confirm whether BioEmbeddings supports a CPU fallback.
df_testing = pd.read_csv(filepath_or_buffer="../results_demo/independent_df.csv")

independent_embedding_settings = dict(
    dataset=df_testing,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="activity",
    path_export="../results_demo/",
)
bioembedding_instance = BioEmbeddings(**independent_embedding_settings)

bioembedding_instance.apply_prottrans_t5_uniref(name_export="independent_df")