In [1]:
from pathlib import Path

import numpy as np 
import pandas as pd 
import torch 
from torch.utils.data import TensorDataset

from tokenizers import (
    Tokenizer, 
    models,
    normalizers,
    pre_tokenizers,
    trainers
)

In [2]:
# Load the raw table of protein chains from disk.
csv_path = "./data/data.csv"
data = pd.read_csv(csv_path)

In [3]:
# Display the column labels of the loaded frame.
data.columns

Index(['chain_id', 'first_res', 'input', 'dssp8'], dtype='object')

In [4]:
# Keep only the two columns used below: the sequence text ('input') and its
# per-position label string ('dssp8').
selected_columns = ['input', 'dssp8']
data = data[selected_columns]
data

Unnamed: 0,input,dssp8
0,VMKANVTKKTLNEGLGLLERVIPSRSSNPLLTALKVETSEGGLTLS...,CEEEEEEHHHHHHHHHHHHHHSCSCCSSTTTTEEEEEECSSEEEEE...
1,PVRVGLSVDASALGHTIPPDYTGLSYEQAQMANPNYFSGANTQLAG...,CEEEEEEEEEEEEEEEEPTTCCEEEEEGGGGGCTTTSSTTCHHHHH...
2,AYVLDTNVAIHLRDGDPEVTTRVTALNGAILLSIISRVELEGGVYR...,CEEECHHHHHHHHTTCHHHHHHHHHCCSCEEEEHHHHHHHHHHHTS...
3,MEFSVKSGSPEKQRSACIVVGVFEPRRLSPIAEQLDKISDGYISAL...,CEEEEECCCGGGCCCSCEEEEEETTTEECHHHHHHHHHHTSHHHHH...
4,SKGFFVDTTRCTACRGCQVACKQWHGNPATPTENTGFHQNPPDFNF...,CEEEEEEGGGCCCCCHHHHHHHHHHTPPPCCCCCSSCSCCSSSCBT...
...,...,...
18726,LNQDATILRQAKLGLSDPAQSLSSWSDNNDVTPaKWLGVSaDATSN...,CHHHHHHHHHHHHTSBCTTCTTTTTTSCTTCCGGGSTTEEECTTSC...
18727,HMPVHIVDPREHVFVHAITSECVMLACEVDREDAPVRWYKDGQEVE...,CCCCCCCBCCSCEEEECCTTCCEEEEEECSCTTSCCEEEETTSCCC...
18728,MTTAERWQKIQAQAPDVIFDLAKRAAAAKGPKANLVIGAYRDEQGR...,CCHHHHHHTPPPPPPPHHHHHHHHHHHCCSSCEECCSCCCBCTTSC...
18729,GSHMLEVLTQKHKPAESQQQAAETEGSaNKKDQNEbKSPaKWHNDA...,CCCSCCCSCCSSCSCCSSCCCCCCHHHHHHSCTTTCCTTSCEETTS...


In [5]:
# Word-level tokenizer for the 'input' sequences; pieces that are not in the
# trained vocabulary are mapped to '[UNK]'.
input_tokenizer = Tokenizer(models.WordLevel(unk_token='[UNK]'))

# Splitting on the empty pattern is meant to isolate single characters so each
# residue becomes one token — TODO confirm against the installed tokenizers version.
input_tokenizer.pre_tokenizer = pre_tokenizers.Split(pattern='', behavior='isolated')
# NOTE(review): the data preview shows lower-case letters inside some input
# sequences; lowercasing folds them together with their upper-case
# counterparts. Confirm that this merge is intended.
input_tokenizer.normalizer = normalizers.Lowercase()

# '[PAD]' is registered as a real special token. Previously padding relied on
# the library defaults (pad_id=0, pad_token='[PAD]') while '[UNK]' was the only
# special token, so padded positions shared id 0 with unknown tokens.
trainer = trainers.WordLevelTrainer(special_tokens=['[PAD]', '[UNK]'])

sequences = list(data['input'])

input_tokenizer.train_from_iterator(sequences, trainer)
# Cap every encoded sequence at 1000 tokens.
input_tokenizer.enable_truncation(max_length=1000)
# Look the pad id up from the trained vocabulary instead of assuming it.
input_tokenizer.enable_padding(
    pad_id=input_tokenizer.token_to_id('[PAD]'),
    pad_token='[PAD]',
)

In [6]:
# Word-level tokenizer for the 'dssp8' label strings; pieces that are not in
# the trained vocabulary are mapped to '[UNK]'.
output_tokenizer = Tokenizer(models.WordLevel(unk_token='[UNK]'))
# Splitting on the empty pattern is meant to isolate single characters so each
# label becomes one token — TODO confirm against the installed tokenizers version.
output_tokenizer.pre_tokenizer = pre_tokenizers.Split(pattern='', behavior='isolated')
output_tokenizer.normalizer = normalizers.Lowercase()

# Dedicated trainer so this cell does not depend on a variable created while
# building the input tokenizer. '[PAD]' is registered explicitly: with only
# '[UNK]' as a special token, the default padding id (0) coincided with
# '[UNK]', making padded label positions indistinguishable from unknown ones.
output_trainer = trainers.WordLevelTrainer(special_tokens=['[PAD]', '[UNK]'])

output_sequences = list(data['dssp8'])

output_tokenizer.train_from_iterator(output_sequences, output_trainer)
# Cap every encoded label sequence at 1000 tokens.
output_tokenizer.enable_truncation(max_length=1000)
# Look the pad id up from the trained vocabulary instead of assuming it.
output_tokenizer.enable_padding(
    pad_id=output_tokenizer.token_to_id('[PAD]'),
    pad_token='[PAD]',
)

In [7]:
# Create the target directory first so saving also works on a fresh checkout
# where ./trained_tokenizers does not exist yet.
Path("./trained_tokenizers").mkdir(parents=True, exist_ok=True)

# Persist both trained tokenizers as JSON files.
input_tokenizer.save("./trained_tokenizers/input_tokenizer.json")
# NOTE(review): this file name uses a hyphen while the one above uses an
# underscore; left unchanged because other code may load it by this exact name.
output_tokenizer.save("./trained_tokenizers/output-tokenizer.json")

In [8]:
# Encode every row to a list of token ids. Plain Python lists are handed to
# encode_batch (as in the training cells) instead of pandas Series, so the call
# does not rely on how the bindings coerce a Series.
inputs = [
    encoding.ids
    for encoding in input_tokenizer.encode_batch(data["input"].tolist())
]

targets = [
    encoding.ids
    for encoding in output_tokenizer.encode_batch(data['dssp8'].tolist())
]

In [9]:
# Convert both lists of id sequences into NumPy arrays.
inputs, targets = map(np.array, (inputs, targets))

In [10]:
# Wrap the encoded inputs and targets as tensors in a single dataset object.
input_tensor = torch.tensor(inputs)
target_tensor = torch.tensor(targets)
processed_data = TensorDataset(input_tensor, target_tensor)

In [11]:
# Serialize the dataset to disk. The object written is the TensorDataset
# itself, not just its tensors.
torch.save(obj=processed_data, f="./data/processed_data.pt")