In [8]:
import pandas as pd
import numpy as np

from tqdm import tqdm 
from biopandas.pdb import PandasPdb
from transformers import TFAutoModel, AutoTokenizer, EsmTokenizer, TFEsmModel

from src.data_fix import *

In [2]:
# Read the test split into a DataFrame; the path is relative to the notebook's working directory.
test_csv_path = 'data/test.csv'
test_df = pd.read_csv(test_csv_path)

In [3]:
# Build the training frame from the original CSV plus the published updates file.
# NOTE(review): load_fixed_train_df is not defined in this notebook; presumably it
# arrives through the `from src.data_fix import *` star import — confirm.
train_csv_path = 'data/train.csv'
train_updates_csv_path = 'data/train_updates_20220929.csv'
fixed_train_df = load_fixed_train_df(
    original_train_file_path=train_csv_path,
    update_file_path=train_updates_csv_path,
)

### EDA and Protein Sequence Tokenization

In [4]:
# Add a `sequence_len` column holding the character count of each protein sequence.
# The vectorized `.str.len()` accessor replaces the per-row `apply(lambda x: len(x))`;
# for string values the result is identical, and a missing value yields NaN instead
# of raising TypeError.
fixed_train_df['sequence_len'] = fixed_train_df['protein_sequence'].str.len()
test_df['sequence_len'] = test_df['protein_sequence'].str.len()

In [5]:
# Summary statistics of the training-set sequence lengths (shown as the cell output).
train_len_summary = fixed_train_df['sequence_len'].describe()
train_len_summary

count    28981.000000
mean       450.468617
std        415.159049
min          5.000000
25%        212.000000
50%        351.000000
75%        537.000000
max       8798.000000
Name: sequence_len, dtype: float64

In [6]:
# Summary statistics of the test-set sequence lengths (shown as the cell output).
test_len_summary = test_df['sequence_len'].describe()
test_len_summary

count    2413.000000
mean      220.968090
std         0.175798
min       220.000000
25%       221.000000
50%       221.000000
75%       221.000000
max       221.000000
Name: sequence_len, dtype: float64

In [9]:
# Restore the tokenizer and the TensorFlow ESM encoder from one shared checkpoint identifier.
# NOTE(review): 'esm' looks like a local directory relative to the notebook — confirm.
esm_checkpoint = 'esm'
tokenizer = EsmTokenizer.from_pretrained(esm_checkpoint)
model = TFEsmModel.from_pretrained(esm_checkpoint)

Some layers from the model checkpoint at esm were not used when initializing TFEsmModel: ['lm_head']
- This IS expected if you are initializing TFEsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFEsmModel were not initialized from the model checkpoint at esm and are newly initialized: ['esm/pooler/dense/bias:0', 'esm/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Tokenize every training sequence in one batch.
# The tokenizer is called directly: `batch_encode_plus` is documented by transformers
# as deprecated in favour of `__call__`, which accepts the same keyword arguments and
# handles a list of strings as a batch.
#   - truncation=True with max_length=3000 cuts any encoding longer than 3000 tokens
#     (the longest training sequence has 8798 characters, so some rows are truncated).
#   - padding=True pads every row to the longest encoding in the batch.
#   - return_tensors='tf' returns TensorFlow tensors.
tokenized = tokenizer(fixed_train_df['protein_sequence'].to_list(),
                      add_special_tokens=True,
                      max_length=3000,
                      truncation=True,
                      padding=True,
                      return_tensors='tf'
                      )

In [29]:
# Inspect the attention mask produced by the tokenizer (shown as the cell output).
# Per the Hugging Face convention, 1 marks a real token and 0 marks padding.
attention_mask = tokenized['attention_mask']
attention_mask

<tf.Tensor: shape=(28981, 3000), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>