In [1]:
!pip install ankh

Collecting ankh
  Downloading ankh-1.10.0-py3-none-any.whl (31 kB)
Collecting biopython<2.0,>=1.80 (from ankh)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets<3.0.0,>=2.7.1 (from ankh)
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece<0.2.0,>=0.1.97 (from ankh)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.25.1 (from ankh)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import random

seed = 7
torch.manual_seed(seed)
random.seed(seed)

import ankh

In [3]:
def get_num_params(model):
    return sum(p.numel() for p in model.parameters())

In [4]:
def get_n_mask_tokens(n):
    return [f"<extra_id_{i}>" for i in range(n)]

def append_n_mask_tokens(input_, n):
    return input_ + "".join(get_n_mask_tokens(n))

### Select the available device.

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Available device:', device)

Available device: cuda:0


### Load Ankh large model.

In [6]:
model, tokenizer = ankh.load_large_model(generation=True)
model.eval()
model.to(device=device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/7.52G [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(144, 1536)
  (encoder): T5Stack(
    (embed_tokens): Embedding(144, 1536)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1536, out_features=1024, bias=False)
              (k): Linear(in_features=1536, out_features=1024, bias=False)
              (v): Linear(in_features=1536, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1536, bias=False)
              (relative_attention_bias): Embedding(64, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1536, out_features=3840, bias=False)
              (wi_1): Linear(in_features=1536, out_features=3840, bias=False)
           

In [7]:
print(f"Number of parameters:", get_num_params(model))

Number of parameters: 1878705152


### Test Autoregressive generation on a sequence.

In [8]:
test_seq = "QVQLVESGGGLVQPGGSL"
num_new_tokens = 5
masked_seq = append_n_mask_tokens(test_seq, n=num_new_tokens)
maximum_length = num_new_tokens * 2  + 1
num_beams = 5
temperature = 1.0

In [9]:
encoded = tokenizer.encode_plus(masked_seq, add_special_tokens=True, return_tensors='pt')
input_ids = encoded['input_ids'].to(device)

In [10]:
input_ids

tensor([[ 16,   6,  16,   4,   6,   9,   7,   5,   5,   5,   4,   6,  16,  13,
           5,   5,   7,   4, 143, 142, 141, 140, 139,   1]], device='cuda:0')

In [11]:
generation = model.generate(input_ids=input_ids, temperature = temperature,
                                max_length = maximum_length,
                                num_beams = num_beams,
                                do_sample=True if temperature > 0 else False)

output_ids = generation[0].squeeze()

In [12]:
generated_tokens = list(tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

In [13]:
test_output = f"{test_seq}" + "".join(generated_tokens)
print(test_output)

QVQLVESGGGLVQPGGSLVQPGG


In [14]:
len(generated_tokens)

5

In [1]:
!pip install seqeval==0.0.5



In [2]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import numpy as np
import random

seed = 7

torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

import ankh

from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import Trainer, TrainingArguments, EvalPrediction
from datasets import load_dataset

from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy import stats
from functools import partial
import pandas as pd
from tqdm.auto import tqdm

In [3]:
def get_num_params(model):
    return sum(p.numel() for p in model.parameters())

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Available device:', device)

Available device: cuda:0


In [5]:
model, tokenizer = ankh.load_large_model()
model.eval()
model.to(device=device)

T5EncoderModel(
  (shared): Embedding(144, 1536)
  (encoder): T5Stack(
    (embed_tokens): Embedding(144, 1536)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1536, out_features=1024, bias=False)
              (k): Linear(in_features=1536, out_features=1024, bias=False)
              (v): Linear(in_features=1536, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1536, bias=False)
              (relative_attention_bias): Embedding(64, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1536, out_features=3840, bias=False)
              (wi_1): Linear(in_features=1536, out_features=3840, bias=False)
              (wo): Lin

In [6]:
print(f"Number of parameters:", get_num_params(model))

Number of parameters: 1151707648


In [7]:
training_dataset = load_dataset("proteinea/SSP", data_files={'train': ['training_hhblits.csv']})
casp12_dataset = load_dataset("proteinea/SSP", data_files={'test': ['CASP12.csv']})
casp14_dataset = load_dataset("proteinea/SSP", data_files={'test': ['CASP14.csv']})
ts115_dataset = load_dataset("proteinea/SSP", data_files={'test': ['TS115.csv']})
cb513_dataset = load_dataset("proteinea/SSP", data_files={'test': ['CB513.csv']})

In [8]:
input_column_name = 'input'
labels_column_name = 'dssp3' # You can change it to "dssp8" if you want to work with 8 states.
disorder_column_name = 'disorder'
training_sequences, training_labels, training_disorder = (
    training_dataset['train'][input_column_name],
    training_dataset['train'][labels_column_name],
    training_dataset['train'][disorder_column_name]
)


casp12_sequences, casp12_labels, casp12_disorder = (
    casp12_dataset['test'][input_column_name],
    casp12_dataset['test'][labels_column_name],
    casp12_dataset['test'][disorder_column_name]
)

casp14_sequences, casp14_labels, casp14_disorder = (
    casp14_dataset['test'][input_column_name],
    casp14_dataset['test'][labels_column_name],
    casp14_dataset['test'][disorder_column_name]
)

ts115_sequences, ts115_labels, ts115_disorder = (
    ts115_dataset['test'][input_column_name],
    ts115_dataset['test'][labels_column_name],
    ts115_dataset['test'][disorder_column_name]
)

cb513_sequences, cb513_labels, cb513_disorder = (
    cb513_dataset['test'][input_column_name],
    cb513_dataset['test'][labels_column_name],
    cb513_dataset['test'][disorder_column_name]
)

In [9]:
def preprocess_dataset(sequences, labels, disorder, max_length=None):

    sequences = ["".join(seq.split()) for seq in sequences]

    if max_length is None:
        max_length = len(max(sequences, key=lambda x: len(x)))

    seqs = [list(seq)[:max_length] for seq in sequences]

    labels = ["".join(label.split()) for label in labels]
    labels = [list(label)[:max_length] for label in labels]

    disorder = [" ".join(disorder.split()) for disorder in disorder]
    disorder = [disorder.split()[:max_length] for disorder in disorder]

    assert len(seqs) == len(labels) == len(disorder)
    return seqs, labels, disorder

In [10]:
def embed_dataset(model, sequences, shift_left = 0, shift_right = -1):
    inputs_embedding = []
    with torch.no_grad():
        for sample in tqdm(sequences):
            ids = tokenizer.batch_encode_plus([sample], add_special_tokens=True,
                                              padding=True, is_split_into_words=True,
                                              return_tensors="pt")
            embedding = model(input_ids=ids['input_ids'].to(device))[0]
            embedding = embedding[0].detach().cpu().numpy()[shift_left:shift_right]
            inputs_embedding.append(embedding)
    return inputs_embedding

In [11]:
training_sequences, training_labels, training_disorder = preprocess_dataset(training_sequences,
                                                                            training_labels,
                                                                            training_disorder)
casp12_sequences, casp12_labels, casp12_disorder = preprocess_dataset(casp12_sequences,
                                                                      casp12_labels,
                                                                      casp12_disorder)

casp14_sequences, casp14_labels, casp14_disorder = preprocess_dataset(casp14_sequences,
                                                                      casp14_labels,
                                                                      casp14_disorder)
ts115_sequences, ts115_labels, ts115_disorder = preprocess_dataset(ts115_sequences,
                                                                   ts115_labels,
                                                                   ts115_disorder)
cb513_sequences, cb513_labels, cb513_disorder = preprocess_dataset(cb513_sequences,
                                                                   cb513_labels,
                                                                   cb513_disorder)

In [12]:
training_embeddings = embed_dataset(model, training_sequences[:10])
casp12_embeddings = embed_dataset(model, casp12_sequences[:10])
casp14_embeddings = embed_dataset(model, casp14_sequences[:10])
ts115_embeddings = embed_dataset(model, ts115_sequences[:10])
cb513_embeddings = embed_dataset(model, cb513_sequences[:10])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
# Consider each label as a tag for each token
unique_tags = set(tag for doc in training_labels for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [14]:
def encode_tags(labels):
    labels = [[tag2id[tag] for tag in doc] for doc in labels]
    return labels

In [15]:
train_labels_encodings = encode_tags(training_labels)
casp12_labels_encodings = encode_tags(casp12_labels)
casp14_labels_encodings = encode_tags(casp14_labels)
ts115_labels_encodings = encode_tags(ts115_labels)
cb513_labels_encodings = encode_tags(cb513_labels)

In [16]:
def mask_disorder(labels, masks):
    for label, mask in zip(labels,masks):
        for i, disorder in enumerate(mask):
            if disorder == "0.0":
                label[i] = -100
    return labels

In [17]:
train_labels_encodings = mask_disorder(train_labels_encodings, training_disorder)
casp12_labels_encodings = mask_disorder(casp12_labels_encodings, casp12_disorder)
casp14_labels_encodings = mask_disorder(casp14_labels_encodings, casp14_disorder)
ts115_labels_encodings = mask_disorder(ts115_labels_encodings, ts115_disorder)
cb513_labels_encodings = mask_disorder(cb513_labels_encodings, cb513_disorder)

In [18]:
class SSPDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        embedding = self.encodings[idx]
        labels = self.labels[idx]
        return {'embed': torch.tensor(embedding), 'labels': torch.tensor(labels)}

    def __len__(self):
        return len(self.labels)

In [19]:
training_dataset = SSPDataset(training_embeddings, train_labels_encodings[:10])
casp12_dataset = SSPDataset(casp12_embeddings, casp12_labels_encodings[:10])
casp14_dataset = SSPDataset(casp14_embeddings, casp14_labels_encodings[:10])
ts115_dataset = SSPDataset(ts115_embeddings, ts115_labels_encodings[:10])
cb513_dataset = SSPDataset(cb513_embeddings, cb513_labels_encodings[:10])

In [20]:
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray):
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != torch.nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(id2tag[label_ids[i][j]])
                    preds_list[i].append(id2tag[preds[i][j]])

        return preds_list, out_label_list

def compute_metrics(p: EvalPrediction):
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "accuracy": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

In [21]:
def model_init(num_tokens, embed_dim):
    hidden_dim = int(embed_dim / 2)
    num_hidden_layers = 1 # Number of hidden layers in ConvBert.
    nlayers = 1 # Number of ConvBert layers.
    nhead = 4
    dropout = 0.2
    conv_kernel_size = 7
    downstream_model = ankh.ConvBertForMultiClassClassification(num_tokens=num_tokens,
                                                                input_dim=embed_dim,
                                                                nhead=nhead,
                                                                hidden_dim=hidden_dim,
                                                                num_hidden_layers=num_hidden_layers,
                                                                num_layers=nlayers,
                                                                kernel_size=conv_kernel_size,
                                                                dropout=dropout)
    return downstream_model.cuda()

In [38]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [22]:
import accelerate

In [23]:
model_type = 'ankh_large'
experiment = f'ssp3_{model_type}'

training_args = TrainingArguments(
    output_dir=f'./results_{experiment}',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=1000,
    learning_rate=1e-03,
    weight_decay=0.0,
    logging_dir=f'./logs_{experiment}',
    logging_steps=200,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=16,
    fp16=False,
    fp16_opt_level="02",
    run_name=experiment,
    seed=seed,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    save_strategy="epoch"
)

In [24]:
model_embed_dim = 1536 # Embedding dimension for ankh large.

trainer = Trainer(
    model_init=partial(model_init, num_tokens=len(unique_tags), embed_dim=model_embed_dim),
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=casp12_dataset,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.022782,0.355456,0.120285,0.165498,0.139315
2,No log,1.974012,0.344769,0.117647,0.168303,0.138488
3,No log,1.938078,0.33802,0.113101,0.168303,0.135287
3,No log,1.910374,0.327334,0.110407,0.171108,0.134213


TrainOutput(global_step=5, training_loss=0.7859787940979004, metrics={'train_runtime': 7.2658, 'train_samples_per_second': 6.882, 'train_steps_per_second': 0.688, 'total_flos': 0.0, 'train_loss': 0.7859787940979004, 'epoch': 3.2})

In [26]:
predictions, labels, metrics_output = trainer.predict(test_dataset)

NameError: ignored

In [None]:
metrics_output