In [1]:
from gpn.data import load_fasta
import gpn.model
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import torch
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer

In [2]:
# Model identifier passed to the `from_pretrained` loaders for both the
# tokenizer and the masked-LM model in the cells below.
model_path = "songlab/gpn-brassicales"

In [3]:
# Load the tokenizer associated with `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Show the token -> id mapping as the cell output.
tokenizer.get_vocab()

{'t': 6, '[PAD]': 0, '[UNK]': 2, 'a': 3, '[MASK]': 1, 'g': 5, 'c': 4}

In [5]:
# Load the masked-language-model variant for `model_path`.
model_for_mlm = AutoModelForMaskedLM.from_pretrained(model_path)
# Switch to evaluation mode. `eval()` returns the module itself, so as the
# last expression it also renders the architecture as the cell output.
model_for_mlm.eval()

ConvNetForMaskedLM(
  (model): ConvNetModel(
    (embedding): GPNEmbedding()
    (encoder): Sequential(
      (0): ConvLayer(
        (conv): Sequential(
          (0): TransposeLayer()
          (1): Conv1d(512, 512, kernel_size=(9,), stride=(1,), padding=same)
          (2): TransposeLayer()
          (3): GELU(approximate='none')
          (4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=512, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
      )
      (1): ConvLayer(
        (conv): Sequential(
          (0): TransposeLayer()
          (1): Conv1d(512, 512, kernel_size=(9,), stride=(1,), padding=same, dilation=(2,))
          (2): TransposeLayer()
          (3): GELU(approximate='none')
          (4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (ffn): Sequential(
          (0

In [6]:
# Read the genome FASTA from the local data directory. `load_fasta` comes from
# the notebook's import cell rather than being imported mid-notebook, so all
# dependencies are visible in one place.
genome = load_fasta("./data/GCF_000001735.4.fna.gz")

# Pick out the record keyed by accession NC_003076.8 (chromosome 5, going by
# the variable name).
chromosome_5 = genome["NC_003076.8"]

# Preview the first 100 bases as the cell output.
chromosome_5[0:100]

'TATACCATGTACCCTCAAccttaaaaccctaaaacctatactataaatctttaaaacctaTACTCTAAACCATAGGGTTTGTGAGTTTGCATAAAGTGTC'

In [12]:
# Window definition: the position to centre on and the nominal context size.
center_position = 3000
max_context_length = 5000

# Take max_context_length / 2 bases on either side of the centre. The extra -1
# on the left edge makes the slice max_context_length + 1 long (5001 here).
# NOTE(review): this looks like a 1-based -> 0-based shift of center_position
# that keeps the window symmetric — confirm the extra base is intended.
_half_context = max_context_length / 2
start, end = int(center_position - _half_context - 1), int(center_position + _half_context)

print(f"start: {start}, end: {end}")

# Cut the window out of the chromosome and display its length.
sequence = chromosome_5[start:end]

len(sequence)

start: 499, end: 5500


5001

In [13]:
# Tokenize the window and keep only the token ids, returned as a PyTorch tensor
# (attention mask and token-type ids are switched off).
# NOTE(review): the sequence preview contains both upper- and lower-case bases
# while the vocabulary lists lower-case a/c/g/t only — confirm the tokenizer
# lower-cases its input.
input_ids = tokenizer(
    sequence,
    return_tensors="pt",
    return_attention_mask=False,
    return_token_type_ids=False,
)["input_ids"]

In [19]:
# Print the offsets 0, step, 2*step, ... — `steps` values in total.
step, steps = 50, 10
for i in range(0, step * steps, step):
    print(i)

0
50
100
150
200
250
300
350
400
450


In [None]:
# Run the masked-LM over the tokenized window with gradient tracking disabled;
# only the forward pass needs to sit inside the context manager.
with torch.no_grad():
    _mlm_output = model_for_mlm(input_ids=input_ids)
all_logits = _mlm_output.logits
# Display the logits' shape as the cell output.
all_logits.shape

In [None]:
# Restrict the logits to the four nucleotide tokens and renormalise.
# The vocabulary is fetched once instead of once per nucleotide.
_vocab = tokenizer.get_vocab()
acgt_idxs = [_vocab[nuc] for nuc in ["a", "c", "g", "t"]]
# Index the last axis with the a/c/g/t ids, leaving the first two axes whole.
nucleotide_logits = all_logits[:, :, acgt_idxs]
# Softmax over the remaining four entries, so each position's a/c/g/t
# probabilities sum to 1.
output_probs = torch.nn.functional.softmax(nucleotide_logits, dim=-1)