In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import os

# --- Configuration ---
# Hugging Face Hub id of the checkpoint to load (DNABERT-2, a foundation
# model for genome analysis).
MODEL_NAME = "zhihan1996/DNABERT-2-117M"
# Input CSV, resolved relative to the notebook's working directory; main()
# reads its 'sequence' column.
DATA_PATH = "posneg_sequences.csv"
# Destination CSV for the input rows plus their embeddings.
OUTPUT_PATH = "./results/embeddings.csv"

def load_model():
    """Fetch the tokenizer and model for the checkpoint named by MODEL_NAME.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by
        ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
    """
    print(f"Loading model: {MODEL_NAME}")
    # trust_remote_code=True allows transformers to execute the Python files
    # shipped with the checkpoint repository; both loaders receive the same flag.
    hub_options = {"trust_remote_code": True}
    dna_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_options)
    dna_model = AutoModel.from_pretrained(MODEL_NAME, **hub_options)
    return dna_tokenizer, dna_model

def get_embedding(sequence, tokenizer, model):
    """Embed one sequence by averaging the model's token-level hidden states.

    Args:
        sequence: Input passed straight to ``tokenizer`` (truncated to at
            most 512 tokens).
        tokenizer: Callable returning a mapping of model keyword arguments
            when asked for PyTorch tensors.
        model: Callable whose output exposes the token-level hidden states
            at index 0.

    Returns:
        numpy.ndarray: Flattened mean of the hidden states over dim 1
        (described as 768-dim for the configured model by the original
        docstring).
    """
    encoded = tokenizer(
        sequence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        token_states = model(**encoded)[0]
        # Mean pooling across the token axis (dim 1).
        pooled = token_states.mean(dim=1)
        vector = pooled.cpu().numpy().flatten()
    return vector

def main():
    """Embed every sequence in DATA_PATH and write the result to OUTPUT_PATH.

    Steps: load the tokenizer/model, read the input CSV, compute one
    embedding per row of the 'sequence' column, attach the embeddings as an
    'embedding' column, and save the frame as CSV.

    Raises:
        FileNotFoundError: If DATA_PATH does not exist (from ``pd.read_csv``).
        KeyError: If the input CSV has no 'sequence' column.
    """
    tokenizer, model = load_model()

    # Input dataset; per the original notes it holds POS_Bgy1, POS_Bgy2 and
    # 10 hard negatives.
    df = pd.read_csv(DATA_PATH)
    if 'sequence' not in df.columns:
        raise KeyError(
            f"'sequence' column not found in {DATA_PATH}; "
            f"available columns: {list(df.columns)}"
        )

    print("Generating embeddings for intra-genomic sequences...")
    # Convert each vector to a plain Python list so the CSV cell is written
    # as "[v0, v1, ...]" (parseable with json.loads / ast.literal_eval)
    # instead of numpy's whitespace-separated, line-wrapped repr.
    embeddings = [
        get_embedding(seq, tokenizer, model).tolist()
        for seq in df['sequence']
    ]

    # One list per row; dtype=object keeps each list as a single cell value.
    df['embedding'] = pd.Series(embeddings, index=df.index, dtype=object)

    # Create the directory that OUTPUT_PATH actually points into, so the two
    # cannot drift apart if OUTPUT_PATH is changed.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(OUTPUT_PATH, index=False)
    print(f"Success! Embeddings saved to {OUTPUT_PATH}")

if __name__ == "__main__":
    main()

Loading model: zhihan1996/DNABERT-2-117M


A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNABERT-2-117M:
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: './data/kb290_sequences.csv'

In [None]:
pip install triton

Collecting triton
  Downloading triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (188.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.3/188.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.6.0
