In [9]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModel
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier


In [None]:
model_id = "zhihan1996/DNA_bert_6"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_bert.py:   0%|          | 0.00/807 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/zhihan1996/DNA_bert_6:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/359M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4101, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [None]:
df = pd.read_excel("/content/Output.xlsx")
df = df[['Sequence', 'Label']].dropna()
df['Label'] = df['Label'].astype(int)

In [None]:
# convert dna to kmers
def seq2kmer(seq, k=6):
    return " ".join([seq[i:i+k] for i in range(len(seq) - k + 1)])

df['kmers'] = df['Sequence'].str.upper().apply(lambda x: seq2kmer(x, k=6))

In [None]:
# generate embeddings
def get_embedding(seq):
    tokens = tokenizer(seq, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

print("Generating embeddings...")
embeddings = []
for seq in tqdm(df['kmers']):
    embeddings.append(get_embedding(seq))

X = np.array(embeddings)
y = df['Label'].values

Generating embeddings...


  return forward_call(*args, **kwargs)
100%|██████████| 221/221 [00:37<00:00,  5.82it/s]


In [10]:
# 5. Cross-validation with Random Forest
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
roc_aucs = []

print("\nPerforming 5-fold CV with Random Forest...")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]


    clf = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    y_prob = clf.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    roc = roc_auc_score(y_val, y_prob)

    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, ROC-AUC: {roc:.4f}")

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    roc_aucs.append(roc)



Performing 5-fold CV with Random Forest...

--- Fold 1 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6444, Precision: 0.6579, Recall: 0.8929, ROC-AUC: 0.4548

--- Fold 2 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7045, Precision: 0.7143, Recall: 0.8929, ROC-AUC: 0.5848

--- Fold 3 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6136, Precision: 0.6774, Recall: 0.7500, ROC-AUC: 0.5312

--- Fold 4 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6818, Precision: 0.6857, Recall: 0.8889, ROC-AUC: 0.6002

--- Fold 5 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6818, Precision: 0.6757, Recall: 0.9259, ROC-AUC: 0.5839


In [11]:
print("\n=== Cross-Validation Results ===")
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_aucs):.4f}")

print(f"\nStandard Deviation (Accuracy): {np.std(accuracies):.4f}")
print(f"Standard Deviation (ROC-AUC): {np.std(roc_aucs):.4f}")


=== Cross-Validation Results ===
Average Accuracy: 0.6653
Average Precision: 0.6822
Average Recall: 0.8701
Average ROC-AUC: 0.5510

Standard Deviation (Accuracy): 0.0322
Standard Deviation (ROC-AUC): 0.0535
