In [12]:
%pip install seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# ================================================================
# Cell 1 — Setup (MPS device, imports, seeds)
# ================================================================
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from tensorflow.keras import callbacks
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.utils import set_random_seed, to_categorical
from torch.utils.data import Dataset, DataLoader

SEED = 42
# Seed every random number generator this notebook relies on. The models
# further down are built with Keras, so the Keras/TensorFlow generator has to
# be seeded as well; seeding only random/numpy/torch leaves the weight
# initialisation different on every run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
set_random_seed(SEED)

# PyTorch device handle (Apple "mps" backend when available, otherwise CPU).
# NOTE(review): nothing visible in this notebook passes `device` to the Keras
# models, so it does not control where they are trained — confirm it is needed.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


Using device: mps


In [2]:
# Cell 2 — locate the FASTA file (checks the usual Kaggle, local and /mnt/data locations)
candidate_paths = [
    "/kaggle/input/vista-sequence/vista_sequences.fasta",
    "Vista_Dataset/vista_sequences.fasta",
    "/mnt/data/vista_sequence.fasta",
    "/kaggle/working/vista_sequence.fasta",
    "/kaggle/input/vista-enhancers/vista_sequence.fasta",
]
# The first candidate that exists on disk wins; None means nothing was found.
fasta_path = next(
    (candidate for candidate in candidate_paths if os.path.exists(candidate)),
    None,
)
if fasta_path is None:
    tried = "\n".join(candidate_paths)
    raise FileNotFoundError(f"vista_sequence.fasta not found. Tried paths:\n" + tried)
print("Using fasta file:", fasta_path)


Using fasta file: Vista_Dataset/vista_sequences.fasta


In [3]:
# Cell 3 — parse FASTA and build dataframe
def parse_fasta_to_records(path):
    """Read a FASTA file into a list of ``(header, sequence)`` tuples.

    A line starting with ``>`` opens a new record; its text after the ``>`` is
    kept as the header. All following non-blank lines up to the next header
    are concatenated and upper-cased to form the sequence. Blank lines are
    skipped, and sequence lines that appear before the first header are
    discarded.

    The header is returned as-is: extracting species / enhancer labels from it
    is left to the caller and depends on the header layout of the file.
    """
    parsed = []
    current_header = None
    chunks = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text == "":
                continue
            if not text.startswith(">"):
                chunks.append(text)
                continue
            # A new header closes the record that was being collected (if any).
            if current_header is not None:
                parsed.append((current_header, "".join(chunks).upper()))
            current_header = text[1:]
            chunks = []
    # The last record has no following header to close it.
    if current_header is not None:
        parsed.append((current_header, "".join(chunks).upper()))
    return parsed

records = parse_fasta_to_records(fasta_path)
print("Parsed sequences:", len(records))
# Show the first few headers so the species / label layout can be checked by eye.
for header_text, _sequence in records[:5]:
    print("H:", header_text)


Parsed sequences: 3408
H: Human|chr16:86430087-86430726 | element 1 | positive  | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
H: Human|chr16:85620095-85621736 | element 2 | negative
H: Human|chr16:80423343-80424652 | element 3 | negative
H: Human|chr16:80372593-80373755 | element 4 | positive  | neural tube[6/10] | hindbrain (rhombencephalon)[10/10] | midbrain (mesencephalon)[10/10]
H: Human|chr16:79969907-79971297 | element 5 | negative


In [4]:
# Cell 4 — build dataframe with label extraction (parsed from the '|'-separated header)
def _header_fields(header):
    """Split a FASTA header on '|' and return the stripped, lower-cased fields."""
    return [field.strip().lower() for field in header.split("|")]


def _infer_species(header):
    """Return 'human', 'mouse' or 'unknown' for one FASTA header.

    An exact match on one of the '|'-separated fields is preferred. Only when
    no field matches does the function fall back to a substring search, because
    short tokens such as "hs" or "mm" can occur by accident inside other text.
    """
    fields = _header_fields(header)
    if any(field in ("human", "hs") for field in fields):
        return "human"
    if any(field in ("mouse", "mm") for field in fields):
        return "mouse"
    # Best-effort fallback for headers that do not follow the field layout.
    lowered = header.lower()
    if "human" in lowered or "hs" in lowered:
        return "human"
    if "mouse" in lowered or "mm" in lowered:
        return "mouse"
    return "unknown"


def _infer_enhancer_flag(header):
    """Return 1 (enhancer), 0 (non-enhancer) or None (undetermined) for one header.

    As for the species, a whole-field match ("positive" / "negative", ...) wins
    over the substring heuristics, which are kept only as a fallback.
    """
    fields = _header_fields(header)
    if any(field in ("positive", "enhancer", "pos") for field in fields):
        return 1
    if any(field in ("negative", "neg", "non", "not") for field in fields):
        return 0
    # Best-effort fallback for headers that do not follow the field layout.
    lowered = header.lower()
    if "enhancer" in lowered or "positive" in lowered or "pos" in lowered:
        return 1
    if "non" in lowered or "negative" in lowered or "neg" in lowered or "not" in lowered:
        return 0
    return None


rows = [
    {
        "header": header,
        "sequence": seq,
        "species": _infer_species(header),
        "enhancer": _infer_enhancer_flag(header),
    }
    for header, seq in records
]

df = pd.DataFrame(rows)
print(df.shape)
df['seq_len'] = df['sequence'].str.len()
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df['seq_len'].describe())
print(df['species'].value_counts(dropna=False))
print("Enhancer flag counts:", df['enhancer'].value_counts(dropna=False))
# Surface headers that could not be labelled so the header format can be checked.
unresolved = df[(df['species'] == "unknown") | df['enhancer'].isna()]
if len(unresolved) > 0:
    print(f"{len(unresolved)} headers could not be fully labelled; sample:")
    print(unresolved['header'].head().to_string())
df.head(8)


(3408, 4)
species
human    2002
mouse    1406
Name: count, dtype: int64
Enhancer flag counts: enhancer
1    1750
0    1658
Name: count, dtype: int64


Unnamed: 0,header,sequence,species,enhancer,seq_len
0,Human|chr16:86430087-86430726 | element 1 | po...,AACTGAAGGGACCCCGTTAGCATATAAACAAAAGGTGGGGGGTAGC...,human,1,640
1,Human|chr16:85620095-85621736 | element 2 | ne...,GGCCCTGGTATGTTTGTTCTTCCAGGGGCTCCCAGGATGGATCCAG...,human,0,1642
2,Human|chr16:80423343-80424652 | element 3 | ne...,AAGATTGCCATTTGGGGTGTTTCTTGGGGCTAAGAACCATGAAGAC...,human,0,1310
3,Human|chr16:80372593-80373755 | element 4 | po...,GTGACAGAGACAGACAGTGACAGAGACAGATTTTAGAATTTGAACA...,human,1,1163
4,Human|chr16:79969907-79971297 | element 5 | ne...,TGACACCCACTATTATCCAGTCCTTGATAAACCTCTTTATTTGTTC...,human,0,1391
5,Human|chr16:79949950-79951518 | element 6 | ne...,AGTCACCCAGGTGGTAGTGGGCTGCAGATGCTGTGGGTTTTGTTTC...,human,0,1569
6,Human|chr16:79026563-79028162 | element 7 | ne...,ACAGAAGCCTCAAGCCTAACCAACAAGAAAGATCACTTCATATGCA...,human,0,1600
7,Human|chr16:78933253-78934686 | element 9 | ne...,TTGTTCCGGAAACCTAACTCCAAATCTTTGAACTTCCTAGAAACCT...,human,0,1434


In [5]:
# Cell 5 — filter & prepare labels for scenarios
# Scenario 1 works on enhancer sequences only (the paper's first scenario:
# human vs mouse among enhancers). Copy so the subset is independent of df.
is_enhancer = df['enhancer'] == 1
df_enhancers = df.loc[is_enhancer].copy()
print("Enhancer sequences (for scenario1):", len(df_enhancers))

# Scenario 2: all sequences -> classes: human_enhancer, mouse_enhancer, no_enhancer
def class_label_row(r):
    """Map one dataframe row to its scenario-2 class id.

    Returns 0 for a human enhancer, 1 for a mouse enhancer and 2 for
    everything else. "Everything else" covers non-enhancers, but also rows
    whose enhancer flag is missing and enhancers whose species is neither
    "human" nor "mouse".
    """
    if r.enhancer != 1:
        return 2  # no enhancer (or flag missing)
    species_to_class = {"human": 0, "mouse": 1}
    return species_to_class.get(r.species, 2)

# Attach the scenario-2 class id to every row, then show the class balance.
scenario2_labels = df.apply(class_label_row, axis=1)
df['class_s2'] = scenario2_labels
print(df['class_s2'].value_counts())


Enhancer sequences (for scenario1): 1750
class_s2
2    1658
0    1029
1     721
Name: count, dtype: int64


In [6]:
# Cell 6 — encoding schemes
# Per-base lookup tables, one per encoding scheme. 'N' maps to zero in all of them.
INT_MAP = {
    'A': 1,
    'C': 3,
    'G': 2,
    'T': 4,
    'N': 0,
}
ATOMIC_MAP = {
    'A': 70,
    'C': 58,
    'G': 78,
    'T': 66,
    'N': 0,
}
EIIP_MAP = {
    'A': 0.1260,
    'C': 0.1340,
    'G': 0.0806,
    'T': 0.1335,
    'N': 0.0,
}

# BFDNA: per-sequence frequencies (the paper uses for each base the frequency across the whole sequence;
# then every position mapped to that base's frequency value).
def encode_sequence_integer(seq):
    """Translate every symbol of ``seq`` through ``INT_MAP``; unmapped symbols become 0."""
    codes = []
    for base in seq:
        codes.append(INT_MAP.get(base, 0))
    return codes

def encode_sequence_atomic(seq):
    """Translate every symbol of ``seq`` through ``ATOMIC_MAP``; unmapped symbols become 0."""
    return list(map(lambda base: ATOMIC_MAP.get(base, 0), seq))

def encode_sequence_eiip(seq):
    """Translate every symbol of ``seq`` through ``EIIP_MAP``; unmapped symbols become 0.0."""
    values = []
    for base in seq:
        values.append(EIIP_MAP.get(base, 0.0))
    return values

def encode_sequence_bfdna(seq):
    """Encode ``seq`` by replacing each base with that base's frequency in ``seq``.

    The frequency of A, C, G and T is the number of occurrences divided by the
    full sequence length (other symbols count towards the length). Every
    position is then mapped to the frequency of the base found there; symbols
    other than A/C/G/T map to 0.0. An empty sequence yields an empty list.
    """
    total = len(seq)
    base_frequency = {}
    for base in ('A', 'C', 'G', 'T'):
        occurrences = sum(symbol == base for symbol in seq)
        base_frequency[base] = occurrences / total if total > 0 else 0.0
    return [base_frequency.get(symbol, 0.0) for symbol in seq]

# Registry: encoding-scheme name -> function turning a sequence into a list of numbers.
ENCODERS = dict(
    integer=encode_sequence_integer,
    atomic=encode_sequence_atomic,
    eiip=encode_sequence_eiip,
    bfdna=encode_sequence_bfdna,
)


In [7]:
# Cell 7 — prepare encoded arrays, pad sequences to max_len and min-max normalize per scheme
from tensorflow.keras.preprocessing.sequence import pad_sequences
def prepare_encoded_array(seqs, encoder_name, max_len_cap=2000):
    """Encode, truncate, pad and min-max normalise a collection of sequences.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to encode.
    encoder_name : str
        Key into ``ENCODERS`` selecting the encoding function.
    max_len_cap : int or None
        Encoded sequences longer than this are truncated to avoid running out
        of memory. A falsy value (``None`` or 0) disables the cap.

    Returns
    -------
    (padded, max_len)
        ``padded`` is a float32 array with one row per sequence, right-padded
        with 0.0 to ``max_len`` columns. ``max_len`` is the longest encoded
        length after truncation.

    Raises
    ------
    KeyError
        If ``encoder_name`` is not a key of ``ENCODERS``.
    ValueError
        If ``seqs`` is empty or every sequence is empty. These inputs used to
        surface as an opaque error from ``max()`` or from an array reduction.

    Notes
    -----
    The min and max used for normalisation are taken over the whole padded
    array, so the 0.0 padding takes part in them.
    """
    seqs = list(seqs)  # also makes the emptiness check safe for pandas/numpy inputs
    if not seqs:
        raise ValueError("prepare_encoded_array: no sequences to encode")
    enc = ENCODERS[encoder_name]
    encoded = [enc(s) for s in seqs]
    # Cap very long sequences
    if max_len_cap:
        encoded = [x[:max_len_cap] for x in encoded]
    max_len = max(len(x) for x in encoded)
    if max_len == 0:
        raise ValueError("prepare_encoded_array: all sequences are empty")
    padded = pad_sequences(encoded, maxlen=max_len, dtype='float32',
                           padding='post', truncating='post', value=0.0)
    # Min–max normalise across the whole dataset; skipped when the array is
    # constant to avoid dividing by zero.
    minv, maxv = padded.min(), padded.max()
    if maxv > minv:
        padded = (padded - minv) / (maxv - minv)
    return padded, max_len

# Example: BFDNA-encode every sequence in the dataframe and report the array shape
all_sequences = df['sequence'].tolist()
X_bfdna, maxlen_bfdna = prepare_encoded_array(all_sequences, 'bfdna')
print("BFDNA shape", X_bfdna.shape, "maxlen", maxlen_bfdna)


BFDNA shape (3408, 2000) maxlen 2000


In [8]:
# Cell 8 — helper: metric computations used in paper (CSI, G-mean)
def classification_metrics(y_true, y_pred, average='binary'):
    """Compute the classification metrics reported in the paper.

    Parameters
    ----------
    y_true, y_pred : 1-D label arrays
        Ground-truth and predicted class labels.
    average : str
        Averaging mode passed to precision / recall / F1.

    Returns
    -------
    dict
        Keys: 'accuracy', 'precision', 'recall', 'f1', 'CSI', 'G-mean', 'MCC'
        and 'Kappa'. 'CSI' and 'G-mean' are only defined for a two-class
        problem and are ``None`` otherwise.

    Notes
    -----
    CSI (= precision + TPR - 1, the paper's definition) and G-mean
    (= sqrt(TPR * specificity)) are derived from the confusion-matrix counts
    of the positive class, so they stay consistent with each other whatever
    ``average`` is. The positive class is the larger of the two labels, i.e.
    1 for 0/1 labels.

    The confusion matrix is only unpacked when it is 2x2. If a single label
    occurs in ``y_true`` and ``y_pred`` together, or the predictions contain a
    third label, the matrix has another shape and CSI / G-mean are reported
    as ``None`` instead of raising on the unpack.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average=average, zero_division=0)
    rec = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average, zero_division=0)

    csi = None
    gmean = None
    if average == 'binary' or len(np.unique(y_true)) == 2:
        cm = confusion_matrix(y_true, y_pred)
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            csi = ppv + tpr - 1
            gmean = math.sqrt(tpr * specificity)

    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    return {'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'CSI':csi, 'G-mean':gmean, 'MCC':mcc, 'Kappa':kappa}

# multiclass macro-average ROC AUC:
def multiclass_roc_auc_score(y_true, y_proba, average="macro"):
    """One-vs-rest ROC AUC for integer labels and an ``N x C`` probability matrix.

    Parameters
    ----------
    y_true : array-like of int, shape (N,)
        Class indices in ``0..C-1``.
    y_proba : array-like, shape (N, C)
        Predicted class probabilities.
    average : str
        Averaging mode forwarded to ``roc_auc_score``.

    Returns
    -------
    float or None
        The AUC, or ``None`` (after printing the reason) when it cannot be
        computed, e.g. when a class is absent from ``y_true``.

    Notes
    -----
    The one-hot matrix is built with numpy and sized from the number of
    columns of ``y_proba``. It therefore always matches the score matrix,
    even when the highest class is missing from ``y_true``.

    Only ``ValueError`` / ``IndexError`` are treated as "AUC not computable".
    Any other exception is a programming error and is allowed to propagate
    instead of being silently turned into ``None``.
    """
    try:
        scores = np.asarray(y_proba)
        labels = np.asarray(y_true).astype(int).ravel()
        one_hot = np.eye(scores.shape[1], dtype=int)[labels]
        return roc_auc_score(one_hot, scores, average=average, multi_class='ovr')
    except (ValueError, IndexError) as e:
        print("roc_auc_score error:", e)
        return None


In [None]:
# Cell 9 — model builders (scenario1: binary, scenario2: multiclass)
def build_bilstm_scenario1(input_shape):
    """Build and compile the binary (scenario 1) BiLSTM classifier.

    Architecture, following the paper: three bidirectional LSTM blocks with
    256, 128 and 64 units, each returning full sequences and followed by a
    SELU activation and dropout (0.15, 0.20, 0.20). Then batch normalisation,
    flatten, dense layers of 512, 256 and 128 SELU units, and a single
    sigmoid output unit.

    The returned model is already compiled with binary cross-entropy, a
    default RMSprop optimizer and the accuracy metric.
    """
    recurrent_blocks = ((256, 0.15), (128, 0.20), (64, 0.20))  # (LSTM units, dropout rate)
    dense_widths = (512, 256, 128)

    inp = layers.Input(shape=input_shape)
    x = inp
    for units, drop_rate in recurrent_blocks:
        x = layers.Bidirectional(layers.LSTM(units, return_sequences=True, activation='tanh'))(x)
        x = layers.Activation('selu')(x)
        x = layers.Dropout(drop_rate, seed=SEED)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Flatten()(x)
    for width in dense_widths:
        x = layers.Dense(width, activation='selu')(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(loss='binary_crossentropy', optimizer=optimizers.RMSprop(), metrics=['accuracy'])
    return model

def build_bilstm_scenario2(input_shape, n_classes):
    """Build and compile the multi-class (scenario 2) BiLSTM classifier.

    Architecture, following the paper: two bidirectional LSTM blocks with 128
    and 64 units, each returning full sequences and followed by a SELU
    activation and dropout (0.15, 0.20). Then batch normalisation, flatten,
    dense layers of 256 and 128 SELU units, and a softmax layer with
    ``n_classes`` outputs.

    The returned model is already compiled with categorical cross-entropy, a
    default Adam optimizer and the accuracy metric.
    """
    recurrent_blocks = ((128, 0.15), (64, 0.20))  # (LSTM units, dropout rate)
    dense_widths = (256, 128)

    inp = layers.Input(shape=input_shape)
    x = inp
    for units, drop_rate in recurrent_blocks:
        x = layers.Bidirectional(layers.LSTM(units, return_sequences=True, activation='tanh'))(x)
        x = layers.Activation('selu')(x)
        x = layers.Dropout(drop_rate, seed=SEED)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Flatten()(x)
    for width in dense_widths:
        x = layers.Dense(width, activation='selu')(x)
    out = layers.Dense(n_classes, activation='softmax')(x)

    model = models.Model(inputs=inp, outputs=out)
    model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(), metrics=['accuracy'])
    return model


In [10]:
# Cell 10 — training helper to train and evaluate a model; returns history and metrics
def train_and_evaluate_model(X, y, scenario=1, encoder_name='bfdna', batch_size=32, epochs=500):
    """Train a BiLSTM on a stratified 70/15/15 split and score it on the test part.

    Parameters
    ----------
    X : 2-D array, shape (samples, seq_len)
        Padded, encoded sequences. A trailing channel axis is added internally.
    y : 1-D array-like of int
        Scenario 1: binary labels (human=0, mouse=1).
        Otherwise: class indices ``0..C-1``.
    scenario : int
        1 selects the binary model; any other value selects the multi-class model.
    encoder_name : str
        Only used to name the output directory ``./outputs/<encoder>/scenario<n>``.
    batch_size, epochs : int
        Passed to ``model.fit``.

    Returns
    -------
    (model, history, results, (X_test, y_test, y_pred, y_pred_prob))
        ``results`` is a dict of test-set metrics. For the multi-class scenario
        ``y_test`` is one-hot encoded and 'CSI' / 'G-mean' are ``None``.

    Side effects
    ------------
    Writes ``best_model.h5``, ``training_log.csv``, ``history.csv`` and
    ``test_preds_and_truth.npz`` into the output directory.

    Notes
    -----
    * The one-hot targets are built with numpy and sized from the largest
      label (``max(y) + 1``), so a label set with a gap no longer breaks the
      encoding.
    * The builders already return compiled models; they are not compiled a
      second time here.
    * The weights with the lowest validation loss are restored before the
      test set is scored.
    """
    # X: padded 2D array (samples, seq_len) -> (samples, seq_len, 1) for the LSTM input
    X3 = np.expand_dims(X, -1)
    labels = np.asarray(y)

    if scenario == 1:
        targets = labels  # 0/1 labels are fed to the sigmoid output as-is
    else:
        n_classes = int(labels.max()) + 1
        targets = np.eye(n_classes, dtype='float32')[labels.astype(int)]

    # 70 % train, then the remaining 30 % split in half: 15 % validation / 15 % test.
    # The integer labels are carried through the first split so the second one
    # can be stratified on them in both scenarios.
    X_train, X_temp, y_train, y_temp, _, labels_temp = train_test_split(
        X3, targets, labels, test_size=0.30, random_state=SEED, stratify=labels)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=labels_temp)

    if scenario == 1:
        model = build_bilstm_scenario1(input_shape=X3.shape[1:])
    else:
        model = build_bilstm_scenario2(input_shape=X3.shape[1:], n_classes=n_classes)

    # Callbacks
    outdir = f"./outputs/{encoder_name}/scenario{scenario}"
    os.makedirs(outdir, exist_ok=True)
    best_path = os.path.join(outdir, "best_model.h5")
    ckpt = callbacks.ModelCheckpoint(best_path, monitor='val_loss', save_best_only=True, verbose=1)
    csvlog = callbacks.CSVLogger(os.path.join(outdir, "training_log.csv"))
    # Paper trained full 500 epochs — we avoid EarlyStopping to be faithful, but you can enable it.
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[ckpt, csvlog], verbose=2)
    # Restore the checkpoint with the best validation loss
    model.load_weights(best_path)

    if scenario == 1:
        y_pred_prob = model.predict(X_test).ravel()
        y_pred = (y_pred_prob >= 0.5).astype(int)
        y_true = y_test
        results = classification_metrics(y_true, y_pred, average='binary')
        results['AUC'] = roc_auc_score(y_true, y_pred_prob)
    else:
        y_pred_prob = model.predict(X_test)
        y_pred = np.argmax(y_pred_prob, axis=1)
        y_true = np.argmax(y_test, axis=1)
        auc_score = multiclass_roc_auc_score(y_true, y_pred_prob)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        mcc = matthews_corrcoef(y_true, y_pred)
        kappa = cohen_kappa_score(y_true, y_pred)
        # The paper prints single CSI / G-mean scores; they are left as None
        # for the multi-class case (they could be computed per class).
        results = {'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'CSI':None, 'G-mean':None, 'MCC':mcc, 'Kappa':kappa, 'AUC':auc_score}

    # save predictions & test y
    np.savez(os.path.join(outdir, "test_preds_and_truth.npz"), y_true=y_true, y_pred=y_pred, y_pred_prob=y_pred_prob)
    # save training history
    pd.DataFrame(history.history).to_csv(os.path.join(outdir, "history.csv"), index=False)
    return model, history, results, (X_test, y_test, y_pred, y_pred_prob)


In [None]:
# Cell 11 — full pipeline loop over encoders and both scenarios (warning: heavy; you can run one encoder at a time)
encoders = ['integer','atomic','eiip','bfdna']
all_results = {'scenario1':{}, 'scenario2':{}}

# --- Inputs that do not depend on the encoder: computed once, outside the loop ---
all_sequences = df['sequence'].tolist()
# class_s2 is already in df (0 human enhancer, 1 mouse enhancer, 2 no enhancer)
classes_s2 = df['class_s2'].values

# Scenario 1 uses only enhancer sequences and the species labels among them.
df_e = df[df['enhancer']==1].reset_index(drop=True)
species_codes = df_e['species'].map({'human':0,'mouse':1})
# Enhancers whose species is neither human nor mouse are dropped explicitly;
# filling them with 0 would silently train on them as "human".
n_unknown = int(species_codes.isna().sum())
if n_unknown:
    print(f"Dropping {n_unknown} enhancer sequences with unknown species from scenario 1")
    df_e = df_e[species_codes.notna()].reset_index(drop=True)
    species_codes = species_codes.dropna()
species_map = species_codes.astype(int).values
enhancer_sequences = df_e['sequence'].tolist()

# Make sure the summary CSV below has a directory to land in.
os.makedirs("./outputs", exist_ok=True)

for encoder in encoders:
    print("\n\n### Encoder:", encoder)
    # Train scenario1
    X_e, _ = prepare_encoded_array(enhancer_sequences, encoder)
    print("Training Scenario 1 (human vs mouse enhancers) for encoder", encoder)
    model1, hist1, res1, testinfo1 = train_and_evaluate_model(X_e, species_map, scenario=1, encoder_name=encoder, batch_size=16, epochs=10)
    all_results['scenario1'][encoder] = res1
    print("Scenario1 results:", res1)
    # Scenario2: multiclass. The full dataset is encoded once per encoder,
    # and only here, where it is actually used.
    print("Training Scenario 2 (human enhancer / mouse enhancer / no enhancer) for encoder", encoder)
    X_all, _ = prepare_encoded_array(all_sequences, encoder)
    model2, hist2, res2, testinfo2 = train_and_evaluate_model(X_all, classes_s2, scenario=2, encoder_name=encoder, batch_size=32, epochs=10)
    all_results['scenario2'][encoder] = res2
    print("Scenario2 results:", res2)
    # Save intermediate results (cumulative over the encoders processed so far)
    pd.DataFrame(all_results).to_csv(f"./outputs/{encoder}_summary_results.csv")




### Encoder: integer
Training Scenario 1 (human vs mouse enhancers) for encoder integer
Epoch 1/10

Epoch 1: val_loss improved from None to 2.63610, saving model to ./outputs/integer/scenario1/best_model.h5




77/77 - 288s - 4s/step - accuracy: 0.5184 - loss: 3.4620 - val_accuracy: 0.4122 - val_loss: 2.6361
Epoch 2/10

Epoch 2: val_loss did not improve from 2.63610
77/77 - 317s - 4s/step - accuracy: 0.5429 - loss: 1.1237 - val_accuracy: 0.5878 - val_loss: 3.2987
Epoch 3/10

Epoch 3: val_loss improved from 2.63610 to 2.53238, saving model to ./outputs/integer/scenario1/best_model.h5




77/77 - 360s - 5s/step - accuracy: 0.4971 - loss: 1.0098 - val_accuracy: 0.4122 - val_loss: 2.5324
Epoch 4/10


In [None]:
# Cell 12 — plotting helpers (example: training curves, ROC, confusion matrix)
import itertools
def plot_training(history_csv_path=None, history_obj=None, outpath=None):
    """Plot training / validation loss and accuracy curves side by side.

    Parameters
    ----------
    history_csv_path : str or None
        CSV file with one column per metric and one row per epoch. Only read
        when ``history_obj`` is not given.
    history_obj : object or None
        Object exposing a ``.history`` dict of per-epoch metric lists (e.g. the
        return value of ``model.fit``). Takes precedence over the CSV.
    outpath : str or None
        If given, the figure is saved there before being shown.

    Raises
    ------
    ValueError
        If neither ``history_obj`` nor ``history_csv_path`` is provided.

    Notes
    -----
    The CSV is converted with ``orient='list'`` so that every metric is a
    plain list of values; the default conversion gives nested
    ``{column: {row: value}}`` dicts, which cannot be plotted. Validation
    curves are drawn only when the corresponding ``val_*`` entry exists.
    """
    if history_obj is not None:
        history = history_obj.history
    elif history_csv_path is not None:
        history = pd.read_csv(history_csv_path).to_dict(orient='list')
    else:
        raise ValueError("plot_training needs either history_obj or history_csv_path")

    plt.figure(figsize=(10,4))
    if 'loss' in history:
        plt.subplot(1,2,1)
        plt.plot(history['loss'], label='train_loss')
        if 'val_loss' in history:
            plt.plot(history['val_loss'], label='val_loss')
        plt.legend(); plt.title("Loss")
    if 'accuracy' in history:
        plt.subplot(1,2,2)
        plt.plot(history['accuracy'], label='train_acc')
        if 'val_accuracy' in history:
            plt.plot(history['val_accuracy'], label='val_acc')
        plt.legend(); plt.title("Accuracy")
    if outpath:
        plt.savefig(outpath)
    plt.show()

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues, outpath=None):
    """Draw ``cm`` as an annotated heatmap with ``classes`` as tick labels.

    Cells are annotated with the integer format ``'d'``. Rows are labelled as
    the true class and columns as the predicted class. When ``outpath`` is
    given the figure is saved there before it is shown.
    """
    plt.figure(figsize=(5,4))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=classes,
        yticklabels=classes,
    )
    sns.heatmap(cm, **heatmap_options)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title(title)
    if outpath:
        plt.savefig(outpath)
    plt.show()

def plot_roc_binary(y_true, y_prob, outpath=None):
    """Plot the ROC curve of a binary classifier and annotate it with its AUC.

    Parameters
    ----------
    y_true : 1-D array-like
        Ground-truth binary labels.
    y_prob : 1-D array-like
        Predicted score / probability of the positive class.
    outpath : str or None
        If given, the figure is saved there before being shown.

    Notes
    -----
    Relies on ``roc_curve`` and ``auc`` from ``sklearn.metrics``, which are
    imported with the other dependencies in the setup cell.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
    # Diagonal = performance of a random classifier, for reference.
    plt.plot([0,1],[0,1],'--')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend()
    if outpath:
        plt.savefig(outpath)
    plt.show()
