In [1]:
# !pip install tensorflow==2.17.0 sklearn matplotlib seaborn

import numpy as np
import re
import random
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, cohen_kappa_score, matthews_corrcoef,
    roc_auc_score, roc_curve, auc
)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)


# Make sure GPU is used if available
print("Physical GPUs:", tf.config.list_physical_devices("GPU"))

Physical GPUs: []


In [3]:
def infer_label(header):
    h = header.lower()
    if h.startswith("human"): return "human"
    if h.startswith("mouse"): return "mouse"
    if "homo" in h or "hs" in h: return "human"
    if "mus" in h or "mm" in h: return "mouse"
    return "human" if "human" in h else "mouse"

def load_fasta(path):
    seqs, labs = [], []
    buf, lab = [], None
    with open(path) as f:
        for line in f:
            line=line.strip()
            if not line: continue
            if line.startswith(">"):
                if buf:
                    seqs.append("".join(buf).upper())
                    labs.append(lab)
                buf=[]
                lab=infer_label(line[1:])
            else:
                buf.append(line)
        if buf:
            seqs.append("".join(buf).upper())
            labs.append(lab)
    return seqs, labs

seqs, labs = load_fasta("Vista_Dataset/vista_sequences.fasta")
print(len(seqs), len(labs))


3408 3408


In [4]:
MAXLEN = 1000

def trim(s):
    if len(s) <= MAXLEN: return s
    mid=len(s)//2
    start=mid-MAXLEN//2
    return s[start:start+MAXLEN]

seqs=[trim(s) for s in seqs]
print("max length:", max(len(s) for s in seqs))

max length: 1000


In [5]:
INTEGER={"A":1,"C":3,"G":2,"T":4}
ATOMIC={"A":70,"C":58,"G":78,"T":66}
EIIP={"A":0.1260,"C":0.1340,"G":0.0806,"T":0.1335}

def encode_map(seq, M):
    return [M.get(nt,0) for nt in seq]

def encode_bfdna(seq):
    c = Counter(seq)
    tot = len(seq)
    f = {b:c.get(b,0)/tot for b in "ACGT"}
    return [f.get(nt,0) for nt in seq]

def encode(seqs, scheme):
    if scheme=="integer": M=INTEGER
    if scheme=="atomic": M=ATOMIC
    if scheme=="eiip": M=EIIP
    if scheme=="bfdna": M=None

    X=np.zeros((len(seqs), MAXLEN), np.float32)
    for i, s in enumerate(seqs):
        if scheme=="bfdna": enc = encode_bfdna(s)
        else: enc = encode_map(s, M)
        X[i,:len(enc)]=enc

    # normalize
    m = X!=0
    if np.any(m):
        mn, mx = X[m].min(), X[m].max()
        if mx>mn:
            X[m]=(X[m]-mn)/(mx-mn)
    return X[...,None]


In [6]:
le = LabelEncoder()
y = le.fit_transform(labs)

idx=np.arange(len(y))
train_idx, test_idx = train_test_split(idx, test_size=0.15, stratify=y, random_state=SEED)
train_idx, val_idx = train_test_split(train_idx, test_size=0.15/0.85, stratify=y[train_idx], random_state=SEED)

In [7]:
def build_fast_model(input_shape):

    inp = layers.Input(shape=input_shape)

    # CNN reduces length drastically
    x = layers.Conv1D(64, 7, padding="same", activation="selu")(inp)
    x = layers.MaxPooling1D(4)(x)

    x = layers.Conv1D(128, 5, padding="same", activation="selu")(x)
    x = layers.MaxPooling1D(4)(x)

    # BiLSTM (light)
    x = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(16))(x)

    x = layers.Dense(64, activation="selu")(x)
    x = layers.Dense(32, activation="selu")(x)

    out = layers.Dense(1, activation="sigmoid")(x)

    model = models.Model(inp, out)
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )
    return model


In [8]:
# ===============================
# METRICS & PLOTS
# ===============================

def compute_metrics(y_true, y_prob, t=0.5):
    y_pred = (y_prob >= t).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    sp = tn/(tn+fp) if tn+fp>0 else 0
    rc = tp/(tp+fn) if tp+fn>0 else 0

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": rc,
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "CSI": precision_score(y_true, y_pred,zero_division=0) + rc - 1,
        "G-mean": np.sqrt(sp * rc),
        "MCC": matthews_corrcoef(y_true, y_pred),
        "Kappa": cohen_kappa_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_prob),
        "CM": cm
    }


def plot_cm(cm, scheme):
    sns.heatmap(cm, annot=True, fmt="d", cbar=False,
                xticklabels=label_enc.classes_,
                yticklabels=label_enc.classes_)
    plt.title(f"Confusion Matrix – {scheme}")
    plt.show()


def plot_roc(y_true, y_prob, scheme):
    y_bin = label_binarize(y_true, classes=[0,1])
    plt.figure(figsize=(5,5))
    fpr, tpr, _ = roc_curve(y_bin.ravel(), np.vstack([1-y_prob, y_prob]).T.ravel())
    auc_val = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}")
    plt.plot([0,1],[0,1],'k--')
    plt.legend()
    plt.title(f"ROC – {scheme}")
    plt.show()


In [10]:
encodings = ["integer","atomic","eiip","bfdna"]
results={}

for scheme in encodings:
    print("\n=== Encoding:", scheme,"===")

    X = encode(seqs, scheme)
    X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
    y_train_, y_val_, y_test_ = y[train_idx], y[val_idx], y[test_idx]

    model = build_fast_model(X_train.shape[1:])
    model.summary()

    history = model.fit(
        X_train, y_train_,
        validation_data=(X_val, y_val_),
        epochs=30,         # <<< Very fast + good accuracy
        batch_size=32,
        verbose=1,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
    )

    y_prob = model.predict(X_test).ravel()
    # call the metrics function defined above
    m = compute_metrics(y_test_, y_prob)
    results[scheme] = m
    print(m)

    # compute_metrics stores confusion matrix under "CM"
    plot_cm(m["CM"], f"CM {scheme}")
    plot_roc(y_test_, y_prob, f"ROC {scheme}")



=== Encoding: integer ===


Epoch 1/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - accuracy: 0.5797 - auc: 0.5401 - loss: 0.6764 - val_accuracy: 0.5938 - val_auc: 0.6522 - val_loss: 0.6594
Epoch 2/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.5881 - auc: 0.5784 - loss: 0.6720 - val_accuracy: 0.5410 - val_auc: 0.6000 - val_loss: 0.6734
Epoch 3/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.6133 - auc: 0.6156 - loss: 0.6601 - val_accuracy: 0.5938 - val_auc: 0.6582 - val_loss: 0.6580
Epoch 4/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.6338 - auc: 0.6587 - loss: 0.6409 - val_accuracy: 0.6133 - val_auc: 0.6908 - val_loss: 0.6299
Epoch 5/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.6602 - auc: 0.6923 - loss: 0.6219 - val_accuracy: 0.6426 - val_auc: 0.6901 - val_loss: 0.6253
Epoch 6/30
[1m75/75[0m [32m━━━━━

NameError: name 'label_enc' is not defined

In [None]:
import pandas as pd

rows = []
for scheme, m in results.items():
    rows.append({
        "Encoding": scheme,
        "Accuracy": m["accuracy"],
        "Precision": m["precision"],
        "Recall": m["recall"],
        "F1-score": m["f1"],
        "CSI": m["CSI"],
        "G-mean": m["G-mean"],
        "MCC": m["MCC"],
        "Kappa": m["Kappa"],
        "AUC": m["AUC"],
    })

df_results = pd.DataFrame(rows)
print("\n\nSummary (Scenario 1, all encodings):")
print(df_results.to_string(index=False, float_format=lambda x: f"{x*100:5.2f}%"))
