In [37]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, cohen_kappa_score, matthews_corrcoef,
    roc_auc_score, roc_curve, auc
)

print("TensorFlow:", tf.__version__)

# Reproducibility (best-effort)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Make sure GPU is used if available
print("Physical GPUs:", tf.config.list_physical_devices("GPU"))

TensorFlow: 2.19.0
Physical GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [38]:
# ===============================
# UNIVERSAL VISTA FASTA PARSER
# ===============================

def infer_label_from_header(header: str):
    """Detect species from ANY VISTA header style."""
    h = header.lower()

    # Primary patterns
    if h.startswith("human"): return "human"
    if h.startswith("mouse"): return "mouse"

    # Secondary patterns
    if "homo" in h or "hsap" in h or "hs|" in h: return "human"
    if "mus" in h or "mm|" in h or "mmus" in h: return "mouse"

    # Fallback → Look at first | segment
    first = header.split("|")[0].lower()
    if "human" in first: return "human"
    if "mouse" in first: return "mouse"

    raise ValueError("Cannot identify species from header: " + header)


def load_fasta(path):
    sequences = []
    labels = []
    seq_buffer = []
    current_label = None

    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if line.startswith(">"):
                # Save previous sequence
                if seq_buffer and current_label is not None:
                    sequences.append("".join(seq_buffer).upper())
                    labels.append(current_label)

                header = line[1:].strip()
                current_label = infer_label_from_header(header)
                seq_buffer = []
            else:
                # Clean sequence lines
                seq_buffer.append(line.replace(" ", "").replace("\t", ""))

        # SAVE LAST
        if seq_buffer and current_label is not None:
            sequences.append("".join(seq_buffer).upper())
            labels.append(current_label)

    return sequences, labels


# USE IT
sequences, labels = load_fasta("vista_sequences.fasta")

print("Sequences loaded:", len(sequences))
print("Labels loaded:", len(labels))
print("Unique labels:", set(labels))


Sequences loaded: 3408
Labels loaded: 3408
Unique labels: {'mouse', 'human'}


In [39]:
# ===============================
# ENCODING FUNCTIONS
# ===============================

INTEGER = {"A":1,"C":3,"G":2,"T":4}
ATOMIC  = {"A":70,"C":58,"G":78,"T":66}
EIIP    = {"A":0.1260,"C":0.1340,"G":0.0806,"T":0.1335}

def encode_map(seq, MAP):
    return [MAP.get(nt, 0.0) for nt in seq]

def encode_bfdna(seq):
    count = Counter(seq)
    total = len(seq)
    freqs = {b: count.get(b,0)/total for b in "ACGT"}
    return [freqs.get(nt,0) for nt in seq]

def encode_sequences(seq_list, scheme):
    if scheme == "integer": M = INTEGER
    if scheme == "atomic":  M = ATOMIC
    if scheme == "eiip":    M = EIIP
    if scheme == "bfdna":   M = None

    max_len = max(len(s) for s in seq_list)
    X = np.zeros((len(seq_list), max_len), dtype=np.float32)

    for i, seq in enumerate(seq_list):
        if scheme == "bfdna":
            enc = encode_bfdna(seq)
        else:
            enc = encode_map(seq, M)

        X[i, :len(enc)] = enc

    # Normalize (ignore zeros)
    mask = X != 0
    if np.any(mask):
        vmin, vmax = X[mask].min(), X[mask].max()
        if vmax > vmin:
            X[mask] = (X[mask] - vmin) / (vmax - vmin)

    return X[..., np.newaxis]


In [40]:
# ===============================
# TRAIN / VAL / TEST
# ===============================

label_enc = LabelEncoder()
y = label_enc.fit_transform(labels)

# FIRST SPLIT → test 15%
idx = np.arange(len(y))
trainval_idx, test_idx = train_test_split(
    idx, test_size=0.15, random_state=SEED, stratify=y
)

# SECOND SPLIT → val 15% of total = 0.176 of trainval
val_ratio = 0.15 / 0.85

train_idx, val_idx = train_test_split(
    trainval_idx, test_size=val_ratio,
    random_state=SEED, stratify=y[trainval_idx]
)

print("Train:", len(train_idx), "Val:", len(val_idx), "Test:", len(test_idx))


Train: 2384 Val: 512 Test: 512


In [41]:
def build_model(input_shape):
    inp = layers.Input(shape=input_shape)
    x = layers.Masking(mask_value=0)(inp)

    x = layers.Bidirectional(layers.LSTM(
        64, activation="selu", return_sequences=True
    ))(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Bidirectional(layers.LSTM(
        32, activation="selu", return_sequences=False
    ))(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Dense(128, activation="selu")(x)
    x = layers.Dense(64, activation="selu")(x)

    out = layers.Dense(1, activation="sigmoid")(x)

    model = models.Model(inp, out)
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.RMSprop(1e-3),
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
    )
    return model


In [42]:
# ===============================
# METRICS & PLOTS
# ===============================

def compute_metrics(y_true, y_prob, t=0.5):
    y_pred = (y_prob >= t).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    sp = tn/(tn+fp) if tn+fp>0 else 0
    rc = tp/(tp+fn) if tp+fn>0 else 0

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": rc,
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "CSI": precision_score(y_true, y_pred,zero_division=0) + rc - 1,
        "G-mean": np.sqrt(sp * rc),
        "MCC": matthews_corrcoef(y_true, y_pred),
        "Kappa": cohen_kappa_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_prob),
        "CM": cm
    }


def plot_cm(cm, scheme):
    sns.heatmap(cm, annot=True, fmt="d", cbar=False,
                xticklabels=label_enc.classes_,
                yticklabels=label_enc.classes_)
    plt.title(f"Confusion Matrix – {scheme}")
    plt.show()


def plot_roc(y_true, y_prob, scheme):
    y_bin = label_binarize(y_true, classes=[0,1])
    plt.figure(figsize=(5,5))
    fpr, tpr, _ = roc_curve(y_bin.ravel(), np.vstack([1-y_prob, y_prob]).T.ravel())
    auc_val = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}")
    plt.plot([0,1],[0,1],'k--')
    plt.legend()
    plt.title(f"ROC – {scheme}")
    plt.show()


In [None]:
# ===============================
# SCENARIO 1 – RUN ALL 4 ENCODINGS
# ===============================

encodings = ["integer", "atomic", "eiip", "bfdna"]
results = {}

for scheme in encodings:
    print("\n========================")
    print("Encoding:", scheme)
    print("========================")

    X = encode_sequences(sequences, scheme)
    input_shape = X.shape[1:]

    X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
    y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

    model = build_model(input_shape)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=64,
        verbose=1,
        callbacks=[tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=25, restore_best_weights=True)]
    )

    y_prob = model.predict(X_test).ravel()
    m = compute_metrics(y_test, y_prob)
    results[scheme] = m

    print(m)

    plot_cm(m["CM"], scheme)
    plot_roc(y_test, y_prob, scheme)



Encoding: integer
Epoch 1/50
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m3s[0m 4s/step - accuracy: 0.5668 - auc: 0.4930 - loss: nan

In [None]:
import pandas as pd

rows = []
for scheme, m in results.items():
    rows.append({
        "Encoding": scheme,
        "Accuracy": m["accuracy"],
        "Precision": m["precision"],
        "Recall": m["recall"],
        "F1-score": m["f1"],
        "CSI": m["CSI"],
        "G-mean": m["G-mean"],
        "MCC": m["MCC"],
        "Kappa": m["Kappa"],
        "AUC": m["AUC"],
    })

df_results = pd.DataFrame(rows)
print("\n\nSummary (Scenario 1, all encodings):")
print(df_results.to_string(index=False, float_format=lambda x: f"{x*100:5.2f}%"))
