In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the taxonomy table from the Excel workbook (path is relative to the working directory).
# The rest of this cell reads its 'sequence' column, the taxonomy label columns
# and the *_confidence columns.
df = pd.read_excel("taxanomy_data.xlsx")

# Map nucleotides to integers.  Code 0 is shared by 'N' and by every symbol
# that is not listed here.
alphabet = {'A':1, 'C':2, 'G':3, 'T':4, 'N':0}
def encode_seq(seq):
    """Translate a nucleotide string into a list of integer codes.

    The sequence is upper-cased first so that soft-masked / lower-case bases
    ('a', 'c', 'g', 't') receive the same codes as their upper-case forms
    instead of silently collapsing to 0.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    list[int]
        One code per character; symbols missing from `alphabet` map to 0.
    """
    return [alphabet.get(ch, 0) for ch in seq.upper()]

# Integer-encode every sequence, then pad/truncate at the end so all rows share one length.
encoded = list(map(encode_seq, df['sequence']))
max_len = len(max(encoded, key=len))      # longest sequence; a fixed cutoff would also work
X = pad_sequences(
    encoded,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Prepare targets
# Categorical targets: fit one LabelEncoder per column (kept in `encoders` so
# predictions can be decoded later), then one-hot encode the integer codes.
targets_cat = ['kingdom','phylum','class','genus','species','novel_candidate']
encoders = {name: LabelEncoder().fit(df[name]) for name in targets_cat}
y_cat = [
    to_categorical(encoders[name].transform(df[name]))
    for name in targets_cat
]

# Numeric targets: the confidence columns as one 2-D array (rows x targets).
targets_num = [
    'kingdom_confidence',
    'phylum_confidence',
    'class_confidence',
    'genus_confidence',
    'species_confidence',
    'overall_confidence',
]
Y_num = df[targets_num].to_numpy()


In [None]:
from tensorflow.keras import layers, models, Input

# Shared trunk: embedding -> 1-D convolution -> global max pool -> dense.
# input_dim=5 covers the integer codes 0..4 produced by the sequence encoder.
inp = Input(shape=(max_len,), dtype='int32')
x = inp
for trunk_layer in (
    layers.Embedding(input_dim=5, output_dim=64),
    layers.Conv1D(128, 7, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
):
    x = trunk_layer(x)

# Categorical heads: one softmax per label column, sized from its one-hot width.
cat_heads = [
    layers.Dense(onehot.shape[1], activation='softmax', name=f'cat_{name}')(x)
    for name, onehot in zip(targets_cat, y_cat)
]

# Numeric heads: one scalar output per confidence column.
# NOTE(review): ReLU clamps predictions at 0 and can get stuck there; if the
# confidences are bounded in [0, 1] a sigmoid may fit better - confirm the range.
num_heads = [
    layers.Dense(1, activation='relu', name=f'num_{name}')(x)
    for name in targets_num
]

# Output order is categorical heads first, then numeric heads.
outputs = cat_heads + num_heads
model = models.Model(inputs=inp, outputs=outputs)


# Compile with one loss and one metric per named output head
losses = {}
metrics = {}
for name in targets_cat:
    losses[f'cat_{name}'] = 'categorical_crossentropy'
    metrics[f'cat_{name}'] = 'accuracy'
for name in targets_num:
    losses[f'num_{name}'] = 'mse'
    metrics[f'num_{name}'] = 'mae'

model.compile(optimizer='adam', loss=losses, metrics=metrics)



In [None]:
# Targets are passed positionally, so they must follow the model's output
# order: the one-hot categorical arrays first, then one 1-D column per
# numeric target.
# NOTE(review): per the Keras docs, validation_split holds out the LAST 20% of
# rows before shuffling - confirm the table is not sorted by taxon.
fit_targets = list(y_cat) + list(Y_num.T)
fit_options = dict(epochs=5, batch_size=32, validation_split=0.2)
model.fit(X, fit_targets, **fit_options)


Epoch 1/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 4s/step - cat_class_accuracy: 0.0894 - cat_class_loss: 2.7381 - cat_genus_accuracy: 0.0081 - cat_genus_loss: 3.0986 - cat_kingdom_accuracy: 0.2762 - cat_kingdom_loss: 1.0997 - cat_novel_candidate_accuracy: 0.8868 - cat_novel_candidate_loss: 0.5486 - cat_phylum_accuracy: 0.0861 - cat_phylum_loss: 2.3678 - cat_species_accuracy: 0.0431 - cat_species_loss: 3.0607 - loss: 15.9050 - num_class_confidence_loss: 0.2684 - num_class_confidence_mae: 0.4445 - num_genus_confidence_loss: 0.2231 - num_genus_confidence_mae: 0.3923 - num_kingdom_confidence_loss: 0.8709 - num_kingdom_confidence_mae: 0.9314 - num_overall_confidence_loss: 0.6568 - num_overall_confidence_mae: 0.8022 - num_phylum_confidence_loss: 0.7709 - num_phylum_confidence_mae: 0.8755 - num_species_confidence_loss: 0.1987 - num_species_confidence_mae: 0.3724 - val_cat_class_accuracy: 0.2061 - val_cat_class_loss: 2.5353 - val_cat_genus_accuracy: 0.1221 - val_cat_ge

<keras.src.callbacks.history.History at 0x7feb91f5bce0>

In [None]:
import pandas as pd
import numpy as np
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# ------------------------------------------------------------
# 1️⃣  Load data
# ------------------------------------------------------------
df = pd.read_excel("taxanomy_data.xlsx")   # or pd.read_csv("your_data.csv")
# Rows without a sequence or without a label cannot be used for this task.
df = df.dropna(subset=["sequence", "novel_candidate"])
sequences = df["sequence"].str.upper()

# convert 'yes' -> 1, 'no' -> 0  (case-insensitive, handles stray spaces)
novel_flags = (
    df["novel_candidate"]
    .astype(str)
    .str.strip()
    .str.lower()
    .map({"yes": 1, "no": 0})
)

# Anything other than yes/no maps to NaN; fail with a message that names the
# offending values instead of an opaque NaN-to-int cast error further down.
unmapped = novel_flags.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, "novel_candidate"].astype(str).unique())
    raise ValueError(
        f"Unexpected novel_candidate values (expected 'yes'/'no'): {bad_values}"
    )

labels = novel_flags.astype(int).values


# ------------------------------------------------------------
# 2️⃣  k-mer encoding
# ------------------------------------------------------------
k = 6  # choose k (4–8 is common)
alphabet = ["A", "C", "G", "T"]
# Every possible k-mer over the alphabet, and its column position in the feature vector.
kmers = ["".join(p) for p in product(alphabet, repeat=k)]
kmer_index = {kmer: i for i, kmer in enumerate(kmers)}

def kmer_count(seq):
    """Return the normalized k-mer frequency vector of a nucleotide string.

    The sequence is upper-cased here (matching the later definition of this
    function in the notebook), so callers need not pre-normalize case.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``len(kmers)``. Each entry is the number of
        windows equal to that k-mer divided by the total number of length-k
        windows. Windows containing symbols outside `alphabet` are not counted
        but still contribute to the denominator. Sequences shorter than k give
        an all-zero vector.
    """
    seq = seq.upper()
    n_windows = len(seq) - k + 1
    vec = np.zeros(len(kmers), dtype=np.float32)
    for i in range(n_windows):
        idx = kmer_index.get(seq[i:i+k])
        if idx is not None:
            vec[idx] += 1
    return vec / max(n_windows, 1)   # normalize by total k-mers

# One k-mer frequency row per sequence.
X = np.stack([kmer_count(s) for s in sequences])

# ------------------------------------------------------------
# 3️⃣  Train / test split
# ------------------------------------------------------------
# Stratified 80/20 split with a fixed seed.
split = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split

# Optional scaling (helps dense nets): statistics come from the training rows
# only and are then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ------------------------------------------------------------
# 4️⃣  Simple neural network classifier
# ------------------------------------------------------------
# Two hidden dense layers with dropout after the first; a single sigmoid unit
# gives the probability of the positive class.
n_features = X_train.shape[1]
layer_stack = [
    layers.Input(shape=(n_features,)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
]
model = models.Sequential(layer_stack)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Stop once the monitored quantity has not improved for 5 epochs and roll the
# weights back to the best epoch.
early = EarlyStopping(patience=5, restore_best_weights=True)

fit_options = dict(
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# ------------------------------------------------------------
# 5️⃣  Evaluate & predict
# ------------------------------------------------------------
# The model was compiled with a single metric, so evaluate() yields (loss, accuracy).
test_scores = model.evaluate(X_test, y_test, verbose=0)
loss, acc = test_scores
print(f"Test accuracy: {acc:.4f}")

# Predicted probability of the positive class, thresholded at 0.5
probs = model.predict(X_test)
preds = np.greater_equal(probs, 0.5).astype(int)
print(preds[:10])


Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 91ms/step - accuracy: 0.8692 - loss: 0.3909 - val_accuracy: 0.9712 - val_loss: 0.0931
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9819 - loss: 0.0688 - val_accuracy: 1.0000 - val_loss: 0.0070
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9966 - loss: 0.0120 - val_accuracy: 1.0000 - val_loss: 0.0032
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9946 - loss: 0.0091 - val_accuracy: 1.0000 - val_loss: 0.0030
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9982 - loss: 0.0030 - val_accuracy: 1.0000 - val_loss: 8.1789e-04
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 1.0000 - loss: 0.0025 - val_accuracy: 1.0000 - val_loss: 2.1182e-04
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

In [25]:
# k-mer features: enumerate every k-mer over the alphabet and give each one a
# fixed column in the feature vector.
k = 6
alphabet = ["A","C","G","T"]
kmers = list(map("".join, product(alphabet, repeat=k)))
kmer_index = dict(zip(kmers, range(len(kmers))))

def kmer_count(seq):
    """Return the normalized k-mer frequency vector (float32) of `seq`.

    The sequence is upper-cased, every length-k window that is a known k-mer
    is tallied, and the tallies are divided by the total number of windows
    (at least 1, so sequences shorter than k give an all-zero vector).
    Windows containing symbols outside the alphabet are skipped.
    """
    seq = seq.upper()
    n_windows = len(seq) - k + 1
    windows = (seq[start:start + k] for start in range(n_windows))
    hits = np.asarray(
        [kmer_index[w] for w in windows if w in kmer_index],
        dtype=np.intp,
    )
    tallies = np.bincount(hits, minlength=len(kmers)).astype(np.float32)
    return tallies / max(n_windows, 1)

# Build the feature matrix from a freshly loaded frame filtered to non-novel
# rows, i.e. the same filter the LightGBM cells below apply before encoding
# their labels.  Reading whatever `df` was left in the kernel made the row
# count of X depend on cell execution order, so X and y could disagree on a
# clean Restart & Run All.
known_df = pd.read_excel("taxanomy_data.xlsx")
known_df = known_df[known_df["novel_candidate"].str.strip().str.lower() == "no"]
X = np.vstack([kmer_count(s) for s in known_df["sequence"]])

In [26]:
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "kingdom"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"Kingdom accuracy: {acc:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266325
[LightGBM] [Info] Number of data points in the train set: 412, number of used features: 4096
[LightGBM] [Info] Start training from score -1.017077
[LightGBM] [Info] Start training from score -1.293636
[LightGBM] [Info] Start training from score -1.010388
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.664048	valid_1's multi_logloss: 0.733977
[100]	training's multi_logloss: 0.423121	valid_1's multi_logloss: 0.519785
[150]	training's multi_logloss: 0.275128	valid_1's multi_logloss: 0.383947
[200]	training's multi_logloss: 0.179566	valid_1's multi_logloss: 0.285327
[250]	training's multi_logloss: 0.119224	valid_1's multi_logloss: 0.218836
[300]	training's multi_logloss: 0.0794141	valid_1's multi_logloss: 0.176423
[350]	training's multi_logloss: 0.0

In [None]:
# NOTE(review): this cell repeats the kingdom-level run of the cell above;
# consider deleting one of the two.
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "kingdom"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"Kingdom accuracy: {acc:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.183061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266325
[LightGBM] [Info] Number of data points in the train set: 412, number of used features: 4096
[LightGBM] [Info] Start training from score -1.017077
[LightGBM] [Info] Start training from score -1.293636
[LightGBM] [Info] Start training from score -1.010388
Training until validation scores don't improve for 100 rounds
[50]	training's multi_logloss: 0.664048	valid_1's multi_logloss: 0.733977
[100]	training's multi_logloss: 0.423121	valid_1's multi_logloss: 0.519785
[150]	training's multi_logloss: 0.275128	valid_1's multi_logloss: 0.383947
[200]	training's multi_logloss: 0.179566	valid_1's multi_logloss: 0.285327
[250]	training's multi_logloss: 0.119224	valid_1's multi_logloss: 0.218836
[300]	training's multi_logloss: 0.0794141	valid_1's multi_logloss: 0.176423
[350]	training's multi_logloss: 0.0

In [None]:
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "phylum"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"phylum accuracy: {acc:.4f}")


In [None]:
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "class"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"class accuracy: {acc:.4f}")


In [None]:
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "genus"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"genus accuracy: {acc:.4f}")


In [None]:
# Load the dataset, keep only non-novel rows that have a sequence
df = pd.read_excel("taxanomy_data.xlsx")
df = df[df["novel_candidate"].str.strip().str.lower() == "no"]
df = df.dropna(subset=["sequence"])

# Taxonomy level modelled in this cell
target_col = "species"

# Features are built here from the SAME filtered frame as the labels.  Reusing
# a global X computed from an earlier, differently filtered `df` gives X and y
# different row counts on a fresh kernel.
X_known = np.vstack([kmer_count(s) for s in df["sequence"].str.upper()])

# Encode the labels (missing values become the literal class "nan")
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
assert len(X_known) == len(y), "feature / label row mismatch"

# NOTE(review): stratify=y makes train_test_split raise if any class has a
# single member - confirm the class counts at this level.
X_train, X_test, y_train, y_test = train_test_split(
    X_known, y, test_size=0.2, stratify=y, random_state=42
)
params={
    "objective": "multiclass",
    "num_class": len(le.classes_),
    "metric": "multi_logloss",
    "learning_rate": 0.01,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

train_data = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the accuracy printed below is measured on data that also chose the
# stopping round.
valid_data = lgb.Dataset(X_test, label=y_test)

gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[
        early_stopping(stopping_rounds=100),   # handles early stopping
        log_evaluation(period=50)              # prints every 50 rounds
    ]
)
# predict() returns one probability column per class; take the most likely one
preds = gbm.predict(X_test)
pred_labels = np.argmax(preds, axis=1)
acc = (pred_labels == y_test).mean()
print(f"species accuracy: {acc:.4f}")
