In [5]:
import pandas as pd
import numpy as np
import json, re
from Bio.Seq import Seq
from collections import Counter

# -----------------------------
# Load raw codon usage data
# -----------------------------
# Parse the whole file in one pass; pandas documents low_memory=False as the
# way to avoid chunk-wise (possibly mixed) dtype inference.
df_raw = pd.read_csv("codon_usage.csv", low_memory=False)

# Species key column: use "SpeciesID" when the file has it, otherwise fall
# back to the first column of the file.
if "SpeciesID" in df_raw.columns:
    species_id_col = "SpeciesID"
else:
    species_id_col = df_raw.columns[0]

# A codon column is any string header that, upper-cased, is exactly three
# letters drawn from U/A/C/G/T (so both RNA and DNA spellings are accepted).
codon_cols = []
for header in df_raw.columns:
    if isinstance(header, str) and re.fullmatch(r"[UACGT]{3}", header.upper()):
        codon_cols.append(header)

# RNA → DNA
# Work on a copy so df_raw stays untouched. Each codon column is written back
# under its upper-case DNA name (U replaced by T); values that cannot be
# parsed as numbers are coerced to NaN and then replaced by 0.0.
df = df_raw.copy()
dna_names = [header.upper().replace("U", "T") for header in codon_cols]
for header, dna_name in zip(codon_cols, dna_names):
    numeric = pd.to_numeric(df[header], errors="coerce")
    df[dna_name] = numeric.fillna(0.0)

# Unique DNA codon names in alphabetical order; later cells iterate in this order.
codon_cols_dna = sorted(set(dna_names))

# -----------------------------
# Long format
# -----------------------------
# Build the long table (one row per input row per codon) without a Python
# loop over DataFrame rows.
#
# Row order matches a nested "for each input row, for each codon in
# codon_cols_dna" traversal: SpeciesID values are repeated once per codon,
# while codon names and amino acids are tiled once per input row, and the
# frequency matrix is flattened row by row.
#
# Each codon is translated exactly once up front instead of once per
# (row, codon) pair; the amino acid depends only on the codon.
codon_to_aa = {cod: str(Seq(cod).translate()) for cod in codon_cols_dna}

n_species_rows = len(df)
n_codons = len(codon_cols_dna)

df_long = pd.DataFrame({
    "SpeciesID": np.repeat(df[species_id_col].to_numpy(), n_codons),
    "codon": np.tile(codon_cols_dna, n_species_rows),
    # Frequencies are always stored as floats, whatever dtype the parser
    # inferred for an individual codon column.
    "codon_freq": df[codon_cols_dna].to_numpy(dtype=float).ravel(),
    "aa": np.tile([codon_to_aa[cod] for cod in codon_cols_dna], n_species_rows),
})

# -----------------------------
# Preferred codon
# -----------------------------
# Average each codon's frequency over all long-table rows that share the same
# (SpeciesID, aa, codon) key before picking a winner. pivot_table's default
# aggregation is also the mean, so when one SpeciesID occurs on several input
# rows the preferred codon is chosen from the same averaged values that end
# up in the feature columns, rather than from a single un-aggregated row.
# When every SpeciesID is unique the averages equal the raw values and the
# result is unchanged.
codon_mean = (
    df_long
    .groupby(["SpeciesID", "aa", "codon"], as_index=False)["codon_freq"]
    .mean()
)

# For every (SpeciesID, aa) keep the codon with the highest mean frequency.
# idxmax returns the first maximal row; codon_mean is sorted by its group
# keys, so ties resolve to the alphabetically first codon.
preferred = (
    codon_mean.loc[
        codon_mean.groupby(["SpeciesID", "aa"])["codon_freq"].idxmax(),
        ["SpeciesID", "aa", "codon"],
    ]
    .rename(columns={"codon": "preferred_codon"})
)

# -----------------------------
# Pivot codon features
# -----------------------------
# Wide feature table: one row per (SpeciesID, aa), one column per codon.
# Cells with no matching long-table row are filled with 0; cells with several
# matching rows are combined by pivot_table's default aggregation.
# The preferred codon is attached with a left join, so every pivot row is kept.
pivot = pd.merge(
    pd.pivot_table(
        df_long,
        values="codon_freq",
        index=["SpeciesID", "aa"],
        columns="codon",
        fill_value=0,
    ).reset_index(),
    preferred,
    how="left",
    on=["SpeciesID", "aa"],
)

# -----------------------------
# Minimal BWT features (SAFE)
# -----------------------------
def bwt_transform(s):
    """Return the Burrows-Wheeler transform of ``s``.

    A ``"$"`` sentinel is appended to ``s``, every cyclic rotation of the
    resulting text is generated, the rotations are sorted with Python's
    default string ordering, and the final character of each sorted
    rotation is concatenated.

    Parameters
    ----------
    s : str
        Text to transform.

    Returns
    -------
    str
        Transformed text; it is one character longer than ``s`` because it
        includes the sentinel.
    """
    text = s + "$"
    rotations = [text[shift:] + text[:shift] for shift in range(len(text))]
    rotations.sort()
    last_column = [rotation[-1] for rotation in rotations]
    return "".join(last_column)

def extract_bwt_features(seq):
    """Compute summary features of the Burrows-Wheeler transform of ``seq``.

    Parameters
    ----------
    seq : str
        Text passed to ``bwt_transform``.

    Returns
    -------
    dict
        ``"bwt_len"``: length of the transformed text (``len(seq) + 1``,
        because the transform appends a sentinel character).
        ``"bwt_entropy"``: Shannon entropy, in bits, of the character
        distribution of the transformed text.

    Notes
    -----
    ``bwt_transform`` only rearranges the characters of ``seq + "$"``, so
    both features depend on which characters ``seq`` contains and how many
    of each, not on the order in which they appear.
    """
    transformed = bwt_transform(seq)
    total = len(transformed)

    # Accumulate sum(p * log2(p)) over the distinct characters; the entropy
    # is the negation of that sum.
    weighted_log_sum = 0
    for symbol in set(transformed):
        share = transformed.count(symbol) / total
        weighted_log_sum += share * np.log2(share)

    return {"bwt_len": total, "bwt_entropy": -weighted_log_sum}

# One feature record per (SpeciesID, aa) group: the group's codons are
# concatenated from highest to lowest frequency and that string is passed to
# extract_bwt_features. The group keys are stored on the record so the
# features can be joined back onto the pivot table.
bwt_rows = []
for group_key, group in df_long.groupby(["SpeciesID", "aa"]):
    species, amino_acid = group_key
    ranked_codons = group.sort_values("codon_freq", ascending=False)["codon"]
    record = extract_bwt_features("".join(ranked_codons.tolist()))
    record["SpeciesID"] = species
    record["aa"] = amino_acid
    bwt_rows.append(record)

bwt_df = pd.DataFrame(bwt_rows)

# -----------------------------
# Merge & save
# -----------------------------
# Attach the BWT features to the pivot table with a left join so every pivot
# row survives, then replace any remaining missing value (in any column) by 0.
final_df = pd.merge(pivot, bwt_df, how="left", on=["SpeciesID", "aa"])
final_df = final_df.fillna(0)

# Persist the feature table without the DataFrame index.
final_df.to_csv("final_features_with_bwt.csv", index=False)

print("✅ final_features_with_bwt.csv created")
print("Shape:", final_df.shape)


✅ final_features_with_bwt.csv created
Shape: (259728, 69)


In [1]:
import pandas as pd
import json
from pathlib import Path


In [2]:
# Read the file in one pass, matching the other read of this file in the
# notebook. pandas documents low_memory=False as the way to avoid chunk-wise
# dtype inference, which can leave a column with mixed types and emit a
# DtypeWarning.
df = pd.read_csv("codon_usage.csv", low_memory=False)
df.head()


  df = pd.read_csv("codon_usage.csv")


Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [3]:
# Columns that must be present before the name → ID mapping is built.
required_cols = ["SpeciesName", "SpeciesID"]

# Collect the absent columns in their declared order so the error message
# lists them predictably.
missing = []
for col in required_cols:
    if col not in df.columns:
        missing.append(col)

# Stop here rather than let a later cell fail with a less specific KeyError.
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")

print("Required columns found ✔")


Required columns found ✔


In [4]:
# Distinct (SpeciesName, SpeciesID) pairs, ordered by name and then by ID.
# Sorting on both keys makes the order fully determined; sorting on the name
# alone leaves the relative order of rows with the same name to the sort
# algorithm.
species_pairs = (
    df[["SpeciesName", "SpeciesID"]]
    .drop_duplicates()
    .sort_values(["SpeciesName", "SpeciesID"])
)

# A dict can hold only one ID per name. Report names that carry several IDs
# instead of letting all but one of them disappear silently.
ambiguous_names = species_pairs.loc[
    species_pairs["SpeciesName"].duplicated(keep=False), "SpeciesName"
].nunique()
if ambiguous_names:
    print(
        f"Warning: {ambiguous_names} species name(s) map to more than one "
        "SpeciesID; keeping the lowest ID for each"
    )

# Keep the first (lowest-ID) row per name, then build name → ID. The dict's
# insertion order follows the alphabetical name order.
species_id_map = (
    species_pairs
    .drop_duplicates(subset="SpeciesName", keep="first")
    .set_index("SpeciesName")["SpeciesID"]
    .to_dict()
)

print("Total species:", len(species_id_map))


Total species: 13016


In [5]:
# Create the output folder if it does not exist yet; an existing folder is
# left as it is.
output_dir = Path("model_outputs")
output_dir.mkdir(exist_ok=True)

output_file = output_dir.joinpath("species_id_map.json")

# Serialize the name → ID map as indented UTF-8 JSON; ensure_ascii=False
# writes non-ASCII characters as-is instead of as \u escapes.
with output_file.open("w", encoding="utf-8") as f:
    f.write(json.dumps(species_id_map, indent=2, ensure_ascii=False))

print("Saved file:", output_file)


Saved file: model_outputs\species_id_map.json


In [6]:
# Preview the first ten (name, ID) pairs in the map's insertion order.
[(name, species_id_map[name]) for name in list(species_id_map)[:10]]


[("'Chlorella' ellipsoidea", 3072),
 ("'Flavobacterium' lutescens", 255),
 ('(Populus tomentosa x P. bolleana) x P. tomentosa', 418444),
 ('(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata', 328805),
 ('A-2 plaque virus', 120087),
 ('AKR (endogenous) murine leukemia virus', 11791),
 ('Abelson murine leukemia virus', 11788),
 ('Abies alba', 45372),
 ('Abies grandis', 46611),
 ('Abrus precatorius', 3816)]