In [None]:
!pip install pandas numpy scikit-learn xgboost joblib biopython optuna shap

In [None]:
# cell 1
import os
import math
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from Bio.Seq import Seq
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib
import optuna
import shap
import warnings
warnings.filterwarnings("ignore")


In [None]:
# cell 2
CSV_PATH = "codon_usage.csv"   
assert os.path.exists(CSV_PATH), f"Put your Kaggle CSV at: {CSV_PATH}"

df_raw = pd.read_csv(CSV_PATH)
print("Loaded shape:", df_raw.shape)
display(df_raw.head())
print("\nColumns:", list(df_raw.columns))


In [14]:
# Robust single-cell parser + diagnostics for codon_usage.csv
import pandas as pd
import re
from Bio.Seq import Seq
from collections import Counter, defaultdict
import os

CSV_PATH = "codon_usage.csv"   # change if needed
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"File not found: {CSV_PATH} - put the CSV in the notebook folder or change path.")

# 1) Load file (all as str) and show basic info
df_raw = pd.read_csv(CSV_PATH, dtype=str)
pd.set_option('display.max_columns', 200)
print("Loaded:", CSV_PATH)
print("Shape:", df_raw.shape)
print("Columns:")
for i,c in enumerate(df_raw.columns):
    print(f"  {i+1}. {c!r}")
print("\nFirst 8 rows preview:")
display(df_raw.head(8))

# helper tests
def is_codon_token(s):
    if not isinstance(s, str): return False
    s = s.strip().upper().replace("U","T")
    return bool(re.fullmatch(r"[ACGT]{3}", s))

def column_codon_score(series, ncheck=500):
    total=0; codon_like=0
    for v in series.dropna().astype(str).head(ncheck):
        total+=1
        if is_codon_token(v): codon_like+=1
    return (codon_like/total) if total>0 else 0.0

def column_aa_score(series, ncheck=500):
    total=0; aa_like=0
    allowed = set(list("ACDEFGHIKLMNPQRSTVWY"))
    for v in series.dropna().astype(str).head(ncheck):
        vv=v.strip().upper()
        total+=1
        if len(vv)==1 and vv in allowed:
            aa_like+=1
    return (aa_like/total) if total>0 else 0.0

# 2) Heuristic: find columns by name
codon_col=None; aa_col=None; freq_col=None
for c in df_raw.columns:
    lc=c.lower()
    if any(k in lc for k in ("codon","triplet","trip","tri","triplet","tripletcodon")) and codon_col is None:
        codon_col=c
    if any(k in lc for k in ("amino","aa","residue")) and aa_col is None:
        aa_col=c
    if any(k in lc for k in ("freq","frequency","usage","percent","count")) and freq_col is None:
        freq_col=c

print("\nHeuristic by name found -> codon_col:", codon_col, "aa_col:", aa_col, "freq_col:", freq_col)

# 3) Score columns by codon-like content (print top scored)
scores = {c: column_codon_score(df_raw[c]) for c in df_raw.columns}
scores_sorted = sorted(scores.items(), key=lambda x:-x[1])
print("\nTop codon-like column scores (col:score):")
for c,s in scores_sorted[:8]:
    print(f"  {c!r}: {s:.3f}")

# 4) If we didn't find codon_col by name, pick top-scoring column if score >= threshold
if not codon_col:
    best_col, best_score = scores_sorted[0]
    if best_score >= 0.2:
        codon_col = best_col
        print(f"\nAuto-selected codon column: {codon_col} with score {best_score:.3f}")
    else:
        print("\nNo column has strong codon-like content (score <0.2). We'll scan ALL cells for 3-letter tokens and build rows.")

# 5) If aa_col is missing, score for amino-acid-like columns
if not aa_col:
    aa_scores = {c: column_aa_score(df_raw[c]) for c in df_raw.columns}
    aa_sorted = sorted(aa_scores.items(), key=lambda x:-x[1])
    print("\nTop amino-acid-like column scores (col:score):")
    for c,s in aa_sorted[:8]:
        print(f"  {c!r}: {s:.3f}")
    # pick if >= 0.15
    if aa_sorted and aa_sorted[0][1] >= 0.15:
        aa_col = aa_sorted[0][0]
        print(f"\nAuto-selected amino-acid column: {aa_col} with score {aa_sorted[0][1]:.3f}")

# 6) Build parsed rows using best available info OR scan all cells for codon tokens
rows=[]
if codon_col:
    print(f"\nUsing detected codon column -> {codon_col!r}")
    for _, r in df_raw.iterrows():
        val = r.get(codon_col, "")
        if not isinstance(val, str): continue
        codon = val.strip().upper().replace("U","T")
        if not is_codon_token(codon): 
            continue
        # find aa
        aa_val = None
        if aa_col:
            aa_val = str(r.get(aa_col,"")).strip().upper()
            if not (isinstance(aa_val,str) and len(aa_val)==1 and aa_val in list("ACDEFGHIKLMNPQRSTVWY")):
                aa_val = None
        if not aa_val:
            # try translate codon
            try:
                aa_val = str(Seq(codon).translate())
            except:
                aa_val = "X"
        rows.append({"aa": aa_val, "orig_codon": codon, "orig_freq": r.get(freq_col, None)})
else:
    # scan every cell for codon tokens; aggregate counts and record parent row index
    print("\nScanning all cells for codon tokens (may produce many rows)...")
    for idx in range(min(len(df_raw), 5000)):  # limit scans to first 5000 rows for speed
        row = df_raw.iloc[idx]
        for c in df_raw.columns:
            v = row[c]
            if not isinstance(v, str): 
                continue
            v2 = v.strip().upper().replace("U","T")
            if is_codon_token(v2):
                # translate
                try:
                    aa_val = str(Seq(v2).translate())
                except:
                    aa_val = "X"
                rows.append({"aa": aa_val, "orig_codon": v2, "orig_freq": None, "source_row": idx, "source_col": c})

# 7) Create DataFrame and save
if not rows:
    print("\nERROR: Parsing produced no rows. Please paste df_raw.head(10) output here so I can inspect file format.")
else:
    parsed = pd.DataFrame(rows)
    # normalize columns
    parsed = parsed.rename(columns={c:c for c in parsed.columns})
    # ensure columns exist
    if 'aa' not in parsed.columns or 'orig_codon' not in parsed.columns:
        print("Parsed DataFrame missing required columns. Columns:", parsed.columns)
    else:
        print("\nParsed rows count:", parsed.shape[0])
        print("Unique codons:", parsed['orig_codon'].nunique())
        print("Unique amino acids:", parsed['aa'].nunique())
        display(parsed.head(20))
        out = "codon_parsed_from_kaggle.csv"
        parsed.to_csv(out, index=False)
        print(f"\nSaved parsed CSV to: {out} (you can now continue training pipeline using this file)")


Loaded: codon_usage.csv
Shape: (13028, 69)
Columns:
  1. 'Kingdom'
  2. 'DNAtype'
  3. 'SpeciesID'
  4. 'Ncodons'
  5. 'SpeciesName'
  6. 'UUU'
  7. 'UUC'
  8. 'UUA'
  9. 'UUG'
  10. 'CUU'
  11. 'CUC'
  12. 'CUA'
  13. 'CUG'
  14. 'AUU'
  15. 'AUC'
  16. 'AUA'
  17. 'AUG'
  18. 'GUU'
  19. 'GUC'
  20. 'GUA'
  21. 'GUG'
  22. 'GCU'
  23. 'GCC'
  24. 'GCA'
  25. 'GCG'
  26. 'CCU'
  27. 'CCC'
  28. 'CCA'
  29. 'CCG'
  30. 'UGG'
  31. 'GGU'
  32. 'GGC'
  33. 'GGA'
  34. 'GGG'
  35. 'UCU'
  36. 'UCC'
  37. 'UCA'
  38. 'UCG'
  39. 'AGU'
  40. 'AGC'
  41. 'ACU'
  42. 'ACC'
  43. 'ACA'
  44. 'ACG'
  45. 'UAU'
  46. 'UAC'
  47. 'CAA'
  48. 'CAG'
  49. 'AAU'
  50. 'AAC'
  51. 'UGU'
  52. 'UGC'
  53. 'CAU'
  54. 'CAC'
  55. 'AAA'
  56. 'AAG'
  57. 'CGU'
  58. 'CGC'
  59. 'CGA'
  60. 'CGG'
  61. 'AGA'
  62. 'AGG'
  63. 'GAU'
  64. 'GAC'
  65. 'GAA'
  66. 'GAG'
  67. 'UAA'
  68. 'UAG'
  69. 'UGA'

First 8 rows preview:


Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,AUA,AUG,GUU,GUC,GUA,GUG,GCU,GCC,GCA,GCG,CCU,CCC,CCA,CCG,UGG,GGU,GGC,GGA,GGG,UCU,UCC,UCA,UCG,AGU,AGC,ACU,ACC,ACA,ACG,UAU,UAC,CAA,CAG,AAU,AAC,UGU,UGC,CAU,CAC,AAA,AAG,CGU,CGC,CGA,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,0.03208,0.001,0.0401,0.00551,0.02005,0.00752,0.02506,0.01103,0.0411,0.00902,0.03308,0.01003,0.05013,0.01554,0.01103,0.02356,0.03208,0.01203,0.00501,0.01003,0.01203,0.03158,0.01905,0.02456,0.01353,0.02155,0.00251,0.00652,0.0015,0.01554,0.00501,0.02105,0.00902,0.01053,0.00501,0.02256,0.00301,0.03108,0.00401,0.02607,0.00251,0.01153,0.00501,0.02356,0.01053,0.0386,0.00401,0.00702,0.00401,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.0441,0.01153,0.0251,0.00882,0.03324,0.00814,0.04071,0.00814,0.03256,0.01085,0.04885,0.01221,0.01357,0.00678,0.02714,0.01221,0.00407,0.01425,0.01221,0.01967,0.02239,0.01289,0.02103,0.01493,0.00407,0.00475,0.00068,0.02035,0.0095,0.02782,0.01425,0.00611,0.00475,0.02917,0.00407,0.02374,0.00882,0.02917,0.00271,0.01628,0.00204,0.01967,0.00543,0.03392,0.00136,0.00678,0.00136,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,0.01604,0.01316,0.0218,0.01625,0.01872,0.01213,0.0107,0.02406,0.01234,0.0144,0.00514,0.01604,0.0146,0.02098,0.0107,0.01728,0.01851,0.00864,0.01172,0.01892,0.01933,0.01419,0.01296,0.00967,0.01337,0.01337,0.01851,0.01131,0.01419,0.0109,0.02612,0.01275,0.01522,0.02365,0.02962,0.01789,0.01625,0.01234,0.01604,0.01687,0.02077,0.03949,0.00864,0.00596,0.00926,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,0.01358,0.0094,0.01723,0.02402,0.02245,0.02507,0.02924,0.02089,0.02141,0.01723,0.01932,0.02141,0.00679,0.02245,0.00522,0.01358,0.00418,0.0141,0.00574,0.01201,0.00992,0.00366,0.02402,0.02663,0.02872,0.00992,0.0235,0.00522,0.01619,0.00836,0.02037,0.01358,0.02089,0.00731,0.02141,0.00888,0.01567,0.01253,0.02298,0.01358,0.00992,0.00888,0.00783,0.00679,0.03133,0.04282,0.00627,0.00261,0.00261,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,0.00548,0.00473,0.02076,0.02716,0.00867,0.0131,0.02773,0.02803,0.00508,0.0092,0.02965,0.02878,0.00574,0.01572,0.01577,0.01007,0.00508,0.00604,0.00679,0.01205,0.03127,0.00775,0.00959,0.00797,0.02006,0.00359,0.00933,0.01191,0.01616,0.00788,0.02593,0.00854,0.012,0.02098,0.02089,0.01367,0.01502,0.01809,0.02738,0.01796,0.01082,0.00705,0.01174,0.00858,0.03408,0.03964,0.0095,0.00429,0.00578,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131
5,vrl,0,101029,5274,Human adenovirus type 7d,0.02579,0.02218,0.01479,0.01024,0.02294,0.00758,0.01782,0.01403,0.02636,0.01327,0.01896,0.02579,0.01877,0.01346,0.00721,0.01782,0.02067,0.02313,0.01214,0.00265,0.01232,0.01953,0.02105,0.00322,0.01232,0.01631,0.01327,0.0256,0.01119,0.0201,0.01763,0.00986,0.0036,0.00929,0.01138,0.03527,0.03015,0.02844,0.00284,0.01555,0.02825,0.01953,0.01251,0.03906,0.03546,0.01138,0.00948,0.00683,0.01043,0.03223,0.00986,0.00398,0.00853,0.00322,0.00303,0.01593,0.00171,0.02427,0.02503,0.02825,0.0127,0.00133,0.00038,0.00209
6,vrl,0,101688,3042,Apple latent spherical virus,0.04635,0.01545,0.02005,0.024,0.02761,0.01611,0.01052,0.00493,0.02597,0.00888,0.01512,0.02268,0.02893,0.00789,0.01151,0.01611,0.04011,0.01151,0.01118,0.00427,0.02794,0.01085,0.01216,0.00131,0.01216,0.02137,0.01052,0.01512,0.00986,0.03846,0.01085,0.01249,0.00066,0.01085,0.00427,0.02728,0.01381,0.01315,0.00427,0.02465,0.01151,0.02465,0.01085,0.03452,0.01348,0.01545,0.0069,0.01512,0.00756,0.02926,0.02696,0.00888,0.00625,0.0069,0.00329,0.01315,0.00822,0.04011,0.01183,0.02663,0.02663,0.00033,0.00033,0.0
7,vrl,0,101764,2801,Aconitum latent virus,0.02285,0.02678,0.01214,0.02321,0.01714,0.02213,0.00893,0.01928,0.01785,0.02356,0.01107,0.01999,0.01714,0.01142,0.00893,0.02499,0.03106,0.01357,0.01571,0.01321,0.01392,0.01107,0.01178,0.005,0.01142,0.01964,0.01714,0.01214,0.01464,0.01499,0.00536,0.00821,0.005,0.01678,0.01464,0.01535,0.01142,0.01607,0.01071,0.01142,0.02321,0.01856,0.01107,0.0282,0.01535,0.01142,0.01821,0.01071,0.01464,0.02785,0.02713,0.01499,0.01607,0.00714,0.00678,0.0125,0.01107,0.03534,0.01571,0.03642,0.02785,0.00107,0.00036,0.00071



Heuristic by name found -> codon_col: Ncodons aa_col: CAA freq_col: None

Top codon-like column scores (col:score):
  'Kingdom': 0.000
  'DNAtype': 0.000
  'SpeciesID': 0.000
  'Ncodons': 0.000
  'SpeciesName': 0.000
  'UUU': 0.000
  'UUC': 0.000
  'UUA': 0.000

Using detected codon column -> 'Ncodons'

ERROR: Parsing produced no rows. Please paste df_raw.head(10) output here so I can inspect file format.


In [15]:
print(df_raw.columns.tolist())
print(df_raw.head(10).to_string(index=False))
print(df_raw.dtypes)


['Kingdom', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName', 'UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG', 'UGA']
Kingdom DNAtype SpeciesID Ncodons                             SpeciesName     UUU     UUC     UUA     UUG     CUU     CUC     CUA     CUG     AUU     AUC     AUA     AUG     GUU     GUC     GUA     GUG     GCU     GCC     GCA     GCG     CCU     CCC     CCA     CCG     UGG     GGU     GGC     GGA     GGG     UCU     UCC     UCA     UCG     AGU     AGC     ACU     ACC     ACA     ACG     UAU     UAC     CAA     CAG     AAU     AAC     UGU     UGC     CAU     CAC     AAA     AAG     CGU 