In [22]:
from Bio import Entrez, SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
import os

In [None]:
# Input: classified count tables; output: the same tables with sequence columns added.
input_folder = "../data/classified"
output_folder = "../data/feature_extraction"

# NCBI asks every E-utilities client to identify itself with a contact e-mail,
# and Biopython warns when none is set. The address is read from the
# ENTREZ_EMAIL environment variable so it is not hardcoded in the notebook;
# when the variable is unset the current value is left untouched.
Entrez.email = os.environ.get("ENTREZ_EMAIL", Entrez.email)

# Map each input file to its matching genome accession
# (file name in `input_folder` -> NCBI nucleotide accession of the phage genome).
genome_mapping = {
    "Brandao_MCCM_full_raw_counts_tpm_filtered_classified.tsv": "NC_010326",
    "Finstrlova_Newman_full_raw_counts_tpm_filtered_classified.tsv": "NC_005880",
    "Guegler_T4_minusToxIN_full_raw_counts_tpm_filtered_classified.tsv": "NC_000866",
    "Guegler_T7_plusToxIN_full_raw_counts_tpm_filtered_classified.tsv": "NC_001604",
    "Lood_full_raw_counts_tpm_filtered_classified.tsv": "MK797984.1",
    "Sprenger_VC_WT_VP882_delta_cpdS_full_raw_counts_tpm_filtered_classified.tsv": "NC_009016.1",
    "Yang_full_raw_counts_tpm_filtered_classified.tsv": "NC_021316",
}

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# ---- FUNCTION TO GET PROTEIN + DNA SEQUENCE ----
# GenBank records already downloaded in this session, keyed by genome accession.
# Without this cache every single gene lookup re-downloads and re-parses the
# complete genome from NCBI (one network request per table row).
_GENOME_RECORD_CACHE = {}


def _fetch_genome_record(genome_accession):
    """Return the GenBank record for ``genome_accession``, downloading it at most once."""
    record = _GENOME_RECORD_CACHE.get(genome_accession)
    if record is None:
        handle = Entrez.efetch(db="nucleotide", id=genome_accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when parsing the response fails.
            handle.close()
        # Only successful downloads are cached, so a failed fetch is retried
        # on the next call instead of being remembered.
        _GENOME_RECORD_CACHE[genome_accession] = record
    return record


def get_sequences_from_geneid(genome_accession, geneid):
    """Look up one gene in a GenBank genome and return its protein and DNA sequence.

    The ``gene-`` marker is removed from ``geneid``; the remaining tag, and the
    part after its last underscore, are compared against the ``locus_tag``,
    ``gene`` and ``product`` qualifiers of every CDS feature. The first CDS
    with an exact match wins. Only CDS features are searched, so a gene that
    is annotated solely as another feature type is reported as not found.

    Parameters
    ----------
    genome_accession : str
        NCBI nucleotide accession of the genome to search.
    geneid : str
        Gene identifier from the count table (e.g. ``"gene-T4p001"``).

    Returns
    -------
    tuple of (str, str)
        ``(protein, dna)``. ``protein`` is ``"TRANSLATION_NOT_FOUND"`` when the
        matched CDS has no ``translation`` qualifier. Both entries are
        ``"ERROR_FETCH"`` when the genome could not be downloaded or parsed,
        and ``"NOT_FOUND"`` when no CDS matches or ``geneid`` is not a usable
        string (e.g. a missing value).
    """
    try:
        record = _fetch_genome_record(genome_accession)
    except Exception as e:
        print(f"⚠ Failed to fetch {genome_accession}: {e}")
        return ("ERROR_FETCH", "ERROR_FETCH")

    # Missing values arrive from pandas as float NaN and have no .replace().
    if not isinstance(geneid, str):
        return ("NOT_FOUND", "NOT_FOUND")

    tag = geneid.replace("gene-", "").strip()
    short_tag = tag.split("_")[-1]

    # Drop empty candidates: an absent qualifier is read as "", so an empty tag
    # would otherwise "match" the first CDS that lacks one of the qualifiers.
    wanted = {candidate for candidate in (tag, short_tag) if candidate}
    if not wanted:
        return ("NOT_FOUND", "NOT_FOUND")

    for feature in record.features:
        if feature.type != "CDS":
            continue

        identifiers = {
            feature.qualifiers.get(key, [""])[0]
            for key in ("locus_tag", "gene", "product")
        }
        if wanted & identifiers:
            protein = feature.qualifiers.get("translation", ["TRANSLATION_NOT_FOUND"])[0]
            dna_seq = feature.location.extract(record.seq)
            return (protein, str(dna_seq))

    return ("NOT_FOUND", "NOT_FOUND")

# ---- MAIN PROCESSING ----
# For every classified TSV with a mapped genome: look up the protein and DNA
# sequence of each gene and write the table, extended by two sequence columns,
# to `output_folder` under the same file name.
# Sorted because os.listdir() returns entries in arbitrary order; this keeps
# the processing order (and the log below) reproducible.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".tsv"):
        continue

    print(f"🔍 Processing: {file_name}")

    # Resolve the accession first so unmapped files are skipped without being parsed.
    genome_acc = genome_mapping.get(file_name)
    if not genome_acc:
        print(f"⚠ No genome accession mapped for {file_name}")
        continue

    file_path = os.path.join(input_folder, file_name)
    df = pd.read_csv(file_path, sep="\t")

    # A table without gene identifiers cannot be annotated; skip it instead of
    # aborting the remaining files with a KeyError.
    if "Geneid" not in df.columns:
        print(f"⚠ No 'Geneid' column in {file_name}; skipping")
        continue

    # One (protein, dna) pair per row, in row order.
    sequences = [get_sequences_from_geneid(genome_acc, geneid) for geneid in df["Geneid"]]
    protein_seqs = [pair[0] for pair in sequences]
    dna_seqs = [pair[1] for pair in sequences]

    df["ProteinSequence"] = protein_seqs
    df["DNASequence"] = dna_seqs

    out_path = os.path.join(output_folder, file_name)
    df.to_csv(out_path, sep="\t", index=False)
    print(f"✅ Saved to: {out_path}")

🔍 Processing: Brandao_MCCM_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Brandao_MCCM_full_raw_counts_tpm_filtered_classified.tsv
🔍 Processing: Finstrlova_Newman_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Finstrlova_Newman_full_raw_counts_tpm_filtered_classified.tsv
🔍 Processing: Guegler_T4_minusToxIN_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Guegler_T4_minusToxIN_full_raw_counts_tpm_filtered_classified.tsv
🔍 Processing: Guegler_T7_plusToxIN_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Guegler_T7_plusToxIN_full_raw_counts_tpm_filtered_classified.tsv
🔍 Processing: Lood_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Lood_full_raw_counts_tpm_filtered_classified.tsv
🔍 Processing: Sprenger_VC_WT_VP882_delta_cpdS_full_raw_counts_tpm_filtered_classified.tsv
✅ Saved to: ../data/feature_extraction\Sprenger

In [53]:
# Directory whose extraction outputs are audited
results_folder = "../data/feature_extraction"

# Placeholder values written in place of a sequence when extraction failed
failure_keywords = ["NOT_FOUND", "ERROR_FETCH", "TRANSLATION_NOT_FOUND"]

# file name -> rows of that file carrying at least one placeholder
summary = {}

# Collect the failed rows of every TSV in the results folder
for file_name in os.listdir(results_folder):
    if file_name.endswith(".tsv"):
        file_path = os.path.join(results_folder, file_name)
        df = pd.read_csv(file_path, sep="\t")

        # A row counts as failed when either sequence column holds a placeholder
        protein_failed = df["ProteinSequence"].isin(failure_keywords)
        dna_failed = df["DNASequence"].isin(failure_keywords)
        failed_rows = df[protein_failed | dna_failed]

        if failed_rows.empty:
            continue
        summary[file_name] = failed_rows[["Geneid", "ProteinSequence", "DNASequence"]]

# Print one section per affected file, or a single all-clear line
if not summary:
    print("✅ All sequence extractions succeeded. No issues found.")
else:
    print("❌ Failed sequence extractions found in the following files:\n")
    for fname, failures in summary.items():
        print(f"📄 {fname} — {len(failures)} failures")
        print(failures.to_string(index=False))
        print("-" * 60)

❌ Failed sequence extractions found in the following files:

📄 Finstrlova_Newman_full_raw_counts_tpm_filtered_classified.tsv — 4 failures
               Geneid ProteinSequence DNASequence
gene-CPT_phageK_gt004       NOT_FOUND   NOT_FOUND
gene-CPT_phageK_gt002       NOT_FOUND   NOT_FOUND
gene-CPT_phageK_gt003       NOT_FOUND   NOT_FOUND
gene-CPT_phageK_gt001       NOT_FOUND   NOT_FOUND
------------------------------------------------------------
📄 Guegler_T4_minusToxIN_full_raw_counts_tpm_filtered_classified.tsv — 10 failures
     Geneid ProteinSequence DNASequence
gene-T4t006       NOT_FOUND   NOT_FOUND
gene-T4t003       NOT_FOUND   NOT_FOUND
gene-T4t008       NOT_FOUND   NOT_FOUND
gene-T4t007       NOT_FOUND   NOT_FOUND
gene-T4s002       NOT_FOUND   NOT_FOUND
gene-T4t001       NOT_FOUND   NOT_FOUND
gene-T4s001       NOT_FOUND   NOT_FOUND
gene-T4t002       NOT_FOUND   NOT_FOUND
gene-T4t004       NOT_FOUND   NOT_FOUND
gene-T4t005       NOT_FOUND   NOT_FOUND
-----------------------------

# User story 11: Sequence-based features
@LuiseJedlitschka
@milli2908
@elivic734

## Potential features:
- GC count / GC content
- Sequence length
- k-mer frequency (with feature selection via SelectKBest)