In [5]:
# Test Biopython
try:
    import Bio
    print("Biopython is installed. Version:", Bio.__version__)
except ImportError:
    print("Biopython is NOT installed.")

# Test COBRApy
try:
    import cobra
    print("COBRApy is installed. Version:", cobra.__version__)
except ImportError:
    print("COBRApy is NOT installed.")


Biopython is installed. Version: 1.85
COBRApy is installed. Version: 0.29.1


In [5]:
from pathlib import Path
from collections import Counter
import sys, subprocess, io, gzip, csv

# ----- CONFIG -----
FILE = None            # e.g. "synechocystis.fasta" or leave None to auto-detect
SELECT_INDEX = 0       # if multiple FASTA files are found, which one to use (0-based)
FASTA_EXTS = {".fa", ".fasta", ".fna", ".ffn", ".fas", ".fa.gz", ".fasta.gz", ".fna.gz"}

def _ensure_biopython():
    try:
        from Bio import SeqIO
        return SeqIO
    except Exception:
        print("Installing Biopython...", flush=True)
        subprocess.check_call([sys.executable, "-m", "pip", "install", "biopython", "--quiet"])
        from Bio import SeqIO
        return SeqIO

def _open_text(path):
    p = str(path)
    if p.endswith(".gz"):
        return io.TextIOWrapper(gzip.open(p, "rb"))
    return open(p, "rt", encoding="utf-8", errors="replace")

# ----- locate FASTA file -----
if FILE is None:
    candidates = []
    for ext in FASTA_EXTS:
        candidates.extend(Path(".").glob(f"**/*{ext}"))
    candidates = sorted(set(candidates))
    if not candidates:
        raise FileNotFoundError(
            "No FASTA files found in this folder. "
            "Set FILE='your_file.fasta' or place a .fa/.fasta/.fna file here."
        )
    if SELECT_INDEX >= len(candidates):
        raise IndexError(f"SELECT_INDEX={SELECT_INDEX} but only {len(candidates)} FASTA files found.")
    FILE = str(candidates[SELECT_INDEX])
    print(f"Using FASTA: {FILE}")

# ----- parse & count -----
SeqIO = _ensure_biopython()

total = Counter()
rows = []  # per-sequence rows for CSV
allowed = "ACGTN"

with _open_text(FILE) as handle:
    try:
        # stream-parse (handles huge files)
        for rec in SeqIO.parse(handle, "fasta"):
            seq = str(rec.seq).upper()
            c = Counter(seq)
            length = sum(c.values())
            others = length - sum(c[b] for b in allowed)
            gc = c["G"] + c["C"]
            acgt = sum(c[b] for b in "ACGT")
            gc_pct = (100.0 * gc / acgt) if acgt else 0.0

            rows.append({
                "id": rec.id,
                "length": length,
                "A": c["A"], "C": c["C"], "G": c["G"], "T": c["T"], "N": c.get("N", 0),
                "other": others, "GC_percent(excl_N)": round(gc_pct, 3)
            })
            total.update(seq)
    except Exception as e:
        raise RuntimeError(
            f"Failed to parse '{FILE}' as FASTA. "
            f"Is it really FASTA (not GenBank/GBFF/JSON)? Original error: {e}"
        )

# ----- print table (compact) -----
def _fmt_int(n): 
    return f"{n:,}"

print("\nPer-sequence counts:")
print("id\tlength\tA\tC\tG\tT\tN\tother\tGC%(excl_N)")
for r in rows[:200]:  # print first 200 to avoid spamming
    print(f"{r['id']}\t{_fmt_int(r['length'])}\t{_fmt_int(r['A'])}\t{_fmt_int(r['C'])}\t"
          f"{_fmt_int(r['G'])}\t{_fmt_int(r['T'])}\t{_fmt_int(r['N'])}\t{_fmt_int(r['other'])}\t{r['GC_percent(excl_N)']}")

if len(rows) > 200:
    print(f"... ({len(rows)-200} more sequences not printed)")

# ----- totals -----
tot_len = sum(total.values())
A, C, G, T, N = (total.get(b, 0) for b in "ACGTN")
ACGT = A + C + G + T
GC = G + C
GC_pct = (100.0 * GC / ACGT) if ACGT else 0.0
others = tot_len - (A + C + G + T + N)

print("\nTOTALS:")
print(f"Length: {_fmt_int(tot_len)}")
print(f"A: {_fmt_int(A)}  C: {_fmt_int(C)}  G: {_fmt_int(G)}  T: {_fmt_int(T)}  N: {_fmt_int(N)}  other: {_fmt_int(others)}")
print(f"GC% (excluding N): {GC_pct:.3f}")

# ----- save CSV -----
out_csv = "nucleotide_counts_per_sequence.csv"
with open(out_csv, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id","length","A","C","G","T","N","other","GC_percent(excl_N)"])
    writer.writeheader()
    writer.writerows(rows)
print(f"\nSaved per-sequence table to: {out_csv}")

Using FASTA: Data\GCA_000009725.1\.ipynb_checkpoints\GCA_000009725.1_ASM972v1_genomic-checkpoint.fna

Per-sequence counts:
id	length	A	C	G	T	N	other	GC%(excl_N)
BA000022.2	3,573,470	932,363	851,461	853,808	935,838	0	0	47.72
AP004311.1	103,307	28,620	23,249	22,701	28,737	0	0	44.479
AP004312.1	44,343	11,404	10,710	10,849	11,380	0	0	48.619
AP004310.1	119,895	33,846	25,867	25,628	34,554	0	0	42.95
AP006585.1	106,004	31,332	21,997	23,283	29,392	0	0	42.715

TOTALS:
Length: 3,947,019
A: 1,037,565  C: 933,284  G: 936,269  T: 1,039,901  N: 0  other: 0
GC% (excluding N): 47.366

Saved per-sequence table to: nucleotide_counts_per_sequence.csv


In [1]:
from Bio import SeqIO
import pandas as pd

# Load FASTA
fasta_path = "../data/genome.fasta"  # change as needed
records = list(SeqIO.parse(fasta_path, "fasta"))

# Extract gene names & lengths
gene_data = []
for record in records:
    gene_data.append({
        "gene_id": record.id,
        "length": len(record.seq)
    })

# Save to CSV
df = pd.DataFrame(gene_data)
df.to_csv("../results/gene_info.csv", index=False)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/genome.fasta'

In [2]:
from pathlib import Path

data_path = Path("../data")
print(list(data_path.glob("*")))


[WindowsPath('../data/GCA_000009725.1'), WindowsPath('../data/ncbi_dataset (2).zip')]


In [3]:
from Bio import SeqIO
import pandas as pd

# Use the existing filename
fasta_path = "../data/GCA_000009725.1"  # no .fasta needed if it's really FASTA format

# Load sequences
records = list(SeqIO.parse(fasta_path, "fasta"))

# Extract gene names & lengths
gene_data = []
for record in records:
    gene_data.append({
        "gene_id": record.id,
        "length": len(record.seq)
    })

# Save to CSV
df = pd.DataFrame(gene_data)
df.to_csv("../results/gene_info.csv", index=False)

# Preview first few rows
df.head()


PermissionError: [Errno 13] Permission denied: '../data/GCA_000009725.1'

In [4]:
from Bio import SeqIO
import pandas as pd

fasta_path = "../data/GCA_000009725.1"

# Explicitly open the file to avoid permission issues
with open(fasta_path, "r", encoding="utf-8", errors="ignore") as handle:
    records = list(SeqIO.parse(handle, "fasta"))

# Process sequences
gene_data = [{"gene_id": rec.id, "length": len(rec.seq)} for rec in records]

df = pd.DataFrame(gene_data)
df.to_csv("../results/gene_info.csv", index=False)
df.head()

PermissionError: [Errno 13] Permission denied: '../data/GCA_000009725.1'