# In [None]:
# 01_qc_trimming.ipynb

# 📘 Title: Quality Control & Trimming of Raw Reads
# 📍 Description: Perform QC using FastQC and trimming with fastp or Trimmomatic

import os
from pathlib import Path
import subprocess

# Directory layout; all paths are relative to the current working directory.
raw_dir = Path("../data/raw")
processed_dir = Path("../data/processed")
qc_dir = processed_dir.joinpath("qc_reports")    # FastQC output goes here
trimmed_dir = processed_dir.joinpath("trimmed")  # fastp output goes here

# Make sure both output directories exist (no error if they already do).
for out_dir in (qc_dir, trimmed_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Raw FASTQ inputs: one (mate 1, mate 2) filename tuple per paired-end sample.
# Add further tuples to process more samples.
samples = [
    (
        "SRR9043691_1.fastq.gz",
        "SRR9043691_2.fastq.gz",
    ),
]

# ---------- 🔬 Step 1: Run FastQC ----------
# Run FastQC on both files of every sample, writing the reports to qc_dir.
print("Running FastQC on raw reads...")

for f1, f2 in samples:
    reads = [raw_dir / f1, raw_dir / f2]

    # Fail early with a clear message instead of relying on the tool's own
    # handling of unreadable inputs.
    missing = [str(p) for p in reads if not p.is_file()]
    if missing:
        raise FileNotFoundError(
            f"Raw FASTQ file(s) not found: {', '.join(missing)}"
        )

    # check=True raises subprocess.CalledProcessError on a non-zero exit
    # status, so a failed QC run stops the script instead of being ignored.
    subprocess.run(
        ["fastqc", *map(str, reads), "-o", str(qc_dir)],
        check=True,
    )

# ---------- ✂️ Step 2: Trimming with fastp ----------
# Trim every read pair with fastp; trimmed FASTQ files and the HTML/JSON
# reports are all written into trimmed_dir.
print("Running trimming with fastp...")

for f1, f2 in samples:
    in1, in2 = raw_dir / f1, raw_dir / f2

    # Fail early with a clear message if an input file is absent.
    missing = [str(p) for p in (in1, in2) if not p.is_file()]
    if missing:
        raise FileNotFoundError(
            f"Raw FASTQ file(s) not found: {', '.join(missing)}"
        )

    # Sample name is the text before the first underscore, e.g. SRR9043691.
    # NOTE: a sample ID that itself contains "_" would be truncated here.
    sample_name = f1.split("_")[0]
    out1 = trimmed_dir / f"{sample_name}_trimmed_1.fastq.gz"
    out2 = trimmed_dir / f"{sample_name}_trimmed_2.fastq.gz"
    html_report = trimmed_dir / f"{sample_name}_fastp.html"
    json_report = trimmed_dir / f"{sample_name}_fastp.json"

    # check=True raises subprocess.CalledProcessError on a non-zero exit
    # status, so the success message below is never printed after a failure.
    subprocess.run(
        [
            "fastp",
            "-i", str(in1),
            "-I", str(in2),
            "-o", str(out1),
            "-O", str(out2),
            "-h", str(html_report),
            "-j", str(json_report),
            "--detect_adapter_for_pe",
            "--thread", "4",
        ],
        check=True,
    )

# Only reached when every fastp invocation exited successfully.
print("✅ Trimming completed. Outputs saved in:", trimmed_dir)

