In [None]:
# 03_ARG_annotation.ipynb

# 📘 Title: Antibiotic Resistance Gene Annotation using DIAMOND
# 📍 Description: Parse DIAMOND output from CARD to extract ARG hits

import pandas as pd
from pathlib import Path

# === Step 1: Set Paths ===
# DIAMOND results are read from, and the annotated tables are written to,
# the same directory.
diamond_output_dir = Path("../results/ARG")
output_dir = Path("../results/ARG")
output_dir.mkdir(parents=True, exist_ok=True)

# === Step 2: Load DIAMOND Output ===
# These files were generated by: diamond blastp --outfmt 6
files = {
    "SRR2915339_1": diamond_output_dir / "SRR2915339_1_diamond_output.tsv",
    "SRR2915339_2": diamond_output_dir / "SRR2915339_2_diamond_output.tsv",
}

# DIAMOND default format 6 columns
columns = [
    "query_id", "subject_id", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]

# Column layout of the annotated table (DIAMOND columns + derived columns).
annotated_columns = columns + ["gene_name", "sample"]


def extract_gene_name(subject_ids: pd.Series) -> pd.Series:
    """Return the gene name embedded in each DIAMOND subject ID.

    The gene name is taken to be the LAST pipe-delimited field of the ID.
    CARD protein FASTA headers are expected to look like
    ``gb|<accession>|ARO:<id>|<gene>`` (confirm against the database build
    in use); taking the first field would yield the constant ``gb`` tag for
    every hit. IDs that contain no ``|`` are returned unchanged.

    Args:
        subject_ids: String Series of subject (database sequence) IDs.

    Returns:
        A Series of gene names aligned with ``subject_ids``.
    """
    return subject_ids.str.rsplit("|", n=1).str[-1]


def load_diamond_hits(sample: str, filepath: Path) -> pd.DataFrame:
    """Load one DIAMOND ``--outfmt 6`` file and annotate its hits.

    Args:
        sample: Sample label stored in the ``sample`` column.
        filepath: Path to the tab-separated DIAMOND output file.

    Returns:
        A DataFrame with ``annotated_columns``. It has zero rows when the
        file is missing or empty (DIAMOND writes a zero-byte file when a
        sample has no hits); a warning is printed in both cases.
    """
    try:
        hits = pd.read_csv(
            filepath,
            sep="\t",
            names=columns,
            # Keep IDs as text so numeric-looking IDs still support `.str`.
            dtype={"query_id": str, "subject_id": str},
        )
    except FileNotFoundError:
        print(f"WARNING: DIAMOND output not found for {sample}: {filepath} (skipped)")
        return pd.DataFrame(columns=annotated_columns)
    except pd.errors.EmptyDataError:
        print(f"WARNING: DIAMOND output is empty for {sample}: {filepath} (no hits)")
        return pd.DataFrame(columns=annotated_columns)

    hits["gene_name"] = extract_gene_name(hits["subject_id"])
    hits["sample"] = sample
    return hits


# Collect the annotated hits of every sample that actually has hits.
all_data = []

for sample, filepath in files.items():
    df = load_diamond_hits(sample, filepath)
    if not df.empty:
        all_data.append(df)

# pd.concat raises on an empty list, so fall back to an empty, correctly
# shaped table when no sample produced any hits.
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=annotated_columns)

# === Step 3: Save Annotated Table ===
annotated_csv_path = output_dir / "ARG_annotations_combined.csv"
combined_df.to_csv(annotated_csv_path, index=False)

# === Step 4: Summary Table ===
# Number of hits per gene across all samples, most frequent first.
summary_df = combined_df.groupby("gene_name").size().reset_index(name="count")
summary_df = summary_df.sort_values("count", ascending=False)

# Save summary
summary_path = output_dir / "ARG_summary_table.csv"
summary_df.to_csv(summary_path, index=False)

# === Step 5: Quick View ===
summary_df.head(10)


| File                           | Description                                 |
| ------------------------------ | ------------------------------------------- |
| `ARG_annotations_combined.csv` | Full DIAMOND hits with sample name and gene |
| `ARG_summary_table.csv`        | Gene frequency table                        |
