# EDA for DeepLocPro Project

This notebook generates the exploratory data analysis (EDA) figures and a Markdown summary (`eda.md`) used by the report in `report/`.

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Plot styling applied to every seaborn/matplotlib figure created below.
sns.set_theme(style="whitegrid")

# Input table and output locations; relative paths resolve against the
# current working directory.
splits_path = Path("../data/processed/splits.csv")
out_dir = Path("../report")
fig_dir = out_dir / "figures"

# Make sure both output directories exist before anything is written to them.
for directory in (out_dir, fig_dir):
    directory.mkdir(parents=True, exist_ok=True)

print(f"splits: {splits_path}")
print(f"output figures: {fig_dir}")

In [None]:
# Load the splits table and fail early, with a clear message, when the file
# or a required column is missing.
if not splits_path.exists():
    raise FileNotFoundError(f"Missing file: {splits_path}")

df = pd.read_csv(splits_path)
if "sequence" not in df.columns:
    raise ValueError("Expected 'sequence' column in splits.csv")
# "label" is indexed unconditionally below, so validate it the same way as
# "sequence" instead of letting a bare KeyError surface.
if "label" not in df.columns:
    raise ValueError("Expected 'label' column in splits.csv")

# Missing sequences are read as NaN; astype(str) alone would turn them into
# the literal string "nan" and report a length of 3. Blank them first so a
# missing sequence counts as length 0.
df["seq_len"] = df["sequence"].fillna("").astype(str).str.len()

# Raw label spellings (including the casing variants present in the data)
# mapped to human-readable names for plots and the summary.
label_map = {
    "Cytoplasmic": "Cytoplasmic",
    "CytoplasmicMembrane": "Cytoplasmic Membrane",
    "CYtoplasmicMembrane": "Cytoplasmic Membrane",
    "Extracellular": "Extracellular",
    "OuterMembrane": "Outer Membrane",
    "Periplasmic": "Periplasmic",
    "Cellwall": "Cell Wall",
    "CellWall": "Cell Wall",
}
# Labels without a mapping keep their raw value rather than becoming NaN.
df["label_display"] = df["label"].map(label_map).fillna(df["label"])

print(df.shape)
df.head(3)

In [None]:
def save_fig(path: Path):
    """Save the current matplotlib figure to ``path`` and close it.

    The figure gets a tight layout, is written at 160 dpi, and is then
    closed.
    """
    current = plt.gcf()
    current.tight_layout()
    current.savefig(path, dpi=160)
    plt.close(current)

# Class frequencies, most common first. The index fixes the category order on
# both per-label plots; the summary cell reuses the counts.
label_counts = df["label_display"].value_counts()

plt.figure(figsize=(8, 4))
sns.countplot(data=df, x="label_display", order=label_counts.index)
plt.xticks(rotation=30, ha="right")
plt.title("Label distribution")
save_fig(fig_dir / "label_distribution.png")

plt.figure(figsize=(8, 4))
sns.histplot(df["seq_len"], bins=50)
plt.title("Sequence length distribution")
plt.xlabel("Sequence length")
save_fig(fig_dir / "sequence_length_hist.png")

plt.figure(figsize=(10, 5))
# Same category order as the count plot so labels line up across figures.
sns.boxplot(data=df, x="label_display", y="seq_len", order=label_counts.index)
plt.xticks(rotation=30, ha="right")
plt.title("Sequence length by label")
save_fig(fig_dir / "sequence_length_boxplot.png")

# Optional columns: draw a count plot only when the column is present.
for column, title, filename in (
    ("gram_type", "Gram type distribution", "gram_type_distribution.png"),
    ("split", "Split distribution", "split_distribution.png"),
):
    if column not in df.columns:
        continue
    plt.figure(figsize=(4, 4))
    sns.countplot(data=df, x=column)
    plt.title(title)
    save_fig(fig_dir / filename)

print("Saved core EDA figures")

In [None]:
# Tile every saved figure into a single two-column overview image.
overview_path = fig_dir / "overview.png"

# Leave out an overview written by a previous run; otherwise re-running the
# notebook would tile the old overview into the new one.
images = sorted(
    png for png in fig_dir.glob("*.png") if png.name != overview_path.name
)
if images:
    cols = 2
    rows = (len(images) + cols - 1) // cols  # ceiling division
    plt.figure(figsize=(10, 4 * rows))
    for idx, img_path in enumerate(images, start=1):
        ax = plt.subplot(rows, cols, idx)
        img = plt.imread(img_path)
        ax.imshow(img)
        ax.set_title(img_path.stem.replace("_", " "))
        ax.axis("off")
    plt.tight_layout()
    plt.savefig(overview_path, dpi=160)
    plt.close()
    print(f"Saved {overview_path}")
else:
    print("No images to include in overview")

In [None]:
# Descriptive statistics of sequence length, with extra tail percentiles.
seq_stats = df["seq_len"].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# pandas' to_string() output is whitespace-aligned plain text. Markdown
# collapses that into a single paragraph unless it sits inside a fenced code
# block, so every table below is wrapped in fences.
fence = "```"

summary_lines = [
    "# EDA Summary",
    "",
    # List items keep the two facts on separate lines when rendered.
    f"- Rows: {len(df)}",
    # map(str, ...) keeps the join working for non-string column labels.
    f"- Columns: {', '.join(map(str, df.columns))}",
    "",
    "## Label distribution",
    "",
    fence,
    label_counts.to_string(),
    fence,
    "",
    "## Sequence length stats",
    "",
    fence,
    seq_stats.to_string(),
    fence,
]

if "split" in df.columns:
    summary_lines += [
        "",
        "## Split distribution",
        "",
        fence,
        df["split"].value_counts().to_string(),
        fence,
    ]

summary_path = out_dir / "eda.md"
# End the file with a newline, as text files conventionally do.
summary_path.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")

print(f"Saved summary: {summary_path}")
print(seq_stats)


Notebook complete. Exported files:
- `report/figures/*.png` (individual figures)
- `report/figures/overview.png` (grid combining the saved figures)
- `report/eda.md`
