# In [None]:
import pandas as pd
import numpy as np
import os

# ---------------------------------------------------------------------------
# Prepare a "severity to-do" table from the binary-label master CSV.
#
# Reads the master CSV, adds a `severity` column and a `for_severity_label`
# flag, picks a reproducible per-disease sample of label-1 rows to be labelled
# by hand, and writes the result to a new CSV.
# ---------------------------------------------------------------------------
csv_master = "CSV/plantvillage_binary_labels.csv"
df = pd.read_csv(csv_master)

# Quick overview: row count and how many rows carry each label value.
print("Total images:", df.shape[0])
print(df["label"].value_counts())

# Step 1 -- severity column.
# Start with NaN everywhere, then set rows with label 0 (healthy) to
# severity 0. All other rows stay NaN until they are labelled.
df["severity"] = np.nan
df.loc[df["label"].eq(0), "severity"] = 0

# Step 2 -- flag the rows that should be labelled manually.
# The flag defaults to 0; the sampled rows are switched to 1 below.
df["for_severity_label"] = 0

# Work on an independent copy of the label-1 (diseased) rows; the copy keeps
# the original row index, so sampled index values map straight back to `df`.
diseased_df = df.loc[df["label"].eq(1)].copy()

# Upper bound on rows sampled per value of the `disease` column (tunable).
n_per_disease = 200

indices_to_label = []
for disease_name, group in diseased_df.groupby("disease"):
    # Groups smaller than the cap are taken in full.
    sample_n = min(len(group), n_per_disease)
    # Fixed seed keeps the selection reproducible between runs.
    chosen = group.sample(n=sample_n, random_state=42)
    indices_to_label += chosen.index.tolist()
    print(f"{disease_name}: will label {sample_n} images")

df.loc[indices_to_label, "for_severity_label"] = 1

total_selected = df["for_severity_label"].sum()
print("\nTotal diseased images selected for severity labelling:",
      total_selected)

# Step 3 -- write the table that the labelling tool will read.
os.makedirs("CSV", exist_ok=True)
csv_severity = "CSV/plantvillage_with_severity_todo.csv"
df.to_csv(csv_severity, index=False)
print("Saved:", csv_severity)
