# Notebook to clean and correct the training dataset

We will refine the bounding boxes for better precision (based on a trained YOLO model), and mask both the ground-truth boxes that have no match and the YOLO boxes that have no match (to correct misplaced bounding boxes as well as non-annotated WBCs).

## Import and initialization

In [1]:
import os
import sys
import shutil
import pandas as pd
from tqdm import tqdm

sys.path.append("../")

from detection import YoloInference
from notebooks.utils_notebook import curate_image,split_dataset_from_csv

In [None]:
# Curation mode. With "msk_iou", ground-truth boxes whose IoU with the
# predicted boxes is below the threshold get masked; with any other value
# those boxes are kept.
mode = "msk_iou"

# Image folder and annotation files (original CSV and its per-mode copy).
path = "../data/Cytologia/images/"
origin_csv_path = "../data/Cytologia/train.csv"
csv_path = f"../data/Cytologia/train_{mode}.csv"

# Never overwrite a previously generated CSV: stop if it is already there.
if os.path.exists(csv_path):
    raise ValueError("File already exists")
shutil.copy(origin_csv_path, csv_path)

# Load the copy (not the original file) and list each image name once.
df = pd.read_csv(csv_path)
images_list = df["NAME"].unique()


## Check and modify the csv given the specified mode

If you want to create the yolo dataset based on the original annotations, skip this part.

In [None]:

# Trained YOLO detector whose predictions are compared with the annotations.
yolo_engine = YoloInference(
    "../models/detection/Cytologia_yolo/yolo11n/384/no_curation/train/weights/best.pt",
    device="cuda",
)

tqdm_images = tqdm(images_list, desc="Processing images", unit="image")

# Extra rows to append to the CSV; handed to curate_image for every image
# (presumably filled in place there -- confirm in utils_notebook).
new_data = []

for name in tqdm_images:
    img_path = os.path.join(path, name)

    # Ground-truth boxes and classes annotated for this image.
    df_img = df[df["NAME"] == name]
    boxes = df_img[["x1", "y1", "x2", "y2"]].apply(tuple, axis=1).tolist()
    classes = df_img["class"].tolist()

    yolo_output = yolo_engine.predict(img_path)

    # Forward the configured mode instead of a hard-coded "msk_iou", so that
    # changing `mode` really changes the curation and the output file name
    # (train_{mode}.csv) keeps matching its content.
    curate_image(boxes, yolo_output, img_path, classes, df, new_data, mode=mode)

# Append the collected rows, if any, then overwrite the per-mode CSV.
if new_data:
    df = pd.concat([df, pd.DataFrame(new_data)], ignore_index=True)

df.to_csv(csv_path, index=False)


## Construct the yolo dataset (images and labels)

In [None]:
# Source images, the per-mode annotations CSV, and the output folder of the
# dataset built for the selected mode.
data_path = "../data/Cytologia/images/"
csv_path = f"../data/Cytologia/train_{mode}.csv"
new_data_path = f"../data/Cytologia_{mode}/"

# Create the output folder up front; an already existing folder is fine.
os.makedirs(new_data_path, exist_ok=True)

# Pass origin_csv_path instead of csv_path to build the dataset from the
# original labels.
split_dataset_from_csv(new_data_path, data_path, csv_path)
