In [2]:
!pip install -q datasets transformers accelerate timm albumentations>=1.4.5 torchmetrics pycocotools coco-eval

In [3]:
import torchvision
import os
from torch.utils.data import DataLoader
from transformers.image_transforms import center_to_corners_format
from transformers import AutoModelForObjectDetection, TrainingArguments, Trainer, DetrImageProcessor, DetrForObjectDetection
import numpy as np
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision
import torch
from functools import partial
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Hub checkpoint id passed to `from_pretrained` for both the image processor and the model.
MODEL_NAME = "facebook/detr-resnet-50-dc5"
# Upper bound (in pixels) given to the image processor as both `max_height` and `max_width`.
IMAGE_SIZE = 512

In [5]:
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose samples are pre-processed by an image processor.

    The annotation file is looked up at ``<img_folder>/_annotations.coco.json``.
    """

    def __init__(self, img_folder, processor):
        """
        Args:
            img_folder (str): Directory containing the images and ``_annotations.coco.json``.
            processor: Callable invoked as
                ``processor(images=..., annotations=..., return_tensors="pt")``; its result must
                expose ``"pixel_values"`` and ``"labels"`` entries.
        """
        ann_file = os.path.join(img_folder, "_annotations.coco.json")
        # Zero-argument super() keeps working for existing instances when this notebook cell is
        # re-run and the name ``CocoDetection`` gets rebound to a new class object.
        super().__init__(img_folder, ann_file)
        self.processor = processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``.

        The parent class supplies the image and its raw COCO annotations; both are passed
        through ``self.processor`` and the batch dimension it adds is removed again.
        Data augmentation, if wanted, belongs between reading the sample and the processor call.
        """
        # read in the image and target in COCO format
        img, annotations = super().__getitem__(idx)

        # wrap the annotations in the {'image_id', 'annotations'} structure given to the processor
        target = {'image_id': self.ids[idx], 'annotations': annotations}
        encoding = self.processor(images=img, annotations=target, return_tensors="pt")

        # Drop only the leading batch axis; a bare squeeze() would also collapse any other
        # axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]  # remove batch dimension

        return pixel_values, target

In [6]:
def collate_fn(batch):
    """Assemble a list of ``(pixel_values, target)`` samples into one batch dictionary.

    Images are combined with the module-level ``processor.pad`` call, whose
    ``pixel_values`` and ``pixel_mask`` entries are forwarded; the per-image targets are
    kept as a plain list under ``labels``.

    Args:
        batch: Sequence of samples where index 0 is the image tensor and index 1 its target.

    Returns:
        dict: ``{"pixel_values": ..., "pixel_mask": ..., "labels": [...]}``.
    """
    images = [sample[0] for sample in batch]
    padded = processor.pad(images, return_tensors="pt")
    return {
        'pixel_values': padded['pixel_values'],
        'pixel_mask': padded['pixel_mask'],
        'labels': [sample[1] for sample in batch],
    }

def convert_bbox_yolo_to_pascal(boxes, image_size):
    """
    Turn relative centre-format boxes into absolute corner-format boxes.

    Input boxes follow the YOLO convention (x_center, y_center, width, height) with values
    in [0, 1]; the result follows Pascal VOC (x_min, y_min, x_max, y_max) in pixels.

    Args:
        boxes (torch.Tensor): Bounding boxes in YOLO format
        image_size (Tuple[int, int]): Image size in format (height, width)

    Returns:
        torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
    """
    # centre/size -> corner representation (still relative coordinates)
    corner_boxes = center_to_corners_format(boxes)

    # one scale factor per coordinate: x values scale with width, y values with height
    img_h, img_w = image_size
    scale = torch.tensor([[img_w, img_h, img_w, img_h]])

    return corner_boxes * scale

@dataclass
class ModelOutput:
    """Minimal container for the two fields handed to ``post_process_object_detection`` in ``compute_metrics``."""
    # Classification logits; compute_metrics fills this from entry 1 of a prediction batch.
    logits: torch.Tensor
    # Predicted boxes; compute_metrics fills this from entry 2 of a prediction batch.
    pred_boxes: torch.Tensor


@torch.no_grad()
def compute_metrics(evaluation_results, image_processor, threshold=0.0, id2label=None):
    """
    Compute mAP / mAR (overall and per class) for object detection predictions.

    Args:
        evaluation_results (EvalPrediction): Object exposing ``predictions`` and ``label_ids``,
            both organised as one entry per evaluation batch.
        image_processor: Object whose ``post_process_object_detection`` turns raw model
            outputs into per-image dictionaries of boxes, scores and labels.
        threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0.
        id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None,
            in which case the numeric class id is used in the metric names.

    Returns:
        Mapping[str, float]: ``{<metric_name>: <metric_value>}`` with values rounded to 4 decimals.
    """
    raw_predictions = evaluation_results.predictions
    raw_targets = evaluation_results.label_ids

    # The metric consumes two parallel lists of per-image dictionaries:
    #  - ground truth with keys "boxes", "labels"
    #  - predictions with keys "boxes", "scores", "labels"
    sizes_per_batch = []
    metric_targets = []
    metric_predictions = []

    # Ground truth: remember the original image sizes of every batch (needed to rescale the
    # predictions below) and convert the relative centre-format boxes to absolute corners.
    for target_batch in raw_targets:
        sizes_per_batch.append(
            torch.tensor(np.array([item["orig_size"] for item in target_batch]))
        )
        for item in target_batch:
            metric_targets.append(
                {
                    "boxes": convert_bbox_yolo_to_pascal(
                        torch.tensor(item["boxes"]), item["orig_size"]
                    ),
                    "labels": torch.tensor(item["class_labels"]),
                }
            )

    # Predictions: entries 1 and 2 of each batch hold the logits and the predicted boxes;
    # the image processor rescales them to the original image sizes.
    for prediction_batch, batch_sizes in zip(raw_predictions, sizes_per_batch):
        model_output = ModelOutput(
            logits=torch.tensor(prediction_batch[1]),
            pred_boxes=torch.tensor(prediction_batch[2]),
        )
        metric_predictions.extend(
            image_processor.post_process_object_detection(
                model_output, threshold=threshold, target_sizes=batch_sizes
            )
        )

    map_metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
    map_metric.update(metric_predictions, metric_targets)
    results = map_metric.compute()

    # Flatten the per-class tensors into one named entry per class
    class_ids = results.pop("classes")
    per_class_map = results.pop("map_per_class")
    per_class_mar = results.pop("mar_100_per_class")

    def _display_name(class_tensor):
        # class name when a mapping is available, otherwise the bare class id
        raw_id = class_tensor.item()
        return raw_id if id2label is None else id2label[raw_id]

    # If any of the three tensors is 0-dimensional, treat the result as a single class
    single_class = class_ids.ndim == 0 or per_class_map.ndim == 0 or per_class_mar.ndim == 0
    if single_class:
        name = _display_name(class_ids)
        results[f"map_{name}"] = per_class_map
        results[f"mar_100_{name}"] = per_class_mar
    else:
        for class_id, class_map, class_mar in zip(class_ids, per_class_map, per_class_mar):
            name = _display_name(class_id)
            results[f"map_{name}"] = class_map
            results[f"mar_100_{name}"] = class_mar

    return {key: round(value.item(), 4) for key, value in results.items()}

def convert_to_xywh(boxes):
    """Convert corner boxes (xmin, ymin, xmax, ymax) to (xmin, ymin, width, height).

    Args:
        boxes (torch.Tensor): Tensor whose dimension 1 holds the four corner coordinates.

    Returns:
        torch.Tensor: Tensor of the same layout with the last two columns replaced by the
        box width and height.
    """
    left, top, right, bottom = boxes.unbind(1)
    box_width = right - left
    box_height = bottom - top
    return torch.stack((left, top, box_width, box_height), dim=1)

def prepare_for_coco_detection(predictions):
    """Flatten per-image predictions into a list of COCO-style result records.

    Args:
        predictions (dict): Maps an image id to a prediction holding ``"boxes"``
            (corner format), ``"scores"`` and ``"labels"``. Entries of length 0 are skipped.

    Returns:
        list[dict]: One record per box with keys ``image_id``, ``category_id``,
        ``bbox`` (x, y, width, height) and ``score``.
    """
    records = []
    for image_id, detection in predictions.items():
        if not len(detection):
            continue

        xywh_boxes = convert_to_xywh(detection["boxes"]).tolist()
        confidences = detection["scores"].tolist()
        category_ids = detection["labels"].tolist()

        # scores and labels are looked up by the position of the box they belong to
        for position, bbox in enumerate(xywh_boxes):
            records.append(
                {
                    "image_id": image_id,
                    "category_id": category_ids[position],
                    "bbox": bbox,
                    "score": confidences[position],
                }
            )
    return records

In [8]:
# Image processor of the chosen checkpoint; resizing is bounded by IMAGE_SIZE in both directions.
processor = DetrImageProcessor.from_pretrained(
    MODEL_NAME,
    size={"max_height": IMAGE_SIZE, "max_width": IMAGE_SIZE},
    do_resize=True,
)

# Training and validation splits share the same processor.
train_dataset = CocoDetection(
    img_folder='/kaggle/input/lung-ct-version-n-512/lung_ct_version_n_512.v2i.coco/train',
    processor=processor,
)
val_dataset = CocoDetection(
    img_folder='/kaggle/input/lung-ct-version-n-512/lung_ct_version_n_512.v2i.coco/valid',
    processor=processor,
)

# Category id <-> name lookups derived from the training annotations.
cats = train_dataset.coco.cats
id2label = {category_id: category['name'] for category_id, category in cats.items()}
label2id = {name: category_id for category_id, name in id2label.items()}

print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(val_dataset))
print(id2label)
print(label2id)

# Metric callback with everything except the evaluation results already bound.
eval_compute_metrics_fn = partial(
    compute_metrics,
    threshold=0.0,
    id2label=id2label,
    image_processor=processor,
)

preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Number of training examples: 2417
Number of validation examples: 117
{0: 'lung_ct', 1: 'nodule'}
{'lung_ct': 0, 'nodule': 1}


In [9]:
# Detection model initialised from the pretrained checkpoint with this dataset's label set.
# ignore_mismatched_sizes presumably lets the differently sized pretrained head be
# re-initialised instead of raising — confirm against the transformers documentation.
model = DetrForObjectDetection.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

training_args = TrainingArguments(
    output_dir="finetune_detr_r50_dc5_version",
    # what to run
    do_train=True,
    do_eval=True,
    # optimisation
    num_train_epochs=50,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=1e-4,
    max_grad_norm=0.01,
    fp16=False,
    # evaluate and checkpoint once per epoch, keeping the best model by eval_map
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_map",
    # data handling: compute_metrics iterates predictions and targets batch by batch,
    # so evaluation batches must not be concatenated
    remove_unused_columns=False,
    eval_do_concat_batches=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=eval_compute_metrics_fn,
    tokenizer=processor,
)

config.json:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50-dc5 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model check

In [10]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput is shown as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111539638888909, max=1.0)…

Epoch,Training Loss,Validation Loss,Map,Map 50,Map 75,Map Small,Map Medium,Map Large,Mar 1,Mar 10,Mar 100,Mar Small,Mar Medium,Mar Large,Map Nodule,Mar 100 Nodule,Map Lung Ct,Mar 100 Lung Ct
1,No log,1.595221,0.0023,0.0091,0.0003,0.0022,0.0588,-1.0,0.0051,0.0598,0.212,0.2028,0.3667,-1.0,0.0023,0.212,,
2,1.779000,1.097074,0.0104,0.0365,0.0038,0.0098,0.2136,-1.0,0.0316,0.1752,0.3778,0.3688,0.5,-1.0,0.0104,0.3778,,
3,1.779000,1.096096,0.0629,0.1866,0.034,0.0594,0.2737,-1.0,0.1205,0.3077,0.3718,0.3477,0.6778,-1.0,0.0629,0.3718,,
4,1.152800,0.93399,0.1218,0.2937,0.0691,0.1144,0.4515,-1.0,0.2299,0.412,0.4581,0.4477,0.6111,-1.0,0.1218,0.4581,,
5,1.013400,1.135004,0.092,0.2863,0.0346,0.0767,0.3979,-1.0,0.1479,0.3154,0.3316,0.3147,0.5667,-1.0,0.092,0.3316,,
6,1.013400,1.044507,0.0708,0.1561,0.0539,0.0702,0.2073,-1.0,0.1735,0.3735,0.5103,0.5073,0.5444,-1.0,0.0708,0.5103,-1.0,-1.0
7,0.989500,0.886797,0.1016,0.2644,0.0488,0.0969,0.4482,-1.0,0.2068,0.3812,0.4786,0.4688,0.6444,-1.0,0.1016,0.4786,-1.0,-1.0
8,0.989500,0.967454,0.0776,0.2527,0.0293,0.0804,0.4038,-1.0,0.1342,0.3496,0.4274,0.4128,0.6333,-1.0,0.0776,0.4274,-1.0,-1.0
9,0.927100,0.82137,0.2185,0.5622,0.1565,0.2198,0.4683,-1.0,0.2795,0.4453,0.5385,0.5266,0.6889,-1.0,0.2185,0.5385,-1.0,-1.0
10,0.891700,0.879487,0.2087,0.5562,0.1555,0.2075,0.4453,-1.0,0.2744,0.4248,0.4949,0.4807,0.6778,-1.0,0.2087,0.4949,-1.0,-1.0


TrainOutput(global_step=15150, training_loss=0.6967615692843698, metrics={'train_runtime': 9671.8513, 'train_samples_per_second': 12.495, 'train_steps_per_second': 1.566, 'total_flos': 2.3651518909906944e+19, 'train_loss': 0.6967615692843698, 'epoch': 50.0})

In [11]:
# Evaluate the fine-tuned model on the held-out test split with the COCO evaluator.
test_dataset = CocoDetection(img_folder='/kaggle/input/lung-ct-version-n-512/lung_ct_version_n_512.v2i.coco/test', processor=processor)
test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=4)
# initialize evaluator with ground truth (gt)
evaluator = CocoEvaluator(coco_gt=test_dataset.coco, iou_types=["bbox"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in inference mode explicitly instead of relying on whatever mode the
# Trainer left it in; train-mode layers such as dropout would make the results non-deterministic.
model.eval()

print("Running evaluation...")
for batch in tqdm(test_dataloader):
    # get the inputs
    pixel_values = batch["pixel_values"].to(device)
    pixel_mask = batch["pixel_mask"].to(device)
    labels = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]] # these are in DETR format, resized + normalized

    # forward pass
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # turn into a list of dictionaries (one item for each example in the batch),
    # rescaled to the original image sizes
    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    results = processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0)

    # provide to metric
    # metric expects a list of dictionaries, each item
    # containing image_id, category_id, bbox and score keys
    predictions = {target['image_id'].item(): output for target, output in zip(labels, results)}
    predictions = prepare_for_coco_detection(predictions)
    evaluator.update(predictions)

evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Running evaluation...


  0%|          | 0/57 [00:00<?, ?it/s]

Accumulating evaluation results...
DONE (t=0.24s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.443
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.838
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.363
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.442
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.542
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.477
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.568
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.586
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.584
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.658
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= l

In [12]:
# Recursively archive everything under /kaggle/working into runs.zip.
!zip -r runs.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/wandb/ (stored 0%)
  adding: kaggle/working/wandb/debug-internal.log (deflated 67%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/ (stored 0%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/tmp/ (stored 0%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/tmp/code/ (stored 0%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/logs/ (stored 0%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/logs/debug-internal.log (deflated 67%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/logs/debug-core.log (deflated 57%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/logs/debug.log (deflated 69%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/run-bzy02g8e.wandb (deflated 80%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/files/ (stored 0%)
  adding: kaggle/working/wandb/run-20241105_025707-bzy02g8e/files/requirements.txt (de

In [None]:
# Persist the Hugging Face access token via HfFolder.save_token without embedding it in the notebook.
# SECURITY: a literal token used to be hard-coded in this cell. Treat that token as leaked and
# revoke it in the Hugging Face account settings.
from getpass import getpass
from huggingface_hub.hf_api import HfFolder

# Prefer the HF_TOKEN environment variable; fall back to an interactive, non-echoing prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
HfFolder.save_token(hf_token)

In [None]:
# Publish the fine-tuned detector and its image processor to the same Hub repository.
HUB_REPO_ID = "Toshiiiii1/detr-lung-ct5"
# Push the full DetrForObjectDetection object: `model.model` is only the inner base model, so
# pushing it would leave the fine-tuned classification and box-regression heads out of the upload.
model.push_to_hub(HUB_REPO_ID)
processor.push_to_hub(HUB_REPO_ID)