In [8]:
""" Train Model """
# Weights file used for inference. NOTE(review): the dataset name suggests YOLOv8-L
# weights; a later cell reassigns `model_path` to a DETR checkpoint — confirm which
# one is meant to be active.
model_path = "/kaggle/input/byu-a-106-yolov8l-daexp-mixupexp/best.pt"

""" [IMPORTANT]
* This parameter has a significant impact on the value of LB since it is the threshold for the prediction score inferred by the model.
* In my experiments, 0.5 to 0.55 is optimal for local CV, but when submitting, 0.35 to 0.45 seems to give better results, so there is a difference.
"""
# Minimum score a detection must reach to be kept. A later cell reassigns this to 0.45.
CONFIDENCE_THRESHOLD = 0.40

# NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed.
MAX_DETECTIONS_PER_TOMO = 3
# Handed to perform_3d_nms(), which multiplies it by a fixed box size to obtain
# the suppression distance.
NMS_IOU_THRESHOLD = 0.2
# Fraction of each tomogram's slices that process_tomogram() runs inference on (1 = every slice).
CONCENTRATION = 1
# Starting batch size; the device-setup code overwrites it for both the GPU and the CPU case.
BATCH_SIZE = 8 

In [3]:
"""[INFO]
* This notebook installs Ultralytics v8.3.88 (released 2025/03/11),
  which supports YOLO12, the latest model family.
* If you need a newer version, you can make it available by running and attaching the notebook.
  https://www.kaggle.com/code/hideyukizushi/ultralytics-offlineinstall-yolo12-weights
"""
# Unpack the offline package archive from the attached Kaggle dataset
# (presumably this creates the ./packages directory used below — verify).
!tar xfvz /kaggle/input/ultralytics-offlineinstall-yolo12-weights/archive.tar.gz
# Install ultralytics from local files only: --no-index disables the package index,
# --find-links points pip at ./packages.
!pip install --no-index --find-links=./packages ultralytics
# Delete the unpacked packages once installation is done.
!rm -rf ./packages

tar: Error opening archive: Failed to open '/kaggle/input/ultralytics-offlineinstall-yolo12-weights/archive.tar.gz'


Looking in links: ./packages


'rm'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [4]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import cv2
from tqdm.notebook import tqdm
from ultralytics import YOLO
import threading
import time
from contextlib import nullcontext
from concurrent.futures import ThreadPoolExecutor

In [5]:
# Seed the global NumPy and PyTorch random number generators with a fixed value.
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x18dda778f30>

In [6]:
# Root of the competition dataset.
data_path = "/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/"
# Directory that holds one sub-directory of slice images per test tomogram.
test_dir = os.path.join(data_path, "test")
# Where generate_submission() writes the resulting CSV.
submission_path = "/kaggle/working/submission.csv"

In [7]:
class GPUProfiler:
    """Context manager that measures the wall-clock time of the enclosed block.

    When CUDA is available, ``torch.cuda.synchronize()`` is called on entry and
    on exit so that GPU work queued inside the block is part of the measurement.

    Attributes:
        name: Label identifying the profiled section.
        start_time: ``time.time()`` captured on entry (``None`` before entry).
        elapsed: Seconds spent inside the block, set on exit (``None`` before exit).
    """

    def __init__(self, name):
        self.name = name
        self.start_time = None
        self.elapsed = None

    def __enter__(self):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        self.start_time = time.time()
        return self

    def __exit__(self, *args):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        # Keep the measurement on the instance instead of discarding it;
        # printing stays disabled so the notebook output remains quiet.
        self.elapsed = time.time() - self.start_time
        # print(f"[PROFILE] {self.name}: {self.elapsed:.3f}s")


# Choose the compute device: the first CUDA GPU when one is present, else the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

if not device.startswith('cuda'):
    print("GPU not available, using CPU")
    # Smaller batches when running on the CPU
    BATCH_SIZE = 4
else:
    # Speed-oriented cuDNN / TF32 switches
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Report which GPU is in use and its total memory in GB
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Using GPU: {gpu_name} with {gpu_mem:.2f} GB memory")

    # Total memory minus what PyTorch has allocated so far, then a rough
    # "4 images per GB" estimate clamped to the range 8..32.
    free_mem = gpu_mem - torch.cuda.memory_allocated(0) / 1e9
    BATCH_SIZE = min(32, int(free_mem * 4))
    BATCH_SIZE = max(8, BATCH_SIZE)
    print(f"Dynamic batch size set to {BATCH_SIZE} based on {free_mem:.2f}GB free memory")

Using GPU: NVIDIA GeForce RTX 3090 Ti with 25.76 GB memory
Dynamic batch size set to 32 based on 25.76GB free memory


In [ ]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import cv2
from tqdm.notebook import tqdm
import threading
import time
from contextlib import nullcontext
from concurrent.futures import ThreadPoolExecutor
import torchvision.transforms as T

# Existing constants and variable declarations are kept as they were
CONFIDENCE_THRESHOLD = 0.45  # detection confidence threshold
NMS_IOU_THRESHOLD = 0.2       # 3D NMS threshold
CONCENTRATION = 1             # fraction of slices to process (for quick submissions)

# Test data path and submission file path
data_path = "/kaggle/input/byu-locating-bacterial-flagellar-motors-2025/"
test_dir = os.path.join(data_path, "test")
submission_path = "/kaggle/working/submission.csv"

# Model checkpoint path (now a DETR model checkpoint)
model_path = "/kaggle/input/train-detr/detr_weights/checkpoint.pth"

# GPU configuration
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 8
if not device.startswith('cuda'):
    print("GPU not available, using CPU")
    BATCH_SIZE = 4
else:
    # Speed-oriented cuDNN / TF32 switches
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Using GPU: {gpu_name} with {gpu_mem:.2f} GB memory")

    # Total memory minus PyTorch's current allocations; batch size is
    # four images per GB, clamped to the range 8..32.
    free_mem = gpu_mem - torch.cuda.memory_allocated(0) / 1e9
    BATCH_SIZE = min(32, int(free_mem * 4))
    BATCH_SIZE = max(8, BATCH_SIZE)
    print(f"Dynamic batch size set to {BATCH_SIZE} based on {free_mem:.2f}GB free memory")

# GPU profiling context manager (same as the earlier definition, but with the timing print enabled)
class GPUProfiler:
    """Timing context manager that prints how long the wrapped block took.

    The CUDA device is synchronized (when CUDA is available) before the clock
    starts and again before it stops, and the elapsed wall-clock time is
    printed with a ``[PROFILE]`` prefix and the given ``name``.
    """

    def __init__(self, name):
        self.start_time = None
        self.name = name

    @staticmethod
    def _wait_for_gpu():
        """Call ``torch.cuda.synchronize()`` when CUDA is available."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def __enter__(self):
        self._wait_for_gpu()
        self.start_time = time.time()
        return self

    def __exit__(self, *args):
        self._wait_for_gpu()
        elapsed = time.time() - self.start_time
        print(f"[PROFILE] {self.name}: {elapsed:.3f}s")

# -------------------------------------------------------------------
# The section below wraps the DETR model in a YOLO-like interface.
# DETRWrapper is used in place of YOLO; it also defines fuse() and to() so the rest of the code runs unchanged.
# -------------------------------------------------------------------

class Boxes:
    """Detections of one image, exposed as ``conf`` and ``xyxy`` tensors.

    Both tensors are stored on the CPU: a CUDA tensor is copied to host
    memory, any other tensor is stored as given.
    """

    def __init__(self, conf, xyxy):
        # conf: torch.Tensor of scores, xyxy: torch.Tensor of boxes
        self.conf, self.xyxy = (
            tensor.cpu() if tensor.is_cuda else tensor
            for tensor in (conf, xyxy)
        )

class Result:
    """Per-image result holder.

    Attributes:
        boxes: The object passed to the constructor; the rest of the notebook
            reads ``boxes.conf`` and ``boxes.xyxy`` from it.
    """

    def __init__(self, boxes):
        # Stored untouched so callers can read result.boxes.
        self.boxes = boxes

class DETRWrapper:
    """Wrap a DETR model behind the small YOLO-like interface this notebook uses.

    Calling the wrapper with a list of image paths returns one ``Result`` per
    image whose ``boxes.conf`` / ``boxes.xyxy`` hold the detections that passed
    ``CONFIDENCE_THRESHOLD``. ``fuse()`` and ``to()`` exist so that code written
    for a YOLO model can call them unchanged.
    """

    def __init__(self, model_path):
        """Build ``detr_resnet50`` through ``torch.hub`` and load weights from ``model_path``.

        Args:
            model_path: Path to a checkpoint file whose ``'model'`` entry is the
                state dict to load.
        """
        # NOTE(review): torch.hub.load needs the repository to be reachable or
        # already cached — confirm this works in an offline submission run.
        self.model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=False)
        checkpoint = torch.load(model_path, map_location='cpu')
        # Assumes the weights are stored under checkpoint['model'] — TODO confirm
        # against the notebook that produced the checkpoint.
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()
        self.device = device
        self.model.to(device)
        # Input preprocessing: tensor conversion followed by mean/std normalization
        self.transform = T.Compose([
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406],
                        [0.229, 0.224, 0.225])
        ])

    def fuse(self):
        """Do nothing; present only so YOLO-style callers can call ``fuse()``."""
        pass

    def to(self, device):
        """Move the wrapped model to ``device`` and return ``self`` for chaining."""
        self.device = device
        self.model.to(device)
        return self

    def __call__(self, image_paths, verbose=False):
        """Run detection on a list of image files.

        Args:
            image_paths: Sequence of image file paths. The images are stacked
                into one batch tensor, so they must all have the same size.
            verbose: Accepted for interface compatibility; not used.

        Returns:
            A list with one ``Result`` per input path, in input order. An empty
            input yields an empty list.
        """
        image_paths = list(image_paths)
        if not image_paths:
            # torch.stack() raises on an empty list, so answer directly.
            return []

        images = []
        original_sizes = []
        for path in image_paths:
            # Read as RGB and remember the original size; the context manager
            # closes the underlying file once the pixels are converted.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
            original_sizes.append(img.size)  # (width, height)
            images.append(self.transform(img))

        batch = torch.stack(images).to(self.device)
        with torch.no_grad():
            # outputs: dict with keys 'pred_logits' and 'pred_boxes'
            outputs = self.model(batch)

        kept = self._filter_detections(outputs, original_sizes, CONFIDENCE_THRESHOLD)
        return [Result(Boxes(scores, boxes)) for scores, boxes in kept]

    def _filter_detections(self, outputs, original_sizes, conf_threshold):
        """Convert raw DETR outputs into per-image ``(scores, xyxy_boxes)`` pairs.

        Args:
            outputs: Dict with ``'pred_logits'`` (batch, queries, classes + 1)
                and ``'pred_boxes'`` (batch, queries, 4) in normalized
                ``[cx, cy, w, h]`` form.
            original_sizes: One ``(width, height)`` tuple per image.
            conf_threshold: Queries whose best object-class probability is not
                above this value are dropped.

        Returns:
            A list of ``(scores, boxes)`` tensor tuples, boxes in pixel
            ``[x1, y1, x2, y2]`` coordinates.
        """
        # The last logit is DETR's "no object" class. It is dropped after the
        # softmax so that confident background queries are never reported.
        probas = outputs['pred_logits'].softmax(-1)[..., :-1]
        per_image = []
        for i, (width, height) in enumerate(original_sizes):
            scores, _labels = probas[i].max(-1)
            keep = scores > conf_threshold
            boxes = self.box_cxcywh_to_xyxy(outputs['pred_boxes'][i])
            # Scale normalized coordinates up to the original image size.
            boxes = boxes * boxes.new_tensor([width, height, width, height])
            per_image.append((scores[keep], boxes[keep]))
        return per_image

    def box_cxcywh_to_xyxy(self, x):
        """Convert boxes from ``(cx, cy, w, h)`` to ``(x1, y1, x2, y2)``."""
        x_c, y_c, w, h = x.unbind(1)
        bboxes = torch.stack([x_c - 0.5 * w, y_c - 0.5 * h,
                               x_c + 0.5 * w, y_c + 0.5 * h], dim=1)
        return bboxes

# -------------------------------------------------------------------
# The remaining functions below are kept as they were (they work without modification after the switch from YOLO to DETR).
# -------------------------------------------------------------------

def normalize_slice(slice_data):
    """
    Stretch a slice's contrast to the 0..255 range using its 2nd and 98th percentiles.

    Values are clipped to the [p2, p98] interval and then mapped linearly so
    that p2 becomes 0 and p98 becomes 255.

    Args:
        slice_data: Array-like of numeric pixel values.

    Returns:
        A ``uint8`` array with the same shape as ``slice_data``. When the two
        percentiles coincide (e.g. a constant slice) there is no contrast to
        stretch and an all-zero array is returned.
    """
    p2 = np.percentile(slice_data, 2)
    p98 = np.percentile(slice_data, 98)
    value_range = p98 - p2
    if not value_range > 0:
        # Avoid dividing by zero (or by NaN), which would feed NaN/inf into the uint8 cast.
        return np.zeros(np.shape(slice_data), dtype=np.uint8)
    clipped_data = np.clip(slice_data, p2, p98)
    normalized = 255 * (clipped_data - p2) / value_range
    return np.uint8(normalized)

def preload_image_batch(file_paths):
    """Read every file in ``file_paths`` into memory and return the arrays as a list.

    OpenCV is tried first; when ``cv2.imread`` yields ``None`` the file is
    opened with PIL instead and converted to a NumPy array.
    """
    def _read(path):
        pixels = cv2.imread(path)
        return np.array(Image.open(path)) if pixels is None else pixels

    return [_read(path) for path in file_paths]

def process_tomogram(tomo_id, model, index=0, total=1):
    """
    Run the detector over one tomogram's slices and return its single best motor location.

    Args:
        tomo_id: Name of the tomogram's sub-directory under ``test_dir``.
        model: Callable invoked as ``model(list_of_image_paths, verbose=False)``
            that returns one result per image exposing ``boxes.conf`` and
            ``boxes.xyxy``.
        index: Not used in the body.
        total: Not used in the body.

    Returns:
        A dict with ``'tomo_id'`` and ``'Motor axis 0'``, ``'Motor axis 1'``,
        ``'Motor axis 2'`` holding the z, y, x of the highest-confidence
        detection left after 3D NMS, or -1 for all three axes when nothing
        was detected.
    """
    tomo_dir = os.path.join(test_dir, tomo_id)
    # Only .jpg files count as slices; sorting by name fixes the processing order.
    slice_files = sorted([f for f in os.listdir(tomo_dir) if f.endswith('.jpg')])
    
    # Apply CONCENTRATION to reduce the number of slices processed:
    # pick evenly spaced indices across the sorted slice list.
    selected_indices = np.linspace(0, len(slice_files)-1, int(len(slice_files) * CONCENTRATION))
    selected_indices = np.round(selected_indices).astype(int)
    slice_files = [slice_files[i] for i in selected_indices]
    
    all_detections = []
    # On CUDA, create up to four streams; each batch is split into one sub-batch per stream.
    if device.startswith('cuda'):
        streams = [torch.cuda.Stream() for _ in range(min(4, BATCH_SIZE))]
    else:
        streams = [None]
    
    next_batch_thread = None
    next_batch_images = None
    
    for batch_start in range(0, len(slice_files), BATCH_SIZE):
        # Wait for the preload thread started during the previous iteration.
        if next_batch_thread is not None:
            next_batch_thread.join()
            next_batch_images = None
            
        batch_end = min(batch_start + BATCH_SIZE, len(slice_files))
        batch_files = slice_files[batch_start:batch_end]
        
        next_batch_start = batch_end
        next_batch_end = min(next_batch_start + BATCH_SIZE, len(slice_files))
        next_batch_files = slice_files[next_batch_start:next_batch_end] if next_batch_start < len(slice_files) else []
        
        # Start reading the following batch's files in a background thread.
        # NOTE(review): the thread's return value is never captured and the model
        # is called with file paths, so the loaded images are not reused here —
        # presumably this only warms the file cache; confirm it is worth keeping.
        if next_batch_files:
            next_batch_paths = [os.path.join(tomo_dir, f) for f in next_batch_files]
            next_batch_thread = threading.Thread(target=preload_image_batch, args=(next_batch_paths,))
            next_batch_thread.start()
        else:
            next_batch_thread = None
        
        # Split the batch into as many sub-batches as there are streams (some may be empty).
        sub_batches = np.array_split(batch_files, len(streams))
        sub_batch_results = []
        
        for i, sub_batch in enumerate(sub_batches):
            if len(sub_batch) == 0:
                continue
                
            stream = streams[i % len(streams)]
            # Run under the sub-batch's CUDA stream when there is one, otherwise under a no-op context.
            with torch.cuda.stream(stream) if stream and device.startswith('cuda') else nullcontext():
                sub_batch_paths = [os.path.join(tomo_dir, slice_file) for slice_file in sub_batch]
                # Slice number = integer between the first '_' and the following '.'
                # (assumes names shaped like "<prefix>_<number>.jpg" — TODO confirm).
                sub_batch_slice_nums = [int(slice_file.split('_')[1].split('.')[0]) for slice_file in sub_batch]
                
                with GPUProfiler(f"Inference batch {i+1}/{len(sub_batches)}"):
                    sub_results = model(sub_batch_paths, verbose=False)
                
                # Collect the centre of every box whose confidence reaches the threshold.
                for j, result in enumerate(sub_results):
                    if len(result.boxes.conf) > 0:
                        boxes = result.boxes
                        for box_idx, confidence in enumerate(boxes.conf):
                            if confidence >= CONFIDENCE_THRESHOLD:
                                x1, y1, x2, y2 = boxes.xyxy[box_idx].cpu().numpy()
                                x_center = (x1 + x2) / 2
                                y_center = (y1 + y2) / 2
                                all_detections.append({
                                    'z': round(sub_batch_slice_nums[j]),
                                    'y': round(y_center),
                                    'x': round(x_center),
                                    'confidence': float(confidence)
                                })
        
        # Wait for all GPU work of this batch before moving on.
        if device.startswith('cuda'):
            torch.cuda.synchronize()
    
    # Make sure the last preload thread has finished before returning.
    if next_batch_thread is not None:
        next_batch_thread.join()
    
    # Merge nearby detections, then order the survivors by confidence (highest first).
    final_detections = perform_3d_nms(all_detections, NMS_IOU_THRESHOLD)
    final_detections.sort(key=lambda x: x['confidence'], reverse=True)
    
    # No detection at all: report -1 on every axis.
    if not final_detections:
        return {
            'tomo_id': tomo_id,
            'Motor axis 0': -1,
            'Motor axis 1': -1,
            'Motor axis 2': -1
        }
    
    best_detection = final_detections[0]
    return {
        'tomo_id': tomo_id,
        'Motor axis 0': round(best_detection['z']),
        'Motor axis 1': round(best_detection['y']),
        'Motor axis 2': round(best_detection['x'])
    }

def perform_3d_nms(detections, iou_threshold):
    """
    Greedy distance-based suppression of 3D detections.

    Detections are visited from highest to lowest confidence. A detection is
    kept only when its Euclidean distance (over 'z', 'y', 'x') to every
    detection kept so far is greater than ``24 * iou_threshold``.

    Args:
        detections: List of dicts with 'z', 'y', 'x' and 'confidence' keys.
        iou_threshold: Factor applied to the fixed box size of 24 to obtain
            the suppression distance.

    Returns:
        The kept detections, ordered by descending confidence. An empty list
        when there is nothing to process.
    """
    if not detections:
        return []

    box_size = 24
    min_separation = box_size * iou_threshold

    def _gap(a, b):
        return np.sqrt(sum((a[axis] - b[axis]) ** 2 for axis in ('z', 'y', 'x')))

    ranked = sorted(detections, key=lambda det: det['confidence'], reverse=True)

    kept = []
    for candidate in ranked:
        # Survives only when it is far enough from every stronger kept detection.
        if all(_gap(candidate, winner) > min_separation for winner in kept):
            kept.append(candidate)

    return kept

def debug_image_loading(tomo_id):
    """
    Smoke-test image loading and model inference on one slice of a tomogram.

    The middle ``.jpg`` slice (by sorted name) of ``test_dir/tomo_id`` is read
    with PIL and OpenCV, then passed through a freshly built ``DETRWrapper``.
    Failures are printed rather than raised; nothing is returned.
    """
    tomo_dir = os.path.join(test_dir, tomo_id)
    jpg_names = [name for name in os.listdir(tomo_dir) if name.endswith('.jpg')]
    jpg_names.sort()

    if len(jpg_names) == 0:
        print(f"No image files found in {tomo_dir}")
        return

    # Use the slice in the middle of the sorted list as the sample.
    img_path = os.path.join(tomo_dir, jpg_names[len(jpg_names) // 2])

    # Step 1: the file must be readable through PIL and through OpenCV.
    try:
        np.array(Image.open(img_path))
        cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    except Exception as err:
        print(f"Error loading image {img_path}: {err}")

    # Step 2: the model must load and accept the sample image.
    try:
        DETRWrapper(model_path)([img_path], verbose=False)
    except Exception as err:
        print(f"Error with DETR processing: {err}")

def generate_submission():
    """
    Run inference on every test tomogram and write the submission CSV.

    Each sub-directory of ``test_dir`` is treated as one tomogram. The first
    tomogram is passed to ``debug_image_loading`` as a smoke test, then a
    ``DETRWrapper`` is built from ``model_path`` and every tomogram is handled
    by ``process_tomogram`` on a single worker thread. A tomogram whose
    processing raises is recorded with -1 on all three axes.

    Returns:
        pandas.DataFrame with the columns 'tomo_id', 'Motor axis 0',
        'Motor axis 1', 'Motor axis 2'. The same frame is written to
        ``submission_path``. With no test tomograms the frame is empty but
        still has these columns.
    """
    submission_columns = ['tomo_id', 'Motor axis 0', 'Motor axis 1', 'Motor axis 2']

    test_tomos = sorted([d for d in os.listdir(test_dir) if os.path.isdir(os.path.join(test_dir, d))])
    total_tomos = len(test_tomos)

    if test_tomos:
        debug_image_loading(test_tomos[0])

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Load the model through DETRWrapper (the rest of the pipeline is unchanged)
    model = DETRWrapper(model_path)
    model.to(device)

    if device.startswith('cuda'):
        model.fuse()
        if torch.cuda.get_device_capability(0)[0] >= 7:
            # Half precision needs separate handling for DETR, so it is skipped here
            pass

    results = []
    motors_found = 0

    with ThreadPoolExecutor(max_workers=1) as executor:
        future_to_tomo = {}

        for i, tomo_id in enumerate(test_tomos, 1):
            future = executor.submit(process_tomogram, tomo_id, model, i, total_tomos)
            future_to_tomo[future] = tomo_id

        # Dicts keep insertion order, so results are collected in submission order.
        for future in future_to_tomo:
            tomo_id = future_to_tomo[future]
            try:
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                result = future.result()
                results.append(result)

                # process_tomogram reports "no motor" as -1 (not NaN), so test for
                # the sentinel; NaN is still treated as "no motor" as well.
                z_value = result['Motor axis 0']
                has_motor = (not pd.isna(z_value)) and z_value != -1
                if has_motor:
                    motors_found += 1
                    print(f"Motor found in {tomo_id} at position: "
                          f"z={result['Motor axis 0']}, y={result['Motor axis 1']}, x={result['Motor axis 2']}")
                else:
                    print(f"No motor detected in {tomo_id}")

                print(f"Current detection rate: {motors_found}/{len(results)} ({motors_found/len(results)*100:.1f}%)")

            except Exception as e:
                # Keep the submission complete even when one tomogram fails.
                print(f"Error processing {tomo_id}: {e}")
                results.append({
                    'tomo_id': tomo_id,
                    'Motor axis 0': -1,
                    'Motor axis 1': -1,
                    'Motor axis 2': -1
                })

    # Passing `columns` fixes the column order and also works for an empty result
    # list, where selecting the columns afterwards would raise KeyError.
    submission_df = pd.DataFrame(results, columns=submission_columns)
    submission_df.to_csv(submission_path, index=False)
    print("="*50)
    print("= Submission preview:")
    print("="*50)
    print(submission_df.head())

    return submission_df

# Entry point: build the submission and report how long the whole run took.
if __name__ == "__main__":
    start_time = time.time()
    # generate_submission() also writes the CSV to submission_path.
    submission = generate_submission()
    elapsed = time.time() - start_time
    print(f"\nTotal execution time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
