Detection

In [None]:
import os
import pandas as pd
from PIL import Image

# Placeholder locations for the CSV -> YOLO label conversion; replace before running.
# images_dir: folder containing the image files named in the CSV's 'filename' column.
images_dir = "path/to/images"
# csv_file: annotation table; the conversion loop below reads the columns
# 'filename', 'class', 'x1', 'y1', 'x2', 'y2' from it.
csv_file = "path/to/annotations.csv"
# output_dir: folder that receives one .txt label file per image.
output_dir = "path/to/yolo_format"

# Load all annotation rows into memory once.
annotations = pd.read_csv(csv_file)

def normalize_coordinates(x1, y1, x2, y2, img_width, img_height):
    """Convert a corner-format box into a normalized centre/size box.

    Args:
        x1, y1: coordinates of one corner of the box.
        x2, y2: coordinates of the opposite corner.
        img_width, img_height: image dimensions used as the divisors.

    Returns:
        Tuple (x_center, y_center, width, height); x values are divided by
        img_width and y values by img_height.
    """
    box_w = x2 - x1
    box_h = y2 - y1
    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2
    return (
        mid_x / img_width,
        mid_y / img_height,
        box_w / img_width,
        box_h / img_height,
    )

# Write one label file per image, one line per box:
#   "<class> <x_center> <y_center> <width> <height>"  (box values normalized by image size)

# open(..., 'w') below does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# groupby hands over each image's rows once instead of re-filtering the whole table for
# every image; sort=False keeps the images in first-appearance order.
for img_name, img_annotations in annotations.groupby('filename', sort=False):
    img_path = os.path.join(images_dir, img_name)
    # Only the size is needed; the context manager releases the file handle immediately.
    with Image.open(img_path) as img:
        img_width, img_height = img.size

    # Replace whatever extension the image has with .txt. Chained str.replace only knew
    # lowercase .jpg/.png (so "a.jpeg" kept its image extension) and also rewrote a ".jpg"
    # occurring in the middle of a name.
    label_name = os.path.splitext(img_name)[0] + '.txt'
    txt_file = os.path.join(output_dir, label_name)

    with open(txt_file, 'w') as f:
        for _, row in img_annotations.iterrows():
            class_id = row['class']
            x_center, y_center, width, height = normalize_coordinates(
                row['x1'], row['y1'], row['x2'], row['y2'], img_width, img_height
            )
            f.write(f"{class_id} {x_center} {y_center} {width} {height}\n")

In [None]:
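# Example dataset.yaml: save the lines below (uncommented) as a separate YAML file
# and point DATA_YAML in the training cell at that file.
# train: path/to/dataset/images/train
# val: path/to/dataset/images/val
# test: path/to/dataset/images/test

# nc: 3
# names: ['car', 'person', 'dog']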

In [None]:
import os
import pandas as pd
from ultralytics import YOLO

# Placeholder locations; replace before running.
DATA_YAML = "path/to/dataset.yaml"   # dataset description file passed to model.train
TEST_DIR = "path/to/test"            # folder with the images to run inference on
OUTPUT_CSV = "submission.csv"        # where the predictions table is written
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")  # compared against the lower-cased file name

# Ultralytics publishes the YOLO11 nano checkpoint as "yolo11n.pt" (no "v" in the name);
# "yolov11n.pt" is not a released weight file, so it cannot be resolved or downloaded.
model = YOLO("yolo11n.pt")

model.train(
    data=DATA_YAML,
    epochs=10,
    imgsz=640,
    batch=16,
    lr0=1e-3,
    momentum=0.9,
    augment=True
)

# Ask the trainer where it stored the best checkpoint instead of hard-coding a run folder:
# Ultralytics auto-numbers run directories, so a fixed "runs/train/exp/..." path does not
# reliably point at the run that just finished.
model_path = str(model.trainer.best)
model = YOLO(model_path)

# Case-insensitive extension match; sorted so the submission rows come out in a
# reproducible order (os.listdir makes no ordering promise).
test_images = sorted(
    f for f in os.listdir(TEST_DIR) if f.lower().endswith(IMAGE_EXTENSIONS)
)

predictions = []

for img_name in test_images:
    img_path = os.path.join(TEST_DIR, img_name)
    results = model(img_path)

    for result in results:
        # Each row of boxes.data unpacks to corner coordinates, confidence and class index.
        for box in result.boxes.data:
            x1, y1, x2, y2, conf, cls = box.tolist()
            predictions.append((img_name, int(cls), conf, x1, y1, x2, y2))

df = pd.DataFrame(predictions, columns=["filename", "class", "confidence", "x1", "y1", "x2", "y2"])
df.to_csv(OUTPUT_CSV, index=False)

Classification

In [None]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoModelForImageClassification, AdamW, get_linear_schedule_with_warmup

# Seed every RNG the pipeline touches so the split and the augmentations are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tunable settings for the fine-tuning run.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"
MODEL_NAME = "google/vit-base-patch16-224"
OUTPUT_DIR = "vit_finetuned"
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
VAL_SPLIT = 0.1
NUM_WORKERS = 4
IMG_SIZE = 224
# Per-channel normalization expected by the MODEL_NAME checkpoint: its image processor uses
# mean = std = 0.5, not the torchvision ImageNet statistics (0.485/0.229, ...).
# If MODEL_NAME changes, update these to match that checkpoint's preprocessor config.
NORM_MEAN = [0.5, 0.5, 0.5]
NORM_STD = [0.5, 0.5, 0.5]

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# The second CSV column is treated as the class label. Sorting the distinct values makes
# the label <-> integer id mapping deterministic across runs.
labels_unique = sorted(train_df.iloc[:, 1].unique())
label2id = {lab: i for i, lab in enumerate(labels_unique)}
id2label = {i: lab for lab, i in label2id.items()}
train_df['label_id'] = train_df.iloc[:, 1].map(label2id).astype(int)

# Stratified hold-out so the validation set keeps the class proportions of the training data.
from sklearn.model_selection import train_test_split
train_df_split, val_df = train_test_split(train_df, test_size=VAL_SPLIT, stratify=train_df['label_id'], random_state=SEED)

# Training: random crop + horizontal flip as augmentation, then tensor + normalization.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])
# Validation / test: deterministic resize only, same normalization as training.
transform_val = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

class ImageDataset(Dataset):
    """Dataset of image files listed in a DataFrame, optionally paired with labels.

    Every item is a dict holding "pixel_values" (the RGB image after the optional
    transform) and, when a label column was supplied, "labels" as a Python int.
    """

    def __init__(self, df, img_col=0, label_col=None, transform=None):
        """Collect the image paths and, if requested, the labels.

        Args:
            df: table with one row per image.
            img_col: positional index of the column that holds the image paths.
            label_col: name of the label column, or None for unlabeled data.
            transform: optional callable applied to each opened RGB image.
        """
        self.transform = transform
        self.paths = df.iloc[:, img_col].values.tolist()
        if label_col is None:
            self.labels = None
        else:
            self.labels = df[label_col].values.tolist()

    def __len__(self):
        """Return the number of images."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Open image `idx` as RGB, apply the transform, and wrap it in a dict."""
        picture = Image.open(self.paths[idx]).convert("RGB")
        if self.transform:
            picture = self.transform(picture)
        sample = {"pixel_values": picture}
        if self.labels is not None:
            sample["labels"] = int(self.labels[idx])
        return sample

# Datasets: augmented transform for training, deterministic transform for val/test.
train_dataset = ImageDataset(train_df_split, img_col=0, label_col='label_id', transform=transform_train)
val_dataset = ImageDataset(val_df, img_col=0, label_col='label_id', transform=transform_val)
test_dataset = ImageDataset(test_df, img_col=0, label_col=None, transform=transform_val)

# Only the training loader shuffles; val/test keep row order so predictions line up with test_df.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

num_labels = len(labels_unique)
# The pretrained checkpoint carries a 1000-class classification head. Without
# ignore_mismatched_sizes, asking for a different num_labels fails with a size-mismatch
# error while loading; with it, the head is re-initialised at the requested size.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
model.to(device)

# torch.optim.AdamW is used instead of the AdamW exported by transformers, which is
# deprecated upstream; the AdamW name imported from transformers at the top of this cell is
# no longer needed here. eps / weight_decay are pinned to what that class is documented to
# default to (1e-6 / 0.0) so the update rule stays as close as possible — TODO confirm
# against the installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * EPOCHS
# Linear warm-up over the first 6% of steps, then linear decay to zero.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.06 * total_steps), num_training_steps=total_steps)

# Start below any reachable accuracy so the first epoch always writes a checkpoint;
# starting at 0.0 with a strict ">" saved nothing when accuracy stayed at 0, and the
# reload from OUTPUT_DIR below then failed.
best_val_acc = -1.0
os.makedirs(OUTPUT_DIR, exist_ok=True)

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    for batch in train_loader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch
        optimizer.zero_grad()

    # ---- validation pass ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total if total > 0 else 0.0
    print(f"epoch {epoch + 1}/{EPOCHS}  val_acc={val_acc:.4f}")

    # Keep only the weights of the best-scoring epoch on disk.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        model.save_pretrained(OUTPUT_DIR)

# Reload the best checkpoint (not necessarily the last epoch) for inference.
model = AutoModelForImageClassification.from_pretrained(OUTPUT_DIR)
model.to(device)
model.eval()

all_preds = []
with torch.no_grad():
    for batch in test_loader:
        pixel_values = batch["pixel_values"].to(device)
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
        all_preds.extend(preds)

# Map integer ids back to the original label values and write them next to the test rows.
pred_labels = [id2label[p] for p in all_preds]
out_df = test_df.copy()
out_df['label'] = pred_labels
out_df.to_csv("test_predictions.csv", index=False)


Embedding extraction

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel
from tqdm.auto import tqdm

# Settings for batched embedding extraction.
INPUT_CSV = "images.csv"            # table listing the images to embed
PATH_COL = "path"                   # column of INPUT_CSV holding the image paths
OUT_CSV = "image_embeddings.csv"
OUT_NPZ = "image_embeddings.npz"
MODEL_NAME = "facebook/dinov2-base"
BATCH_SIZE = 16
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(INPUT_CSV)
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

embs = []
paths = []
with torch.no_grad():
    for i in tqdm(range(0, len(df), BATCH_SIZE)):
        batch_paths = df[PATH_COL].iloc[i : i + BATCH_SIZE].tolist()
        images = [Image.open(p).convert("RGB") for p in batch_paths]
        inputs = processor(images=images, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        outputs = model(**inputs)
        # Prefer the model's pooled output; otherwise fall back to the first token of the
        # last hidden state (or the token mean if that axis is empty).
        if getattr(outputs, "pooler_output", None) is not None:
            batch_emb = outputs.pooler_output
        else:
            last = outputs.last_hidden_state
            if last is None:
                raise ValueError("No embeddings in model output")
            if last.shape[1] >= 1:
                batch_emb = last[:, 0, :]
            else:
                batch_emb = last.mean(dim=1)
        embs.append(batch_emb.cpu().numpy().astype(np.float32))
        paths.extend(batch_paths)

# np.vstack raises on an empty list, so an empty input CSV gets an empty matrix instead.
emb_all = np.vstack(embs) if len(embs) > 0 else np.zeros((0, 0), dtype=np.float32)
emb_dim = emb_all.shape[1]
emb_cols = [f"emb_{j}" for j in range(emb_dim)]
cols = ["path"] + emb_cols
# Build the frame from the float matrix and insert the path column afterwards. A single
# ndarray holds one dtype, so concatenating the path strings with the floats (as before)
# turned every embedding value into text.
out_df = pd.DataFrame(emb_all, columns=emb_cols)
out_df.insert(0, "path", paths)
out_df.to_csv(OUT_CSV, index=False)
np.savez_compressed(OUT_NPZ, paths=np.array(paths), embeddings=emb_all)


In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel
from tqdm.auto import tqdm

# Settings for one-image-at-a-time embedding extraction.
INPUT_CSV = "images.csv"
PATH_COL = "path"
OUT_CSV = "image_embeddings_single.csv"
OUT_NPZ = "image_embeddings_single.npz"
MODEL_NAME = "facebook/dinov2-base"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(INPUT_CSV)
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

paths = []
embs = []
with torch.no_grad():
    for image_path in tqdm(df[PATH_COL].tolist(), desc="images"):
        rgb_image = Image.open(image_path).convert("RGB")
        encoded = processor(images=rgb_image, return_tensors="pt")
        encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
        outputs = model(**encoded)

        # Pooled output when the model provides one; otherwise the first token of the
        # last hidden state, or the token mean if that axis is empty.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is not None:
            vector = pooled[0]
        else:
            hidden = outputs.last_hidden_state
            if hidden.shape[1] >= 1:
                vector = hidden[0, 0, :]
            else:
                vector = hidden.mean(dim=1)[0]

        paths.append(image_path)
        embs.append(vector.cpu().numpy().astype(np.float32))

# Stack the per-image vectors; an empty run yields an empty (0, 0) matrix.
if len(embs) > 0:
    emb_all = np.vstack(embs)
else:
    emb_all = np.zeros((0, 0), dtype=np.float32)
emb_dim = emb_all.shape[1] if emb_all.ndim == 2 else 0
cols = [f"emb_{i}" for i in range(emb_dim)]

# Embedding columns first, then the path column inserted at the front.
out_df = pd.DataFrame(emb_all, columns=cols)
out_df.insert(0, PATH_COL, paths)
out_df.to_csv(OUT_CSV, index=False)
np.savez_compressed(OUT_NPZ, paths=np.array(paths), embeddings=emb_all)
