## Dataset building

In [6]:
import os

print("Processed images exists:", os.path.exists("../data/processed_images"))
print("Number of files in processed_images:",
      len(os.listdir("../data/processed_images")))
print(os.listdir("../data/processed_images")[:5])


Processed images exists: True
Number of files in processed_images: 99
['AN_10_G3_back_0.jpg', 'AN_10_G3_neck_1.jpg', 'AN_11_G4_back_0.jpg', 'AN_11_G4_neck_1.jpg', 'AN_12_G2_back_0.jpg']


In [7]:
import os

os.makedirs("../data/cropped_images", exist_ok=True)

In [8]:
import cv2
import os
from tqdm import tqdm

# Load Haarcascade face detector
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 
                                     "haarcascade_frontalface_default.xml")

INPUT_DIR = "../data/processed_images"
OUTPUT_DIR = "../data/cropped_images"

def crop_neck_region(image_path, save_path):
    img = cv2.imread(image_path)
    if img is None:
        return False

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)

    h, w = img.shape[:2]

    if len(faces) == 0:
        # fallback: bottom-center crop
        y1 = int(h * 0.45)
        y2 = h
        x1 = int(w * 0.15)
        x2 = int(w * 0.85)
        neck_crop = img[y1:y2, x1:x2]
    else:
        (x, y, w_box, h_box) = faces[0]

        chin_y = y + int(h_box * 0.9)

        y1 = max(chin_y, 0)
        y2 = min(chin_y + int(h * 0.35), h)

        x1 = max(x - int(w_box * 0.3), 0)
        x2 = min(x + w_box + int(w_box * 0.3), w)

        neck_crop = img[y1:y2, x1:x2]

    if neck_crop.size == 0:
        return False

    cv2.imwrite(save_path, neck_crop)
    return True


In [9]:
import glob

image_paths = glob.glob(os.path.join(INPUT_DIR, "*"))

success = 0
failed = 0

for img_path in tqdm(image_paths, desc="Cropping images"):
    filename = os.path.basename(img_path)
    save_path = os.path.join(OUTPUT_DIR, filename)

    if crop_neck_region(img_path, save_path):
        success += 1
    else:
        failed += 1

print(f"Cropping done ‚úÖ")
print(f"Successful: {success}")
print(f"Failed: {failed}")


Cropping images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 99/99 [00:02<00:00, 40.70it/s]

Cropping done ‚úÖ
Successful: 99
Failed: 0





In [10]:
print("Number of cropped images:",
      len(os.listdir("../data/cropped_images")))
print(os.listdir("../data/cropped_images")[:5])


Number of cropped images: 99
['AN_10_G3_back_0.jpg', 'AN_10_G3_neck_1.jpg', 'AN_11_G4_back_0.jpg', 'AN_11_G4_neck_1.jpg', 'AN_12_G2_back_0.jpg']


## Canonical dataset

In [11]:
import os
import pandas as pd
from tqdm import tqdm

CROPPED_DIR = "../data/cropped_images"

rows = []

for filename in tqdm(os.listdir(CROPPED_DIR), desc="Building dataset"):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue

    # Expected filename pattern:
    # AN_10_G3_back_0.jpg
    parts = filename.split("_")

    try:
        patient_id = int(parts[1])
        grade = int(parts[2].replace("G", ""))
        view = parts[3].lower() if len(parts) > 3 else "unknown"
    except Exception as e:
        print(f"Skipping malformed filename: {filename} | {e}")
        continue

    rows.append({
        "image_path": f"data/cropped_images/{filename}",
        "patient_id": patient_id,
        "grade": grade,
        "view": view
    })

assert len(rows) > 0, "‚ùå No images found ‚Äî something is wrong"

df = pd.DataFrame(rows)
df = df.sort_values(by=["patient_id", "view"]).reset_index(drop=True)

df.to_csv("../data/dataset.csv", index=False)

print("‚úÖ Phase 0 complete")
print("Total images:", len(df))
df.head()


Building dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 99/99 [00:00<?, ?it/s]

‚úÖ Phase 0 complete
Total images: 99





Unnamed: 0,image_path,patient_id,grade,view
0,data/cropped_images/AN_1_G4_back_0.jpg,1,4,back
1,data/cropped_images/AN_1_G4_neck_1.jpg,1,4,neck
2,data/cropped_images/AN_1_G4_neck_2.jpg,1,4,neck
3,data/cropped_images/AN_1_G4_neck_3.jpg,1,4,neck
4,data/cropped_images/AN_2_G3_back_0.jpg,2,3,back


In [12]:
print("Images per grade:")
print(df["grade"].value_counts())

print("\nUnique patients:", df["patient_id"].nunique())

print("\nImages per patient (first 10):")
print(df.groupby("patient_id").size().head(10))

print("\nViews:")
print(df["view"].value_counts())


Images per grade:
grade
4    39
3    36
2    12
1    12
Name: count, dtype: int64

Unique patients: 35

Images per patient (first 10):
patient_id
1     4
2     2
3     2
4     2
5     1
6     2
7     2
8     2
9     2
10    2
dtype: int64

Views:
view
neck    64
back    35
Name: count, dtype: int64


## Patient-Wise cross validation

In [13]:
from sklearn.model_selection import GroupKFold
import json

df = pd.read_csv("../data/dataset.csv")

N_FOLDS = 5
gkf = GroupKFold(n_splits=N_FOLDS)

folds = {}

for fold, (train_idx, test_idx) in enumerate(
        gkf.split(df, groups=df["patient_id"])):

    folds[f"fold_{fold}"] = {
        "train_idx": train_idx.tolist(),
        "test_idx": test_idx.tolist()
    }

# Save splits
os.makedirs("../splits", exist_ok=True)
with open("../splits/cv_folds.json", "w") as f:
    json.dump(folds, f, indent=2)

print("‚úÖ Phase 1 complete")
print(f"Saved {N_FOLDS} patient-wise folds")


‚úÖ Phase 1 complete
Saved 5 patient-wise folds


In [14]:
for fold_name, split in folds.items():
    train_patients = set(df.iloc[split["train_idx"]]["patient_id"])
    test_patients  = set(df.iloc[split["test_idx"]]["patient_id"])

    overlap = train_patients & test_patients
    print(f"{fold_name} overlap:", overlap)


fold_0 overlap: set()
fold_1 overlap: set()
fold_2 overlap: set()
fold_3 overlap: set()
fold_4 overlap: set()


## Baseline Model

In [15]:
import os
os.makedirs("../models/effnet_b0", exist_ok=True)


In [16]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import efficientnet_b0
import pandas as pd
import numpy as np
import json
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from PIL import Image
from tqdm import tqdm


In [17]:
class ANDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img = Image.open(row["image_path"]).convert("RGB")
        label = int(row["grade"]) - 1  # 1‚Äì4 ‚Üí 0‚Äì3

        if self.transform:
            img = self.transform(img)

        return img, label


In [18]:
train_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(0.3, 0.3, 0.3),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],
                         [0.229,0.224,0.225])
])

test_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],
                         [0.229,0.224,0.225])
])


In [19]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, correct = 0, 0

    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * imgs.size(0)
        correct += (out.argmax(1) == labels).sum().item()

    return total_loss / len(loader.dataset), correct / len(loader.dataset)


def eval_model(model, loader, device):
    model.eval()
    preds, gts = [], []

    with torch.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(device)
            out = model(imgs)

            preds.extend(out.argmax(1).cpu().numpy())
            gts.extend(labels.numpy())

    return accuracy_score(gts, preds)


In [20]:
df = pd.read_csv("../data/dataset.csv")

with open("../splits/cv_folds.json") as f:
    folds = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
""""
import pandas as pd

df = pd.read_csv("../data/dataset.csv")

# Fix image paths (add ../)
df["image_path"] = df["image_path"].apply(
    lambda x: "../" + x if not x.startswith("../") else x
)

df.to_csv("../data/dataset.csv", index=False)

print("‚úÖ Fixed image paths in dataset.csv")
print(df["image_path"].head()) 
"""

‚úÖ Fixed image paths in dataset.csv
0    ../data/cropped_images/AN_1_G4_back_0.jpg
1    ../data/cropped_images/AN_1_G4_neck_1.jpg
2    ../data/cropped_images/AN_1_G4_neck_2.jpg
3    ../data/cropped_images/AN_1_G4_neck_3.jpg
4    ../data/cropped_images/AN_2_G3_back_0.jpg
Name: image_path, dtype: object


In [23]:
fold_accuracies = []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDataset(train_df, train_tfms)
    test_ds  = ANDataset(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=8, shuffle=False)

    # Model
    model = efficientnet_b0(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 4)
    model.to(device)

    # Class weights (per fold!)
    y = train_df["grade"].values - 1
    weights = compute_class_weight("balanced", classes=np.array([0,1,2,3]), y=y)
    weights = torch.tensor(weights, dtype=torch.float).to(device)

    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Train
    for epoch in range(12):
        train_loss, train_acc = train_one_epoch(
            model, train_loader, optimizer, criterion, device
        )

    # Evaluate
    test_acc = eval_model(model, test_loader, device)
    fold_accuracies.append(test_acc)

    print(f"{fold_name} accuracy: {test_acc:.4f}")

    torch.save(
        model.state_dict(),
        f"../models/effnet_b0/{fold_name}.pth"
    )



===== FOLD_0 =====
fold_0 accuracy: 0.2500

===== FOLD_1 =====
fold_1 accuracy: 0.4000

===== FOLD_2 =====
fold_2 accuracy: 0.2500

===== FOLD_3 =====
fold_3 accuracy: 0.3500

===== FOLD_4 =====
fold_4 accuracy: 0.5789


In [24]:
print("\n===== FINAL CV RESULT =====")
print("Fold accuracies:", fold_accuracies)
print(f"Mean accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Std deviation: {np.std(fold_accuracies):.4f}")


===== FINAL CV RESULT =====
Fold accuracies: [0.25, 0.4, 0.25, 0.35, 0.5789473684210527]
Mean accuracy: 0.3658
Std deviation: 0.1214


We evaluated the baseline EfficientNet-B0 model using 5-fold patient-wise cross-validation, ensuring that images from the same patient never appear in both training and testing sets.

Interpretation

The variability across folds is expected due to:

-Small dataset (35 patients)

-Uneven grade distribution

-High inter-patient variability in skin tone, lighting, and image quality

The higher accuracy in Fold 4 indicates that the model can perform well when the test patients are visually distinct.

This mean accuracy represents a true, unbiased baseline, unlike a single train/test split which can be misleading on small medical datasets.

This result establishes a reliable reference point against which all future models will be compared.


# Model 2: EfficientNet-B2

In [25]:
import os
os.makedirs("../models/effnet_b2", exist_ok=True)

In [26]:
from torchvision.models import efficientnet_b2


In [27]:
train_tfms = transforms.Compose([
    transforms.Resize((260, 260)),   # B2 default ~260
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(0.3, 0.3, 0.3),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],
                         [0.229,0.224,0.225])
])

test_tfms = transforms.Compose([
    transforms.Resize((260, 260)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],
                         [0.229,0.224,0.225])
])


In [28]:
from sklearn.utils.class_weight import compute_class_weight

fold_accuracies_b2 = []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} (EffNet-B2) =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDataset(train_df, train_tfms)
    test_ds  = ANDataset(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=6, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=6, shuffle=False)

    # üî• Model: EfficientNet-B2
    model = efficientnet_b2(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 4)
    model.to(device)

    # Class weights (per fold)
    y = train_df["grade"].values - 1
    weights = compute_class_weight(
        class_weight="balanced",
        classes=np.array([0,1,2,3]),
        y=y
    )
    weights = torch.tensor(weights, dtype=torch.float).to(device)

    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Train
    for epoch in range(14):  # slightly more than B0
        train_loss, train_acc = train_one_epoch(
            model, train_loader, optimizer, criterion, device
        )

    # Evaluate
    test_acc = eval_model(model, test_loader, device)
    fold_accuracies_b2.append(test_acc)

    print(f"{fold_name} accuracy: {test_acc:.4f}")

    torch.save(
        model.state_dict(),
        f"../models/effnet_b2/{fold_name}.pth"
    )



===== FOLD_0 (EffNet-B2) =====
Downloading: "https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth" to C:\Users\Pranathi/.cache\torch\hub\checkpoints\efficientnet_b2_rwightman-c35c1473.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35.2M/35.2M [00:15<00:00, 2.33MB/s]


fold_0 accuracy: 0.2500

===== FOLD_1 (EffNet-B2) =====
fold_1 accuracy: 0.4500

===== FOLD_2 (EffNet-B2) =====
fold_2 accuracy: 0.2500

===== FOLD_3 (EffNet-B2) =====
fold_3 accuracy: 0.3500

===== FOLD_4 (EffNet-B2) =====
fold_4 accuracy: 0.5263


In [29]:
print("\n===== EFFICIENTNET-B2 CV RESULT =====")
print("Fold accuracies:", fold_accuracies_b2)
print(f"Mean accuracy: {np.mean(fold_accuracies_b2):.4f}")
print(f"Std deviation: {np.std(fold_accuracies_b2):.4f}")



===== EFFICIENTNET-B2 CV RESULT =====
Fold accuracies: [0.25, 0.45, 0.25, 0.35, 0.5263157894736842]
Mean accuracy: 0.3653
Std deviation: 0.1095


# Model-3 Ordinal Regression with EfficientNet-B0

In [31]:
class ANDatasetReg(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(row["image_path"]).convert("RGB")
        y = float(row["grade"])  # keep 1‚Äì4

        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(y, dtype=torch.float32)


In [32]:
model = efficientnet_b0(weights="IMAGENET1K_V1")
model.classifier[1] = nn.Linear(model.classifier[1].in_features, 1)
model.to(device)


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [33]:
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [34]:
def train_one_epoch_reg(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for imgs, y in loader:
        imgs = imgs.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(imgs).squeeze(1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * imgs.size(0)

    return total_loss / len(loader.dataset)


def eval_reg(model, loader, device):
    model.eval()
    preds_all, y_all = [], []

    with torch.no_grad():
        for imgs, y in loader:
            imgs = imgs.to(device)
            preds = model(imgs).squeeze(1)

            preds_all.extend(preds.cpu().numpy())
            y_all.extend(y.numpy())

    preds_all = np.array(preds_all)
    y_all = np.array(y_all)

    mae = np.mean(np.abs(preds_all - y_all))

    # Off-by-1 accuracy (round to nearest grade)
    rounded = np.clip(np.round(preds_all), 1, 4)
    off_by_1 = np.mean(np.abs(rounded - y_all) <= 1)

    return mae, off_by_1


In [35]:
maes = []
off1s = []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} (REGRESSION) =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDatasetReg(train_df, train_tfms)
    test_ds  = ANDatasetReg(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=8, shuffle=False)

    model = efficientnet_b0(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 1)
    model.to(device)

    criterion = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(12):
        train_loss = train_one_epoch_reg(
            model, train_loader, optimizer, criterion, device
        )

    mae, off1 = eval_reg(model, test_loader, device)
    maes.append(mae)
    off1s.append(off1)

    print(f"{fold_name} MAE: {mae:.3f} | Off-by-1 Acc: {off1:.3f}")



===== FOLD_0 (REGRESSION) =====
fold_0 MAE: 0.663 | Off-by-1 Acc: 0.900

===== FOLD_1 (REGRESSION) =====
fold_1 MAE: 0.971 | Off-by-1 Acc: 0.800

===== FOLD_2 (REGRESSION) =====
fold_2 MAE: 0.985 | Off-by-1 Acc: 0.750

===== FOLD_3 (REGRESSION) =====
fold_3 MAE: 1.082 | Off-by-1 Acc: 0.750

===== FOLD_4 (REGRESSION) =====
fold_4 MAE: 1.125 | Off-by-1 Acc: 0.684


In [36]:
print("\n===== REGRESSION CV RESULT =====")
print("MAE per fold:", maes)
print(f"Mean MAE: {np.mean(maes):.3f} ¬± {np.std(maes):.3f}")

print("Off-by-1 Acc per fold:", off1s)
print(f"Mean Off-by-1 Acc: {np.mean(off1s):.3f}")



===== REGRESSION CV RESULT =====
MAE per fold: [np.float32(0.6629048), np.float32(0.9706052), np.float32(0.9848345), np.float32(1.0822408), np.float32(1.1249658)]
Mean MAE: 0.965 ¬± 0.162
Off-by-1 Acc per fold: [np.float64(0.9), np.float64(0.8), np.float64(0.75), np.float64(0.75), np.float64(0.6842105263157895)]
Mean Off-by-1 Acc: 0.777


The regression model demonstrates that although exact grade prediction remains challenging, the model consistently predicts AN severity within one grade of the clinician-assigned label in a majority of cases.

This is far more realistic than claiming exact accuracy.

# Patient-level multi view fusion on regression

In [37]:
def eval_reg_patient_level(model, loader, df_subset, device):
    """
    df_subset: dataframe corresponding exactly to loader.dataset
    """
    model.eval()

    preds = []
    gts = []
    patient_ids = []

    with torch.no_grad():
        for i, (imgs, y) in enumerate(loader):
            imgs = imgs.to(device)
            out = model(imgs).squeeze(1)

            batch_size = imgs.size(0)
            start = i * batch_size
            end = start + batch_size

            preds.extend(out.cpu().numpy())
            gts.extend(y.numpy())
            patient_ids.extend(
                df_subset.iloc[start:end]["patient_id"].values
            )

    return np.array(preds), np.array(gts), np.array(patient_ids)


In [38]:
def patient_level_metrics(preds, gts, patient_ids):
    """
    preds: continuous predictions (float)
    gts: true grades (1‚Äì4)
    patient_ids: patient identifiers
    """

    patient_preds = {}
    patient_gts = {}

    for p, y, pid in zip(preds, gts, patient_ids):
        if pid not in patient_preds:
            patient_preds[pid] = []
            patient_gts[pid] = y
        patient_preds[pid].append(p)

    fused_preds = []
    fused_gts = []

    for pid in patient_preds:
        fused_preds.append(np.mean(patient_preds[pid]))
        fused_gts.append(patient_gts[pid])

    fused_preds = np.array(fused_preds)
    fused_gts = np.array(fused_gts)

    # Metrics
    mae = np.mean(np.abs(fused_preds - fused_gts))
    rounded = np.clip(np.round(fused_preds), 1, 4)
    off_by_1 = np.mean(np.abs(rounded - fused_gts) <= 1)

    return mae, off_by_1


In [39]:
maes_fused = []
off1s_fused = []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} (PATIENT-LEVEL REGRESSION) =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDatasetReg(train_df, train_tfms)
    test_ds  = ANDatasetReg(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=8, shuffle=False)

    model = efficientnet_b0(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 1)
    model.to(device)

    criterion = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(12):
        train_one_epoch_reg(
            model, train_loader, optimizer, criterion, device
        )

    # üî• patient-level evaluation
    preds, gts, pids = eval_reg_patient_level(
        model, test_loader, test_df, device
    )

    mae, off1 = patient_level_metrics(preds, gts, pids)
    maes_fused.append(mae)
    off1s_fused.append(off1)

    print(f"{fold_name} MAE: {mae:.3f} | Off-by-1 Acc: {off1:.3f}")



===== FOLD_0 (PATIENT-LEVEL REGRESSION) =====
fold_0 MAE: 0.936 | Off-by-1 Acc: 0.833

===== FOLD_1 (PATIENT-LEVEL REGRESSION) =====
fold_1 MAE: 0.650 | Off-by-1 Acc: 1.000

===== FOLD_2 (PATIENT-LEVEL REGRESSION) =====
fold_2 MAE: 0.777 | Off-by-1 Acc: 1.000

===== FOLD_3 (PATIENT-LEVEL REGRESSION) =====
fold_3 MAE: 0.768 | Off-by-1 Acc: 1.000

===== FOLD_4 (PATIENT-LEVEL REGRESSION) =====
fold_4 MAE: 0.919 | Off-by-1 Acc: 0.857


In [40]:
print("\n===== PATIENT-LEVEL REGRESSION CV RESULT =====")
print("MAE per fold:", maes_fused)
print(f"Mean MAE: {np.mean(maes_fused):.3f} ¬± {np.std(maes_fused):.3f}")

print("Off-by-1 Acc per fold:", off1s_fused)
print(f"Mean Off-by-1 Acc: {np.mean(off1s_fused):.3f}")



===== PATIENT-LEVEL REGRESSION CV RESULT =====
MAE per fold: [np.float32(0.93570423), np.float32(0.65042776), np.float32(0.7766094), np.float32(0.7679024), np.float32(0.9185685)]
Mean MAE: 0.810 ¬± 0.106
Off-by-1 Acc per fold: [np.float64(0.8333333333333334), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(0.8571428571428571)]
Mean Off-by-1 Acc: 0.938


Treating Acanthosis Nigricans grading as an ordinal regression problem and aggregating predictions across multiple patient views substantially improves grading reliability and clinical relevance compared to standard image-level classification.

# Quadratic Weighted Kappa(QWK)

In [41]:
from sklearn.metrics import cohen_kappa_score


In [42]:
def patient_level_metrics_with_qwk(preds, gts, patient_ids):
    patient_preds = {}
    patient_gts = {}

    for p, y, pid in zip(preds, gts, patient_ids):
        if pid not in patient_preds:
            patient_preds[pid] = []
            patient_gts[pid] = y
        patient_preds[pid].append(p)

    fused_preds = []
    fused_gts = []

    for pid in patient_preds:
        fused_preds.append(np.mean(patient_preds[pid]))
        fused_gts.append(patient_gts[pid])

    fused_preds = np.array(fused_preds)
    fused_gts = np.array(fused_gts)

    rounded_preds = np.clip(np.round(fused_preds), 1, 4)

    mae = np.mean(np.abs(fused_preds - fused_gts))
    off_by_1 = np.mean(np.abs(rounded_preds - fused_gts) <= 1)

    qwk = cohen_kappa_score(
        fused_gts.astype(int),
        rounded_preds.astype(int),
        weights="quadratic"
    )

    return mae, off_by_1, qwk, fused_preds, fused_gts


In [43]:
maes, off1s, qwks = [], [], []

all_preds = []
all_gts = []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} (PATIENT REG + QWK) =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDatasetReg(train_df, train_tfms)
    test_ds  = ANDatasetReg(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=8, shuffle=False)

    model = efficientnet_b0(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 1)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.SmoothL1Loss()

    for epoch in range(12):
        train_one_epoch_reg(model, train_loader, optimizer, criterion, device)

    preds, gts, pids = eval_reg_patient_level(
        model, test_loader, test_df, device
    )

    mae, off1, qwk, fused_preds, fused_gts = patient_level_metrics_with_qwk(
        preds, gts, pids
    )

    maes.append(mae)
    off1s.append(off1)
    qwks.append(qwk)

    all_preds.extend(fused_preds)
    all_gts.extend(fused_gts)

    print(f"MAE: {mae:.3f} | Off-by-1: {off1:.3f} | QWK: {qwk:.3f}")



===== FOLD_0 (PATIENT REG + QWK) =====
MAE: 1.132 | Off-by-1: 0.667 | QWK: 0.108

===== FOLD_1 (PATIENT REG + QWK) =====
MAE: 0.624 | Off-by-1: 1.000 | QWK: 0.526

===== FOLD_2 (PATIENT REG + QWK) =====
MAE: 0.567 | Off-by-1: 1.000 | QWK: 0.640

===== FOLD_3 (PATIENT REG + QWK) =====
MAE: 0.713 | Off-by-1: 1.000 | QWK: 0.556

===== FOLD_4 (PATIENT REG + QWK) =====
MAE: 0.959 | Off-by-1: 0.714 | QWK: 0.189


# 4th Models

In [1]:
import os
import json
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as transforms
from torchvision.models import efficientnet_b0

from sklearn.metrics import cohen_kappa_score


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
df = pd.read_csv("../data/dataset.csv")

with open("../splits/cv_folds.json", "r") as f:
    folds = json.load(f)

df.head()


Using device: cpu


Unnamed: 0,image_path,patient_id,grade,view
0,../data/cropped_images/AN_1_G4_back_0.jpg,1,4,back
1,../data/cropped_images/AN_1_G4_neck_1.jpg,1,4,neck
2,../data/cropped_images/AN_1_G4_neck_2.jpg,1,4,neck
3,../data/cropped_images/AN_1_G4_neck_3.jpg,1,4,neck
4,../data/cropped_images/AN_2_G3_back_0.jpg,2,3,back


In [None]:
def rgb_to_lab_l(img_pil):
    img = np.array(img_pil)
    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
    L = lab[:, :, 0]          # shape (H, W)
    return Image.fromarray(L, mode="L")


class ANDatasetRegLAB(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img = Image.open(row["image_path"]).convert("RGB")
        img = rgb_to_lab_l(img)   # üî• key change

        y = float(row["grade"])  # keep 1‚Äì4

        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(y, dtype=torch.float32)




In [4]:
train_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5],
                         [0.5, 0.5, 0.5])
])

test_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5],
                         [0.5, 0.5, 0.5])
])


In [5]:
def ordinal_loss(logits, labels):
    labels = labels.long() - 1  # 1‚Äì4 ‚Üí 0‚Äì3
    targets = torch.zeros((labels.size(0), 3), device=labels.device)

    for i in range(3):
        targets[:, i] = (labels > i).float()

    return nn.BCEWithLogitsLoss()(logits, targets)


In [6]:
def train_one_epoch_ordinal(model, loader, optimizer):
    model.train()
    total_loss = 0

    for imgs, y in loader:
        imgs = imgs.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        loss = ordinal_loss(logits, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * imgs.size(0)

    return total_loss / len(loader.dataset)


In [7]:
def eval_ordinal_patient_level(model, loader, df_subset):
    model.eval()
    preds, gts, pids = [], [], []

    with torch.no_grad():
        for i, (imgs, y) in enumerate(loader):
            imgs = imgs.to(device)
            logits = model(imgs)

            probs = torch.sigmoid(logits)
            pred = torch.sum(probs > 0.5, dim=1) + 1

            batch_size = imgs.size(0)
            start = i * batch_size
            end = start + batch_size

            preds.extend(pred.cpu().numpy())
            gts.extend(y.numpy())
            pids.extend(df_subset.iloc[start:end]["patient_id"].values)

    return np.array(preds), np.array(gts), np.array(pids)


In [8]:
def patient_level_metrics_with_qwk(preds, gts, patient_ids):
    patient_preds = {}
    patient_gts = {}

    for p, y, pid in zip(preds, gts, patient_ids):
        if pid not in patient_preds:
            patient_preds[pid] = []
            patient_gts[pid] = y
        patient_preds[pid].append(p)

    fused_preds = []
    fused_gts = []

    for pid in patient_preds:
        fused_preds.append(np.mean(patient_preds[pid]))
        fused_gts.append(patient_gts[pid])

    fused_preds = np.array(fused_preds)
    fused_gts = np.array(fused_gts)

    mae = np.mean(np.abs(fused_preds - fused_gts))
    off_by_1 = np.mean(np.abs(np.round(fused_preds) - fused_gts) <= 1)

    qwk = cohen_kappa_score(
        fused_gts.astype(int),
        np.clip(np.round(fused_preds), 1, 4).astype(int),
        weights="quadratic"
    )

    return mae, off_by_1, qwk


In [9]:
maes, off1s, qwks = [], [], []

for fold_name, split in folds.items():
    print(f"\n===== {fold_name.upper()} (ORDINAL + LAB-L + PATIENT FUSION) =====")

    train_df = df.iloc[split["train_idx"]]
    test_df  = df.iloc[split["test_idx"]]

    train_ds = ANDatasetRegLAB(train_df, train_tfms)
    test_ds  = ANDatasetRegLAB(test_df, test_tfms)

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=8, shuffle=False)

    model = efficientnet_b0(weights="IMAGENET1K_V1")
    model.classifier[1] = nn.Linear(1280, 3)  # ordinal head
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(15):
        loss = train_one_epoch_ordinal(model, train_loader, optimizer)

    preds, gts, pids = eval_ordinal_patient_level(model, test_loader, test_df)

    mae, off1, qwk = patient_level_metrics_with_qwk(preds, gts, pids)

    maes.append(mae)
    off1s.append(off1)
    qwks.append(qwk)

    print(f"MAE: {mae:.3f} | Off-by-1: {off1:.3f} | QWK: {qwk:.3f}")



===== FOLD_0 (ORDINAL + LAB-L + PATIENT FUSION) =====


TypeError: Cannot handle this data type: (1, 1, 1), |u1

In [None]:
print("\n===== FINAL PATIENT-LEVEL RESULTS =====")
print(f"Mean MAE: {np.mean(maes):.3f} ¬± {np.std(maes):.3f}")
print(f"Mean Off-by-1 Accuracy: {np.mean(off1s):.3f}")
print(f"Mean QWK: {np.mean(qwks):.3f} ¬± {np.std(qwks):.3f}")
