In [5]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure, transform
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# ===== 根目录 =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN 特征提取器 =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # 去掉最后分类层
resnet = resnet.to(device)
resnet.eval()

# 图像预处理
transform_cnn = T.Compose([
    T.ToPILImage(),
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(rgb_crop):
    """输入RGB图像 -> 提取CNN特征"""
    x = transform_cnn(rgb_crop).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = resnet(x).cpu().numpy().flatten()
    return feat

# ===== 读取与分割 =====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    slide = openslide.OpenSlide(svs_path)
    W0, H0 = slide.dimensions
    scale = max(W0, H0) / float(min_dim)
    new_w, new_h = int(W0/scale), int(H0/scale)
    img = slide.get_thumbnail((new_w, new_h))
    mpp_x = slide.properties.get("openslide.mpp-x", None)
    mpp_y = slide.properties.get("openslide.mpp-y", None)
    slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    hsv = color.rgb2hsv(rgb)
    v = hsv[..., 2]
    thr = filters.threshold_otsu(v)
    mask = v < thr * 0.98
    mask = morphology.remove_small_holes(mask, 256)
    mask = morphology.remove_small_objects(mask, 256)
    return mask

def he_nuclei_mask(rgb, tissue):
    rgbf = np.clip(rgb/255.0, 0, 1)
    hed = color.separate_stains(rgbf, hed_from_rgb)
    H = hed[..., 0]
    H = exposure.rescale_intensity(
        H,
        in_range=(np.percentile(H[tissue], 2), np.percentile(H[tissue], 98))
    )
    thr = filters.threshold_otsu(H[tissue])
    nuc = np.zeros_like(H, dtype=bool)
    nuc[tissue] = H[tissue] > thr
    nuc = morphology.remove_small_objects(nuc, 64)
    nuc = morphology.binary_opening(nuc, morphology.disk(2))
    return nuc, H

# ===== 提取特征（手工 + CNN）=====
def wsi_features(svs_path):
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = (mpp_x is not None) and (mpp_y is not None) and (mpp_x > 0) and (mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None

    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # 手工特征
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb  = int(nuc.sum())
    tumor_frac = tumor_px_thumb / tissue_px_thumb
    h_vals = H[mask]
    h_mean, h_std = float(h_vals.mean()), float(h_vals.std())

    feats = dict(
        tumor_frac=tumor_frac,
        h_mean=h_mean, h_std=h_std
    )

    # CNN 特征（肿瘤外接框裁剪）
    ys, xs = np.where(nuc)
    if len(xs) > 0 and len(ys) > 0:
        xmin, xmax = xs.min(), xs.max()
        ymin, ymax = ys.min(), ys.max()
        crop = rgb[ymin:ymax, xmin:xmax]
        cnn_feat = extract_cnn_feature(crop)
    else:
        cnn_feat = np.zeros(512, dtype=np.float32)  # 兜底

    return feats, cnn_feat

def collect_rows(root):
    rows = []
    for stage in [1,2,3,4]:
        for split in ["train","test"]:
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue

            # === 三分类映射：1->0, 2/3->1, 4->2 ===
            if stage == 1:
                label = 0
            elif stage in (2, 3):
                label = 1
            else:  # stage == 4
                label = 2

            for p in list(d.glob("*.svs")) + list(d.glob("*.tif")):
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                for i, v in enumerate(cnn_feat):
                    row[f"cnn_{i}"] = v
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    print("Class distribution (0=stage1, 1=stage2&3, 2=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_cnn_3class.csv", index=False)

    train_df = df[df.split=="train"]
    test_df  = df[df.split=="test"]

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + ["tumor_frac","h_mean","h_std"]

    Xtr, ytr = train_df[feature_cols].values, train_df["label"].values
    Xte, yte = test_df[feature_cols].values, test_df["label"].values

    clf = RandomForestClassifier(
        n_estimators=400, random_state=0, n_jobs=-1, class_weight="balanced"
    )
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0,1,2]))
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0,1,2],
        target_names=["stage1","stage2&3","stage4"], digits=4))




Feature rows: (133, 518)
Class distribution (0=stage1, 1=stage2&3, 2=stage4):
 label
0    35
1    89
2     9
Name: count, dtype: int64

Confusion matrix:
 [[ 0  7  0]
 [ 0 17  0]
 [ 0  2  0]]

Report:
               precision    recall  f1-score   support

      stage1     0.0000    0.0000    0.0000         7
    stage2&3     0.6538    1.0000    0.7907        17
      stage4     0.0000    0.0000    0.0000         2

    accuracy                         0.6538        26
   macro avg     0.2179    0.3333    0.2636        26
weighted avg     0.4275    0.6538    0.5170        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [8]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== 根目录 =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN 特征提取器 =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # 去掉最后分类层
resnet = resnet.to(device)
resnet.eval()

# 预处理（输入将是 PIL）
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """img 可以是 PIL / numpy.ndarray / torch.Tensor；统一转 PIL→RGB 再变换"""
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)
    elif isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")
    x = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = resnet(x).cpu().numpy().flatten()
    return feat

def extract_tumor_region_with_padding(pil_img, tumor_mask, padding_ratio=0.12):
    """从 tumor_mask 中裁剪病灶区域，加 padding，返回 224x224 的 PIL"""
    ys, xs = np.where(tumor_mask)
    if len(xs) == 0 or len(ys) == 0:
        return pil_img.resize((224, 224))
    xmin, xmax = xs.min(), xs.max()
    ymin, ymax = ys.min(), ys.max()
    h, w = (ymax - ymin), (xmax - xmin)
    pad_x = int(padding_ratio * w)
    pad_y = int(padding_ratio * h)
    xmin = max(xmin - pad_x, 0)
    xmax = min(xmax + pad_x, pil_img.width)
    ymin = max(ymin - pad_y, 0)
    ymax = min(ymax + pad_y, pil_img.height)
    return pil_img.crop((xmin, ymin, xmax, ymax)).resize((224, 224))

# ===== 读取与分割 =====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    slide = openslide.OpenSlide(svs_path)
    W0, H0 = slide.dimensions
    scale = max(W0, H0) / float(min_dim)
    new_w, new_h = int(W0/scale), int(H0/scale)
    img = slide.get_thumbnail((new_w, new_h))
    mpp_x = slide.properties.get("openslide.mpp-x", None)
    mpp_y = slide.properties.get("openslide.mpp-y", None)
    slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    hsv = color.rgb2hsv(rgb)
    v = hsv[..., 2]
    thr = filters.threshold_otsu(v)
    mask = v < thr * 0.98
    mask = morphology.remove_small_holes(mask, 256)
    mask = morphology.remove_small_objects(mask, 256)
    return mask

def he_nuclei_mask(rgb, tissue):
    rgbf = np.clip(rgb/255.0, 0, 1)
    hed = color.separate_stains(rgbf, hed_from_rgb)
    H = hed[..., 0]
    H = exposure.rescale_intensity(
        H, in_range=(np.percentile(H[tissue], 2), np.percentile(H[tissue], 98))
    )
    thr = filters.threshold_otsu(H[tissue])
    nuc = np.zeros_like(H, dtype=bool)
    nuc[tissue] = H[tissue] > thr
    nuc = morphology.remove_small_objects(nuc, 64)
    nuc = morphology.binary_opening(nuc, morphology.disk(2))
    return nuc, H

# ===== 提取特征（手工 + CNN）=====
def wsi_features(svs_path):
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = (mpp_x is not None) and (mpp_y is not None) and (mpp_x > 0) and (mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None

    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # 手工特征
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb  = int(nuc.sum())
    tumor_frac = tumor_px_thumb / max(tissue_px_thumb, 1)
    h_vals = H[mask]
    h_mean, h_std = float(h_vals.mean()), float(h_vals.std())
    feats = dict(tumor_frac=tumor_frac, h_mean=h_mean, h_std=h_std)

    # CNN 特征：最小外接框 + padding 裁剪
    pil_img = Image.fromarray(rgb)
    tumor_crop = extract_tumor_region_with_padding(pil_img, tumor_mask=nuc, padding_ratio=0.12)
    cnn_feat = extract_cnn_feature(tumor_crop)

    return feats, cnn_feat

def collect_rows(root):
    rows = []
    for stage in [1,2,3,4]:
        for split in ["train","test"]:
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue
            # 三分类映射：1->0, 2/3->1, 4->2
            if stage == 1:
                label = 0
            elif stage in (2, 3):
                label = 1
            else:
                label = 2
            for p in list(d.glob("*.svs")) + list(d.glob("*.tif")):
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                for i, v in enumerate(cnn_feat):
                    row[f"cnn_{i}"] = v
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    print("Class distribution (0=stage1, 1=stage2&3, 2=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_cnn_3class_padding_fixed.csv", index=False)

    train_df = df[df.split=="train"]
    test_df  = df[df.split=="test"]

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + ["tumor_frac","h_mean","h_std"]
    Xtr, ytr = train_df[feature_cols].values, train_df["label"].values
    Xte, yte = test_df[feature_cols].values, test_df["label"].values

    clf = RandomForestClassifier(n_estimators=400, random_state=0, n_jobs=-1, class_weight="balanced")
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0,1,2]))
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0,1,2],
        target_names=["stage1","stage2&3","stage4"], digits=4))




Feature rows: (133, 518)
Class distribution (0=stage1, 1=stage2&3, 2=stage4):
 label
0    35
1    89
2     9
Name: count, dtype: int64

Confusion matrix:
 [[ 1  6  0]
 [ 0 17  0]
 [ 0  2  0]]

Report:
               precision    recall  f1-score   support

      stage1     1.0000    0.1429    0.2500         7
    stage2&3     0.6800    1.0000    0.8095        17
      stage4     0.0000    0.0000    0.0000         2

    accuracy                         0.6923        26
   macro avg     0.5600    0.3810    0.3532        26
weighted avg     0.7138    0.6923    0.5966        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from collections import Counter

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== 根目录 =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN 特征提取器 =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()
resnet = resnet.to(device).eval()

transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)
    elif isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")
    x = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = resnet(x).cpu().numpy().flatten()
    return feat  # [512]

# ---------- Top-K 病灶裁剪 + 池化 ----------
def extract_topk_crops(pil_img, nuc_mask, k=3, pad_ratio=0.12, out_size=224):
    lab = measure.label(nuc_mask)
    props = sorted(measure.regionprops(lab), key=lambda p: p.area, reverse=True)
    W, H = pil_img.width, pil_img.height
    crops = []
    for p in props[:k]:
        ymin, xmin, ymax, xmax = p.bbox
        h, w = ymax - ymin, xmax - xmin
        if h <= 0 or w <= 0: continue
        px = int(pad_ratio * w); py = int(pad_ratio * h)
        xmin = max(xmin - px, 0); xmax = min(xmax + px, W)
        ymin = max(ymin - py, 0); ymax = min(ymax + py, H)
        crops.append(pil_img.crop((xmin, ymin, xmax, ymax)).resize((out_size, out_size)))
    if not crops:
        crops = [pil_img.resize((out_size, out_size))]
    return crops

def cnn_features_pooled(crops):
    feats = [extract_cnn_feature(im) for im in crops]  # [k,512]
    F = np.stack(feats, 0)
    return np.concatenate([F.mean(0), F.max(0)], 0)  # [1024]
# ------------------------------------------

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    slide = openslide.OpenSlide(svs_path)
    W0, H0 = slide.dimensions
    scale = max(W0, H0) / float(min_dim)
    new_w, new_h = int(W0/scale), int(H0/scale)
    img = slide.get_thumbnail((new_w, new_h))
    mpp_x = slide.properties.get("openslide.mpp-x", None)
    mpp_y = slide.properties.get("openslide.mpp-y", None)
    slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    hsv = color.rgb2hsv(rgb); v = hsv[..., 2]
    thr = filters.threshold_otsu(v)
    mask = v < thr * 0.98
    mask = morphology.remove_small_holes(mask, 256)
    mask = morphology.remove_small_objects(mask, 256)
    return mask

def he_nuclei_mask(rgb, tissue):
    rgbf = np.clip(rgb/255.0, 0, 1)
    hed = color.separate_stains(rgbf, hed_from_rgb)
    H = hed[..., 0]
    H = exposure.rescale_intensity(
        H, in_range=(np.percentile(H[tissue], 2), np.percentile(H[tissue], 98)))
    thr = filters.threshold_otsu(H[tissue])
    nuc = np.zeros_like(H, dtype=bool)
    nuc[tissue] = H[tissue] > thr
    nuc = morphology.remove_small_objects(nuc, 64)
    nuc = morphology.binary_opening(nuc, morphology.disk(2))
    return nuc, H

def wsi_features(svs_path):
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = (mpp_x is not None) and (mpp_y is not None) and (mpp_x > 0) and (mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None
    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # —— 手工统计
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb  = int(nuc.sum())
    lab = measure.label(nuc)
    props = measure.regionprops(lab)
    areas  = np.array([p.area for p in props], dtype=np.float32) if props else np.array([])
    perims = np.array([p.perimeter for p in props], dtype=np.float32) if props else np.array([])
    cc_count = len(props)
    largest_cc_px = int(areas.max()) if cc_count else 0

    tumor_frac = tumor_px_thumb / max(tissue_px_thumb, 1)
    largest_cc_frac = largest_cc_px / max(tissue_px_thumb, 1)
    small_thresh = 0.001 * tissue_px_thumb
    cc_small = int((areas < small_thresh).sum()) if cc_count else 0
    cc_small_frac = cc_small / max(cc_count, 1)
    frag_ratio = float(perims.sum() / (areas.sum() + 1e-6)) if cc_count else 0.0

    feats = dict(
        tumor_frac=tumor_frac,
        largest_cc_frac=largest_cc_frac,
        cc_count=cc_count,
        cc_small_frac=cc_small_frac,
        frag_ratio=frag_ratio,
    )

    # —— CNN：Top-K 裁剪 + 池化（K=3）
    pil_img = Image.fromarray(rgb)
    crops = extract_topk_crops(pil_img, nuc_mask=nuc, k=3, pad_ratio=0.12)
    cnn_feat = cnn_features_pooled(crops)  # [1024]

    return feats, cnn_feat

def collect_rows(root):
    rows = []
    for stage in [1,2,3,4]:
        for split in ["train","test"]:
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists(): continue
            label = stage - 1        # ★ 变更点：四分类直接用 1→0,2→1,3→2,4→3
            for p in list(d.glob("*.svs")) + list(d.glob("*.tif")):
                res = wsi_features(str(p))
                if res is None: continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split); row.update(feats)
                for i, v in enumerate(cnn_feat):
                    row[f"cnn_{i}"] = v
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    print("Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):\n",   # ★ 变更点
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_topk_4class.csv", index=False)

    train_df = df[df.split=="train"].copy()
    test_df  = df[df.split=="test"].copy()

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac","largest_cc_frac","cc_count","cc_small_frac","frag_ratio"
    ]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # ===== 类不平衡权重：按 4 类计算 =====
    counts = train_df["label"].value_counts()
    n_classes = 4                                                          # ★ 变更点
    class_weight_map = {c: len(train_df) / (n_classes * counts[c]) for c in counts.index}
    # 可选：额外拉高极少类（例如 stage1/4）
    for k, mult in {0:1.4, 3:1.5}.items():                                 # ★ 可调
        if k in class_weight_map: class_weight_map[k] *= mult
    weights = train_df["label"].map(class_weight_map).values

    clf = HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.06, max_iter=600,
        l2_regularization=1.0, min_samples_leaf=20, random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=weights)

    # ===== 预测：按 4 类输出；可对两端类小幅 boost =====
    proba = clf.predict_proba(Xte)                                        # [N,4]
    proba[:, 0] *= 1.10   # stage1 boost（可按需要调整/去掉）
    proba[:, 3] *= 1.15   # stage4 boost
    proba = proba / np.clip(proba.sum(1, keepdims=True), 1e-12, None)
    ypr = proba.argmax(axis=1)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0,1,2,3]))   # ★ 变更点
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0,1,2,3],                                                # ★ 变更点
        target_names=["stage1","stage2","stage3","stage4"], digits=4))


Feature rows: (133, 1032)
Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):
 label
0    35
1    51
2    38
3     9
Name: count, dtype: int64

Confusion matrix:
 [[1 4 0 0]
 [2 2 1 0]
 [0 2 3 0]
 [0 1 0 1]]

Report:
               precision    recall  f1-score   support

      stage1     0.3333    0.2000    0.2500         5
      stage2     0.2222    0.4000    0.2857         5
      stage3     0.7500    0.6000    0.6667         5
      stage4     1.0000    0.5000    0.6667         2

    accuracy                         0.4118        17
   macro avg     0.5764    0.4250    0.4673        17
weighted avg     0.5016    0.4118    0.4321        17

