## Part 1: Load the TITAN encoder, the Phikon encoder and the HEST segmenter offline


## Part 1: Imports


In [None]:
import os, sys

## Part 1: obtaining everything offline

In [None]:
# Start from an empty staging directory so files from an earlier run cannot linger.
!rm -rf /kaggle/working/TITAN_combined
!mkdir -p /kaggle/working/TITAN_combined/titan
# Copy the contents of the `titana/TITAN` input dataset (presumably the TITAN source code) into the staging root.
!cp -r /kaggle/input/titana/TITAN/* /kaggle/working/TITAN_combined/
# Copy the contents of the `titan-weights/titan-model` input dataset into the `titan` sub-directory.
!cp -r /kaggle/input/titan-weights/titan-model/* /kaggle/working/TITAN_combined/titan/
# List the combined directory as a quick sanity check.
!ls /kaggle/working/TITAN_combined/titan

In [None]:
# Install einops from the wheel files shipped in the input dataset; --no-deps tells pip not to install dependencies.
pip install --no-deps /kaggle/input/einops/einops_pkgs/einops_pkgs/*.whl

In [None]:
sys.path.insert(0, "/kaggle/working/TITAN_combined")  # so “import titan” picks up your local code


In [None]:
%%bash
# Diagnostic: recursively search the staged titan package for the definition of
# encode_slide_from_patch_features and print each match with its line number.
grep -R "def encode_slide_from_patch_features" -n /kaggle/working/TITAN_combined/titan

In [None]:
%%bash
# Diagnostic: list every line of modeling_titan.py that starts with "class ", with line numbers.
grep -nE "^class " /kaggle/working/TITAN_combined/titan/modeling_titan.py

In [None]:
sys.path.insert(0, "/kaggle/working/TITAN_combined")  # your offline code

import torch
from titan.configuration_titan import TitanConfig
from titan.modeling_titan import Titan   

In [None]:
# 1) Load the TITAN configuration from the staged local directory.
#    local_files_only=True asks from_pretrained to use local files only.
config = TitanConfig.from_pretrained(
    "/kaggle/working/TITAN_combined/titan",
    local_files_only=True
)


In [None]:
%%bash
# Diagnostic (same check as an earlier cell): list the lines of modeling_titan.py
# that start with "class ", with line numbers.
grep -nE "^class " /kaggle/working/TITAN_combined/titan/modeling_titan.py

In [None]:
%%bash
# Remove any HuggingFace dynamic-module cache for titan
# (presumably so a cached copy cannot shadow the staged code - TODO confirm).
rm -rf ~/.cache/huggingface/modules/transformers_modules/titan

# Uninstall any pip-installed 'titan' package; -y skips the confirmation prompt.
pip uninstall -y titan


In [None]:
# Force Transformers and the HF Hub client to stay offline by setting their
# offline environment flags for this process.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"]   = "1"

In [None]:
sys.path.insert(0, "/kaggle/working/TITAN_combined")
from transformers import PreTrainedTokenizerFast

# Capture the genuine loader only once. If this cell is re-run, `from_pretrained`
# is already our wrapper; rebinding `_orig` to it would make the wrapper call
# itself forever (RecursionError).
if not getattr(PreTrainedTokenizerFast.from_pretrained, "_titan_offline_patch", False):
    _orig = PreTrainedTokenizerFast.from_pretrained


def _local_tok(repo_id, *args, **kw):
    """Load a fast tokenizer, redirecting the TITAN hub id to the local copy.

    Args:
        repo_id: Repository id or path the caller asked for.
        *args: Extra positional arguments forwarded to the original loader.
        **kw: Extra keyword arguments forwarded to the original loader.

    Returns:
        Whatever the original ``PreTrainedTokenizerFast.from_pretrained`` returns.
    """
    if repo_id == "MahmoodLab/TITAN":
        # Set the flag inside the kwargs instead of passing it as a second
        # keyword: a caller that already supplies `local_files_only` would
        # otherwise trigger a "multiple values for keyword argument" TypeError.
        kw["local_files_only"] = True
        return _orig("/kaggle/working/TITAN_combined/titan", *args, **kw)
    return _orig(repo_id, *args, **kw)


# Marker read by the guard above to recognise an already-patched loader.
_local_tok._titan_offline_patch = True
PreTrainedTokenizerFast.from_pretrained = _local_tok

## titan offline model

In [None]:
import torch
from titan.configuration_titan import TitanConfig
from titan.modeling_titan      import Titan
# Load the TITAN config and model from the staged local directory
# (local_files_only=True on both calls), then switch the model to eval mode.
config = TitanConfig.from_pretrained("/kaggle/working/TITAN_combined/titan", local_files_only=True)
model  = Titan.from_pretrained("/kaggle/working/TITAN_combined/titan", config=config, local_files_only=True ).eval()


## get the hest-segmenter

In [None]:
sys.path.append('/kaggle/input/trident-orig/TRIDENT')
from trident import load_wsi
from trident.segmentation_models.load import HESTSegmenter
from trident.segmentation_models import segmentation_model_factory

def offline_build(self):
    """Build the HEST tissue segmenter from a checkpoint stored on local disk.

    Intended as a replacement for ``HESTSegmenter._build``. It creates a
    DeepLabV3-ResNet50 without any pretrained weights, swaps the last
    classifier layer for a 2-channel 1x1 convolution and loads the weights from
    the local ``deeplabv3_seg_v4.ckpt`` file.

    Side effects:
        Sets ``self.input_size`` (512), ``self.precision`` (``torch.float16``)
        and ``self.target_mag`` (10) once the weights have loaded.

    Returns:
        tuple: ``(model, eval_transforms)`` where ``eval_transforms`` converts an
        image to a tensor and normalises it with the mean/std given below.
    """
    import torch
    import torch.nn as nn
    import torchvision.transforms as transforms
    from torchvision.models.segmentation import deeplabv3_resnet50

    weights_path = "/kaggle/input/hest-tissue-seg-weights/deeplabv3_seg_v4.ckpt"

    # Bare architecture: no weights are fetched for the model or its backbone.
    net = deeplabv3_resnet50(weights=None, weights_backbone=None)
    net.classifier[4] = nn.Conv2d(256, 2, kernel_size=1, stride=1)

    # Keep every non-auxiliary entry of the checkpoint and drop the 'model.'
    # text from its key so the names line up with the bare network.
    ckpt = torch.load(weights_path, map_location='cpu')
    cleaned = {}
    for key, tensor in ckpt.get('state_dict', {}).items():
        if 'aux' in key:
            continue
        cleaned[key.replace('model.', '')] = tensor
    net.load_state_dict(cleaned)

    self.input_size = 512
    self.precision = torch.float16
    self.target_mag = 10

    to_tensor = transforms.ToTensor()
    normalise = transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                     std=(0.229, 0.224, 0.225))
    return net, transforms.Compose([to_tensor, normalise])

# Replace HESTSegmenter's builder with the offline version defined above so the
# weights come from the local checkpoint file.
HESTSegmenter._build = offline_build

# Create the "hest" segmenter through the TRIDENT factory with a 0.5 confidence threshold.
seg_model = segmentation_model_factory(
    "hest",
    confidence_thresh=0.5
)

print("Segmentation model ready")

## get the titan encoder, phikon encoder via trident

In [None]:
import sys
sys.path.append('/kaggle/input/trident-orig/TRIDENT')

# import encoder factories
from trident.patch_encoder_models.load import encoder_factory as patch_encoder_factory
from trident.slide_encoder_models.load import encoder_factory as slide_encoder_factory

# Load the Phikon patch-level encoder, pointing the factory at the local weights file.
phikon_encoder = patch_encoder_factory(
    "phikon",
    weights_path="/kaggle/input/phikon-model/phikon_model/pytorch_model.bin"
)
print(" Phikon encoder loaded:")
# print(phikon_encoder)
# Build the TITAN slide-level encoder. NOTE(review): presumably this resolves to the
# staged local files through the offline flags / tokenizer patch set earlier - confirm.
titan_encoder = slide_encoder_factory('titan',pretrained=True)
print(" Titan encoder loaded")
# print(titan_encoder)

## Part 2: Inference with our best trained models

### part 2: imports

In [None]:
# Core Libraries
import pytorch_lightning as pl
import time
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import h5py
from pathlib import Path
from PIL import Image
import numpy as np
import imagehash
from openslide import OpenSlide
import pandas as pd

# Geospatial libraries
import geopandas as gpd

# Utilities
import shutil
import zipfile

# Compatibility shim for GeoPandas: when GeoSeries has no `union_all` attribute,
# add one that returns `unary_union` and ignores any arguments it is given.
if not hasattr(gpd.GeoSeries, "union_all"):
    gpd.GeoSeries.union_all = lambda self, *args, **kwargs: self.unary_union

### Configuration

In [None]:
class Config:
    """Settings container for the inference notebook.

    Every setting is a plain instance attribute created in ``__init__``:
    classifier/training hyper-parameters, the device string, encoder names,
    patching parameters, and the input/output paths.
    """

    def __init__(self):
        # Classifier and training hyper-parameters.
        self.input_dim, self.hidden_dim, self.num_classes = 768, 512, 5
        self.lr, self.batch_size, self.max_epochs = 3e-4, 32, 50
        self.num_workers, self.n_splits = 4, 5

        # Use the GPU when torch reports one, otherwise the CPU.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        # Encoder names and patch extraction parameters.
        self.patch_encoder_name, self.slide_encoder_name = "phikon", "titan"
        self.patch_mag, self.patch_size = 10, 224

        # Paths inside the competition dataset.
        competition_root = "/kaggle/input/prostate-cancer-grade-assessment"
        self.data_dir = competition_root
        for attr_name, leaf in (
            ("train_images_dir", "train_images"),
            ("test_images_dir", "test_images"),
            ("train_csv", "train.csv"),
            ("test_csv", "test.csv"),
            ("sample_submission", "sample_submission.csv"),
        ):
            setattr(self, attr_name, os.path.join(competition_root, leaf))

        # Trained checkpoints, working directory and output file.
        self.best_models = "/kaggle/input/model-label-denoised"
        self.output_dir = "/kaggle/working/slide_vectors"
        self.submission_output = "submission.csv"
        


### Simple Classifier

In [None]:
class SlideClassifier(pl.LightningModule):
    """MLP head that maps a slide embedding to ``cfg.num_classes`` logits.

    The head is three ``Linear -> ReLU -> BatchNorm1d -> Dropout(0.3)`` stages
    (widths 512, 256, 128) followed by a final ``Linear`` layer. Linear layers
    are re-initialised with Kaiming-normal weights and zero biases. The
    ``criterion`` and the ``val_preds`` / ``val_targets`` lists are created here
    but are not used by the methods defined in this class.
    """

    def __init__(self, cfg):
        super().__init__()
        self.save_hyperparameters(cfg.__dict__)

        # Assemble the stages in order so the layer indices inside the
        # Sequential container are 0..12.
        stages = []
        width_in = cfg.input_dim
        for width_out in (512, 256, 128):
            stages.append(nn.Linear(width_in, width_out))
            stages.append(nn.ReLU())
            stages.append(nn.BatchNorm1d(width_out))
            stages.append(nn.Dropout(0.3))
            width_in = width_out
        stages.append(nn.Linear(width_in, cfg.num_classes))
        self.head = nn.Sequential(*stages)

        self.criterion = nn.BCEWithLogitsLoss()
        self.val_preds = []
        self.val_targets = []
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Kaiming-normal weights and zero bias for Linear layers; others untouched."""
        if not isinstance(m, nn.Linear):
            return
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the head's logits for the input batch ``x``."""
        logits = self.head(x)
        return logits

### Custom test dataset needed for the DataLoader

In [None]:
class SlideTestDataset(Dataset):
    """Dataset over a ``{slide_id: vector}`` mapping.

    Item ``i`` is the pair ``(slide_id, vector)`` for the i-th key of the
    mapping, in the key order captured at construction time.
    """

    def __init__(self, slide_vectors):
        self.slide_ids = list(slide_vectors)
        self.slide_vectors = slide_vectors

    def __len__(self):
        """Number of slides in the dataset."""
        return len(self.slide_ids)

    def __getitem__(self, idx):
        """Return ``(slide_id, vector)`` for position ``idx``."""
        key = self.slide_ids[idx]
        return key, self.slide_vectors[key]

### Extracting all features with Phikon and Titan

In [None]:
# Number of training slides used when no test images are present. The value
# matches the rows previously selected with `.loc[:10]` (labels 0..10 inclusive).
_FALLBACK_SLIDE_COUNT = 11


def _extract_slide_vector(cfg, slide_path, seg, patch_encoder, slide_encoder):
    """Run the per-slide pipeline and return the slide embedding.

    Steps: tissue segmentation, patch coordinate extraction, patch feature
    extraction, slide feature extraction, then reading the ``features``
    dataset from ``<output_dir>/<stem>/embeddings/<stem>.h5``.

    Returns:
        torch.Tensor built from the stored features, or ``None`` when the
        expected ``.h5`` file does not exist after extraction.
    """
    job_dir = Path(cfg.output_dir) / slide_path.stem
    job_dir.mkdir(parents=True, exist_ok=True)

    print(f" Processing {slide_path.name}...")
    wsi = load_wsi(slide_path, lazy_init=False)

    # Tissue segmentation at the magnification the segmenter declares.
    wsi.segment_tissue(seg, seg.target_mag, job_dir, cfg.device)

    # Patch coordinates at the configured magnification and patch size.
    coords_path = wsi.extract_tissue_coords(
        target_mag=cfg.patch_mag,
        patch_size=cfg.patch_size,
        save_coords=str(job_dir),
    )

    # Patch-level features.
    patch_features_path = wsi.extract_patch_features(
        patch_encoder=patch_encoder,
        coords_path=str(coords_path),
        save_features=str(job_dir),
        device=cfg.device,
        batch_limit=32,
    )

    # Slide-level features are written into a dedicated embeddings directory.
    embeddings_dir = job_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)
    wsi.extract_slide_features(
        str(patch_features_path),
        slide_encoder,
        str(embeddings_dir),
        cfg.device
    )

    embeddings_h5 = embeddings_dir / f"{slide_path.stem}.h5"
    if not embeddings_h5.exists():
        return None
    with h5py.File(embeddings_h5, "r") as f:
        # `[:]` reads the dataset into memory before the file is closed.
        return torch.from_numpy(f["features"][:])


def extract_test_features(cfg,phikon_encoder,titan_encoder,seg_model):
    """Compute one slide-level embedding per slide.

    Uses every ``*.tiff`` in ``cfg.test_images_dir`` when that directory exists.
    Otherwise it falls back to the first ``_FALLBACK_SLIDE_COUNT`` ids listed in
    ``cfg.train_csv`` and reads them from ``cfg.train_images_dir``. Slides are
    processed in file-name order.

    A slide whose processing raises is reported and skipped (best effort), so
    the result may hold fewer entries than there are slides.

    Args:
        cfg: Settings object providing the paths, ``device``, ``patch_mag``
            and ``patch_size``.
        phikon_encoder: Patch-level encoder passed to ``extract_patch_features``.
        titan_encoder: Slide-level encoder passed to ``extract_slide_features``.
        seg_model: Tissue segmenter; its ``target_mag`` attribute is read.

    Returns:
        dict: ``{slide_stem: torch.Tensor}`` for every slide that succeeded.
    """
    print("Start feature extraction on the test set ..")

    is_test = os.path.exists(cfg.test_images_dir)
    image_folder = cfg.test_images_dir if is_test else cfg.train_images_dir
    print(f"Using images from: {image_folder}")

    # Step 1: collect the slide paths.
    if is_test:
        slide_paths = list(Path(image_folder).glob("*.tiff"))
    else:
        fallback_ids = (
            pd.read_csv(cfg.train_csv)["image_id"].head(_FALLBACK_SLIDE_COUNT).tolist()
        )
        # Report the real number of slides; the message used to claim 100.
        print(f"No Test images found! Only use the first {len(fallback_ids)} training slides")
        slide_paths = [Path(image_folder) / f"{sid}.tiff" for sid in fallback_ids]

    slide_paths.sort(key=lambda p: p.name)

    # Step 2: run the pipeline slide by slide.
    slide_vectors = {}
    for slide_path in slide_paths:
        try:
            vector = _extract_slide_vector(
                cfg, slide_path, seg_model, phikon_encoder, titan_encoder
            )
        except Exception as e:
            # Deliberate best effort: one broken slide must not stop the run.
            print(f"Error processing {slide_path.name}: {e}")
            continue

        if vector is None:
            print(f"Warning: embeddings .h5 not found for {slide_path.stem}")
        else:
            slide_vectors[slide_path.stem] = vector

    print(f"Done extracting features for {len(slide_vectors)} slides.")
    return slide_vectors
            


### Run inference


In [None]:
def run_inference(cfg):
    """Extract slide embeddings, ensemble the fold classifiers and write the submission.

    Workflow:
      1. Compute slide embeddings with ``extract_test_features`` using the
         module-level ``phikon_encoder``, ``titan_encoder`` and ``seg_model``.
         If none are produced, write the sample submission with every grade
         set to 0 and return.
      2. For each ``fold*.ckpt`` in ``cfg.best_models``: load the classifier,
         threshold the sigmoid outputs at 0.5 and sum the positives per slide
         to obtain an integer grade.
      3. Average the per-fold grades and round to the nearest integer.
      4. Attach the predictions to the CSV rows **by image id**. Predictions
         come out in sorted-file-name order and only for slides that were
         processed successfully, so a positional assignment could mislabel
         rows or fail on a length mismatch. Rows without a prediction get 0.
      5. When the test image directory is absent, overwrite the output with the
         unmodified sample submission (behaviour kept from the original flow).

    The test-directory check uses ``cfg.test_images_dir``; this is the same
    directory as ``../input/prostate-cancer-grade-assessment/test_images`` when
    the working directory is ``/kaggle/working``.

    Args:
        cfg: Settings object providing paths, ``batch_size``, ``num_workers``
            and ``device``.

    Raises:
        FileNotFoundError: If ``cfg.best_models`` contains no ``fold*.ckpt``.
    """
    slide_vectors = extract_test_features(cfg, phikon_encoder, titan_encoder, seg_model)

    if not slide_vectors:
        print("No slide features extracted. Creating dummy submission...")
        dummy_df = pd.read_csv(cfg.sample_submission)
        dummy_df["isup_grade"] = 0
        dummy_df.to_csv(cfg.submission_output, index=False)
        print(f"Dummy submission saved at {cfg.submission_output}")
        return

    # Dataset and loader; shuffle=False keeps predictions in dataset.slide_ids order.
    dataset = SlideTestDataset(slide_vectors)
    dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

    # Locate the best checkpoint of every fold.
    fold_ckpts = sorted(Path(cfg.best_models).glob("fold*.ckpt"))
    if not fold_ckpts:
        # Averaging an empty list would yield NaN and a meaningless grade.
        raise FileNotFoundError(f"No fold*.ckpt checkpoints found in {cfg.best_models}")
    print(f"Using {len(fold_ckpts)} models for ensemble: {fold_ckpts}")

    all_preds = []
    for ckpt_path in fold_ckpts:
        print(f"Loading model: {ckpt_path}")
        model = SlideClassifier.load_from_checkpoint(ckpt_path, cfg=cfg)
        model.eval().to(cfg.device)

        preds = []
        with torch.no_grad():
            for _, vectors in dataloader:
                logits = model(vectors.to(cfg.device))

                # Ordinal decoding: grade = number of outputs above 0.5.
                ordinal_preds = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
                preds.extend(ordinal_preds.sum(axis=1).astype(int))

        all_preds.append(preds)

    # Ensemble: mean grade across folds, rounded to an integer.
    final_preds = np.round(np.mean(all_preds, axis=0)).astype(int)
    pred_by_id = dict(zip(dataset.slide_ids, final_preds.tolist()))

    # Build the submission frame, matching predictions to rows by image id.
    is_test = os.path.exists(cfg.test_images_dir)
    if is_test:
        df = pd.read_csv(cfg.test_csv)
    else:
        df_train = pd.read_csv(cfg.train_csv)
        df = df_train[df_train["image_id"].isin(list(pred_by_id))].copy()

    mapped = df["image_id"].map(pred_by_id)
    n_missing = int(mapped.isna().sum())
    if n_missing:
        print(f"Warning: {n_missing} slides have no prediction; using grade 0 for them")
    df["isup_grade"] = mapped.fillna(0).astype(int)

    submission_df = df[["image_id", "isup_grade"]]
    submission_df.to_csv(cfg.submission_output, index=False)
    print(f"Inference done! Submission saved as {cfg.submission_output}")

    # To ensure the notebook still produces a valid file when Kaggle runs it
    # without the hidden test images.
    if is_test:
        # Debug marker kept verbatim; the predictions were already written above.
        print("Still can not access the test file?")
    else:
        # Fall back to the Kaggle sample submission.
        sample_df = pd.read_csv(cfg.sample_submission)
        sample_df.to_csv(cfg.submission_output, index=False)


In [None]:
def main():
    """Entry point: run the inference pipeline with a default ``Config``."""
    run_inference(Config())


if __name__ == "__main__":
    main()