In [1]:
!python --version

Python 3.11.13


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install /content/drive/MyDrive/Thesis/Projects/Loki/src/.

In [2]:
#--------------------
##Import
#--------------------

import os
import pandas as pd
import numpy as np
from PIL import Image
import cv2

import albumentations as trans
from albumentations.pytorch import ToTensorV2

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from tqdm.notebook import tqdm

In [3]:
import scanpy as sc
import anndata
import loki.utils
import loki.preprocess



In [4]:
import tarfile

tar_path = "/content/drive/MyDrive/Thesis/Projects/Data/Mouse_Coronal_Patches_C.tar"
output_folder = "/content/Mouse_Coronal_Patches"

def extract_images_from_tar(tar_path, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open and extract tar file
    with tarfile.open(tar_path, 'r') as tar:
        image_members = [m for m in tqdm(tar.getmembers())]

        for member in tqdm(image_members):
            tar.extract(member, path=output_folder)

    print(f"✅ Extraction complete! Images saved to: {output_folder}")

extract_images_from_tar(tar_path, output_folder)


  0%|          | 0/63173 [00:00<?, ?it/s]

  0%|          | 0/63173 [00:00<?, ?it/s]

✅ Extraction complete! Images saved to: /content/Mouse_Coronal_Patches


In [5]:
#---------------------
##Parameters
#---------------------

model_dir = "/content/drive/MyDrive/Thesis/Projects/Loki"
model_path = os.path.join(model_dir, 'checkpoint.pt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#HE_dir = "/content/drive/MyDrive/Thesis/Projects/Data/Mouse_Coronal_Patches"
HE_dir = "/content/Mouse_Coronal_Patches/Mouse_Coronal_Patches"

Load Data

using RAM

In [5]:
# Basic Transformation
base_transform = trans.Compose([
    trans.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])

class Histo_images(Dataset):
    def __init__(self, patch_paths, transform=base_transform):
        self.transform = transform
        self.cell_ids = []
        self.images = []  # << store image tensors in RAM

        for p in tqdm(patch_paths):
            cell_id = p[-14:-4]
            img = cv2.cvtColor(cv2.imread(p), cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (224, 224))
            img = self.transform(image=img)['image']
            self.cell_ids.append(cell_id)
            self.images.append(img)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        return self.cell_ids[index], self.images[index]


not using RAM

In [12]:
# Basic Transformation
base_transform = trans.Compose([
    trans.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])

class Histo_images(Dataset):
    def __init__(self, patch_paths, transform=base_transform):
        self.transform = transform
        self.cell_ids = []
        self.paths = []

        for p in tqdm(patch_paths):
          self.paths.append(p)
          cell_id = p[-14:-4]
          self.cell_ids.append(cell_id)

    def __len__(self):
        return len(self.cell_ids)

    def __getitem__(self, index):
        p = self.paths[index]
        img = cv2.cvtColor(cv2.imread(p), cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224))
        img = self.transform(image=img)['image']

        return self.cell_ids[index], img


In [13]:
img_list = os.listdir(HE_dir)
patch_paths = [os.path.join(HE_dir, fn) for fn in img_list]

In [20]:
Data_set = Histo_images(transform=base_transform, patch_paths=patch_paths)
Data_loader = DataLoader(Data_set, batch_size=256, shuffle=False, drop_last=False)

  0%|          | 0/63172 [00:00<?, ?it/s]

model

In [9]:
model, preprocess, tokenizer = loki.utils.load_model(model_path, device)
model.eval()

CoCa(
  (text): TextTransformer(
    (token_embedding): Embedding(49408, 768)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_drop

In [21]:
def Embed(model, Data_loader, device):

    model.eval()  # set eval mode
    all_cell_ids = []
    embedded_vectors = []

    with torch.inference_mode():  # no_grad

        for cell_ids, frames in tqdm(Data_loader, desc='Inference', leave=False):

            # Forward pass
            outputs = model(frames.to(device))

            # Move back to CPU for storage
            outputs = outputs['image_features'].detach().cpu()

            # Store results
            all_cell_ids.extend(cell_ids)
            embedded_vectors.append(outputs)

    # Concatenate all embedded vectors → shape: (N, embed_dim)
    embedded_vectors = torch.cat(embedded_vectors, dim=0)

    return all_cell_ids, embedded_vectors


In [None]:
cell_ids, E_vectors = Embed(model, Data_loader, device)

Inference:   0%|          | 0/247 [00:00<?, ?it/s]

In [None]:
# Predefined function to encode the images
image_embeddings = loki.utils.encode_images(model, preprocess, patch_paths, device)
print(image_embeddings.shape)

In [None]:
import anndata
import pandas as pd

#cell_ids = [p[-14:-4] for p in patch_paths]
#E_ectors = image_embeddings.detach().cpu().numpy()

# Create a minimal AnnData object
adata = anndata.AnnData(
    X = np.zeros((len(cell_ids), 1))
)

# Store metadata properly
adata.obs["cell_id"] = cell_ids
adata.obsm["X_custom"] = E_vectors

# Save to file
adata.write("/content/drive/MyDrive/Thesis/Projects/Data/Mouse_Brain_Image_Embedding/cells.h5ad")