## seaborn

In [1]:
import anndata as ad
import numpy as np

In [None]:

adata_train = ad.read_h5ad('/Users/apple/Desktop/KB/data/Shaffer_cancer/shaffer_train.h5ad')
adata_test  = ad.read_h5ad('/Users/apple/Desktop/KB/data/Shaffer_cancer/shaffer_test.h5ad')

train_labels = adata_train.obs["clone_id"]
test_labels = adata_test.obs["clone_id"]

#calculate global frequency for total cells
all_labels = np.concatenate([train_labels, test_labels])
uniques, counts = np.unique(all_labels, return_counts=True)
total = len(all_labels)
global_freq = { lab: cnt / total for lab, cnt in zip(uniques, counts) }

input_dir = "/Users/apple/Desktop/KB/data/feat_LCL_2025/shaffer_cancer"

train_embeddings = np.load(input_dir+'/feat_shaffer_lambda01_unlab5_bs110_testAsPenalty/scBaseEncoderFeat_Z_bs110_tau0.5.npy')
test_embeddings = np.load(input_dir+'/feat_shaffer_lambda01_unlab5_bs110_testAsPenalty/test_embedding.npy')

k = 30

### Modified KNN 1: only for testing pred 

In [2]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def adjusted_knn_predict(
    knn_model: KNeighborsClassifier,
    train_labels: np.ndarray,
    test_embeddings: np.ndarray,
    global_freq: dict,
    k: int
) -> np.ndarray:
    """
    For each test point:
      1) find its k nearest TRAINING neighbors
      2) compute local_freq[L] = (# neighbors with label L) / k
      3) adjusted_score[L] = local_freq[L] - global_freq[L]
      4) predict the label with highest adjusted_score
    
    Returns:
      predicted_labels: np.ndarray of shape (n_test,)
    """
    # find the k nearest neighbors in the TRAIN set for each test point
    neigh_indices = knn_model.kneighbors(test_embeddings, return_distance=False)  # shape: (n_test, k)
    
    preds = []
    for nbrs in neigh_indices:
        # count how many of those k neighbors belong to each lineage
        nbr_labels = train_labels[nbrs]
        unique, counts = np.unique(nbr_labels, return_counts=True)
        
        # compute local frequencies
        local_freq = { lab: cnt / k for lab, cnt in zip(unique, counts) }
        
        # build adjusted scores = local_freq - global_freq (zero if lineage not among neighbors)
        scores = {}
        for lab, g in global_freq.items():
            scores[lab] = local_freq.get(lab, 0.0) - g
        
        # pick the label with the highest adjusted score
        pred = max(scores.items(), key=lambda x: x[1])[0]
        preds.append(pred)
    
    return np.array(preds)


In [5]:
#Fit a standard KNN on the TRAINING data:
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(train_embeddings, train_labels)

#Run the adjusted predictor:
adjusted_preds = adjusted_knn_predict(
    knn_model=knn,
    train_labels=train_labels,
    test_embeddings=test_embeddings,
    global_freq=global_freq,
    k=k
)
#Compute accuracy or any downstream metric:
accuracy = (adjusted_preds == test_labels).mean()
print(f"Adjusted KNN accuracy: {accuracy * 100:.2f}%")

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  nbr_labels = train_labels[nbrs]


Adjusted KNN accuracy: 18.12%


### Modified KNN 2: for both training and testing

In [9]:
import numpy as np
from sklearn.metrics import pairwise_distances

def adjusted_knn_predict(train_embeddings, train_labels, 
                         test_embeddings, global_freq, k=30):
    # Compute distance matrix (n_test_samples, n_train_samples)
    distances = pairwise_distances(test_embeddings, train_embeddings)

    predictions = []
    for dist in distances:
        # indices of k nearest neighbors
        nearest_indices = np.argsort(dist)[:k]
        
        # labels of these neighbors
        neighbor_labels = train_labels[nearest_indices]
        
        # compute local frequencies
        unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
        local_freq = {lab: cnt / k for lab, cnt in zip(unique_labels, counts)}
        
        # compute adjusted scores (local - global)
        adjusted_scores = {lab: local_freq.get(lab, 0) - global_freq.get(lab, 0)
                           for lab in global_freq.keys()}
        
        # choose label with the highest adjusted score
        predicted_label = max(adjusted_scores, key=adjusted_scores.get)
        predictions.append(predicted_label)

    return np.array(predictions)

In [10]:
test_predictions = adjusted_knn_predict(
    train_embeddings=train_embeddings,
    train_labels=train_labels,
    test_embeddings=test_embeddings,
    global_freq=global_freq,
    k=30
)

  neighbor_labels = train_labels[nearest_indices]


In [11]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_labels, test_predictions)
print(f"Adjusted KNN Accuracy: {accuracy:.4f}")

Adjusted KNN Accuracy: 0.1812


## Plot

### 1. read the data and embeddings

In [None]:

adata_train = ad.read_h5ad('/Users/apple/Desktop/KB/data/Shaffer_cancer/shaffer_train.h5ad')
adata_test  = ad.read_h5ad('/Users/apple/Desktop/KB/data/Shaffer_cancer/shaffer_test.h5ad')

input_dir = "/Users/apple/Desktop/KB/data/feat_LCL_2025/shaffer_cancer"

train_semi_10 = np.load(input_dir+'/feat_shaffer_lambda01_unlab5_bs110_testAsPenalty/scBaseEncoderFeat_Z_bs110_tau0.5.npy')
test_semi_10 = np.load(input_dir+'/feat_shaffer_lambda01_unlab5_bs110_testAsPenalty/test_embedding.npy')


In [None]:
adata_train.obsm["LCL_embedding_semi_10"] = train_semi_10
adata_test.obsm["LCL_embedding_semi_10"] = test_semi_10

adata_train.obs["dataset"] = "train"
adata_test.obs["dataset"] = "test"

adata = ad.concat([adata_train, adata_test], axis=0, join='outer')

### 2. compute the umap coordinates

In [None]:
import umap

reducer = umap.UMAP()

embedding_umap = reducer.fit_transform(adata.obsm["LCL_embedding_semi_10"])

adata.obsm["X_umap"] = embedding_umap

In [None]:
import pandas as pd

# Count number of cells per lineage
clone_counts = adata.obs["clone_id"].value_counts()

# Get the top 5 largest lineages
top_5_clones = clone_counts.index[:5]

# Assign 'Other' to all lineages except the top 5
adata.obs["clone_group"] = adata.obs["clone_id"].apply(lambda x: x if x in top_5_clones else "Other")

# Convert to categorical
adata.obs["clone_group"] = adata.obs["clone_group"].astype("category")

# Print for verification
print(adata.obs["clone_group"].value_counts())

### 3. plot 1: plot top 5 lineages with both training and testing cells

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_umap(adata, colormap="tab10"):
    """
    Plots UMAP with:
    - Top 5 clones in distinct colors (plotted on top)
    - Other clones in gray with lower opacity
    - Train cells as dots, Test cells as crosses
    - Larger marker size for top 5 clones
    """
    umap_coords = adata.obsm["X_umap"]

    # Extract train and test indices
    train_idx = adata.obs["dataset"] == "train"
    test_idx = adata.obs["dataset"] == "test"

    # Get unique clone groups
    unique_clones = adata.obs["clone_group"].cat.categories

    # Define a colormap for the top 5 clones, others in gray
    colors = plt.get_cmap(colormap)(range(len(unique_clones) - 1))  # Leave space for gray
    color_map = dict(zip(unique_clones[:-1], colors))  # Map top 5 clones
    color_map["Other"] = "gray"  # Set 'Other' to gray

    plt.figure(figsize=(8, 6))

    # **Step 1**: Plot "Other" cells first (background with low opacity)
    idx_train_other = (adata.obs["clone_group"] == "Other") & train_idx
    idx_test_other = (adata.obs["clone_group"] == "Other") & test_idx

    plt.scatter(umap_coords[idx_train_other, 0], umap_coords[idx_train_other, 1], 
                color=color_map["Other"], s=8, marker=".", alpha=0.2, label="Train Other")  # Lower opacity

    plt.scatter(umap_coords[idx_test_other, 0], umap_coords[idx_test_other, 1], 
                color=color_map["Other"], s=12, marker="x", alpha=0.2, label="Test Other")  # Lower opacity

    # **Step 2**: Plot top 5 clones on top (larger size)
    for clone in unique_clones[:-1]:  # Skip "Other"
        idx_train = (adata.obs["clone_group"] == clone) & train_idx
        idx_test = (adata.obs["clone_group"] == clone) & test_idx

        # Train: Dots
        plt.scatter(umap_coords[idx_train, 0], umap_coords[idx_train, 1], 
                    color=color_map[clone], s=30, marker=".", alpha=0.8, label=f"Train {clone}")  # Bigger size

        # Test: Crosses
        plt.scatter(umap_coords[idx_test, 0], umap_coords[idx_test, 1], 
                    color=color_map[clone], s=40, marker="x", alpha=0.9, label=f"Test {clone}")  # Bigger size

    plt.xlabel("UMAP1")
    plt.ylabel("UMAP2")
    plt.title("UMAP Projection - Top 5 Clones Highlighted")
    plt.legend()
    plt.show()

# Run the function to plot
plot_umap(adata)

### 4. plot 2: plot top 1 lineages with both training and testing cells

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_umap_with_lineages(adata, n_top_lineages=5, colormap="tab10"):
    """
    Plots UMAP from `adata.obsm["X_umap"]` with:
    - Top N clones in distinct colors (plotted on top)
    - Other clones in gray with lower opacity
    - Train cells as dots, Test cells as crosses
    
    Parameters:
    - adata (AnnData): AnnData object with precomputed UMAP in `.obsm["X_umap"]`
    - n_top_lineages (int): Number of largest lineages to highlight in the plot
    - colormap (str): Matplotlib colormap for the distinct top N lineages
    
    Output:
    - A UMAP scatter plot (does NOT modify `adata`)
    """

    ### **1️⃣ Check If Required Fields Exist**
    if "X_umap" not in adata.obsm:
        raise ValueError("UMAP coordinates missing! Ensure `adata.obsm['X_umap']` is computed.")
    
    if "LCL_embedding_semi_10" not in adata.obsm:
        raise ValueError("Contrastive learning embeddings missing! Ensure `adata.obsm['LCL_embedding_dim10']` exists.")
    
    if "clone_id" not in adata.obs:
        raise ValueError("Clone ID column missing! Ensure `adata.obs['clone_id']` exists.")
    
    if "dataset" not in adata.obs:
        raise ValueError("Dataset column missing! Ensure `adata.obs['dataset']` exists with 'train' and 'test' values.")

    ### **2️⃣ Identify the Top N Largest Lineages**
    print(f"Identifying the top {n_top_lineages} largest lineages...")
    clone_counts = adata.obs["clone_id"].value_counts()
    top_n_clones = clone_counts.index[:n_top_lineages]

    # Assign "Other" to all but the top N lineages
    adata.obs["clone_group"] = adata.obs["clone_id"].apply(lambda x: x if x in top_n_clones else "Other")
    
    # Convert to categorical for easy plotting
    adata.obs["clone_group"] = adata.obs["clone_group"].astype("category")

    ### **3️⃣ Plot UMAP with Custom Formatting**
    print("Plotting UMAP with lineage-specific colors and train/test markers...")

    umap_coords = adata.obsm["X_umap"]
    train_idx = adata.obs["dataset"] == "train"
    test_idx = adata.obs["dataset"] == "test"
    
    unique_clones = adata.obs["clone_group"].cat.categories

    # Define a colormap for the top N clones, others in gray
    colors = plt.get_cmap(colormap)(range(len(unique_clones) - 1))  # Leave space for gray
    color_map = dict(zip(unique_clones[:-1], colors))  # Map top N clones
    color_map["Other"] = "gray"  # Set 'Other' to gray

    plt.figure(figsize=(8, 6))

    # **Step 1**: Plot "Other" cells first (background with low opacity)
    idx_train_other = (adata.obs["clone_group"] == "Other") & train_idx
    idx_test_other = (adata.obs["clone_group"] == "Other") & test_idx

    plt.scatter(umap_coords[idx_train_other, 0], umap_coords[idx_train_other, 1], 
                color=color_map["Other"], s=8, marker=".", alpha=0.2, label="Train Other")

    plt.scatter(umap_coords[idx_test_other, 0], umap_coords[idx_test_other, 1], 
                color=color_map["Other"], s=8, marker="x", alpha=0.2, label="Test Other")

    # **Step 2**: Plot top N clones on top (larger size)
    for clone in unique_clones[:-1]:  # Skip "Other"
        idx_train = (adata.obs["clone_group"] == clone) & train_idx
        idx_test = (adata.obs["clone_group"] == clone) & test_idx

        # Train: Dots
        plt.scatter(umap_coords[idx_train, 0], umap_coords[idx_train, 1], 
                    color=color_map[clone], s=40, marker=".", alpha=0.8, label=f"Train {clone}")

        # Test: Crosses
        plt.scatter(umap_coords[idx_test, 0], umap_coords[idx_test, 1], 
                    color=color_map[clone], s=40, marker="x", alpha=1, label=f"Test {clone}")

    plt.xlabel("UMAP1")
    plt.ylabel("UMAP2")
    plt.title(f"UMAP Projection - Top {n_top_lineages} Clones Highlighted")
    plt.legend()
    plt.show()

In [None]:
plot_umap_with_lineages(adata, n_top_lineages=1)

### 5. plot 3.1: plot both training and testing cells

In [None]:
import matplotlib.pyplot as plt
plt.scatter(embedding_umap[:,0],embedding_umap[:,1])

### 5. plot 3.2: plot training cells

In [None]:
plt.scatter(embedding_umap[:20656,0],embedding_umap[:20656,1])

### 5. plot 3.2: plot testing cells

In [None]:
plt.scatter(embedding_umap[20656:,0],embedding_umap[20656:,1])