In [1]:
import anndata as ad
import pandas as pd
import numpy as np

import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import calinski_harabasz_score

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

In [2]:
# Root data directory. NOTE(review): this is an absolute, machine-specific
# path; it has to be edited before the notebook can run anywhere else.
input_dir = "/Users/apple/Desktop/KB/data"

# Train / test AnnData objects and the per-cell clone labels stored in .obs.
adata_train = ad.read_h5ad(f"{input_dir}/BiddyData/Biddy_train.h5ad")
adata_test = ad.read_h5ad(f"{input_dir}/BiddyData/Biddy_test.h5ad")
train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()

# Pre-computed projection embeddings saved as .npy files.
# Presumably row-aligned with the cells of the AnnData objects above -- verify.
embed_dir = f"{input_dir}/feat_LCL_2025/cell_tag/feat_celltag_lambda005_unlab5_bs50_testAsPenalty"
train_embeddings = np.load(f"{embed_dir}/train_proj_embed.npy")
test_embeddings = np.load(f"{embed_dir}/test_proj_embed.npy")

# Shapes of labels and embeddings, printed for a quick visual sanity check.
print(train_labels.shape, test_labels.shape)
print(train_embeddings.shape, test_embeddings.shape)

(5893,) (641,)
(5893, 32) (641, 32)


In [3]:
# Minimum number of test cells a clone needs to be kept for evaluation.
min_size = 16

# Number of test cells per clone id, and the clone ids that reach the threshold.
test_label_series = pd.Series(test_labels)
test_sizes = test_label_series.value_counts()
kept_lineages = test_sizes.index[(test_sizes >= min_size).to_numpy()]

# Boolean mask over test cells, applied to both the embeddings and the labels
# so the two filtered arrays stay row-aligned.
test_keep = test_label_series.isin(kept_lineages).to_numpy()
X_test_f, y_test_f = test_embeddings[test_keep], test_labels[test_keep]


In [4]:
# Number of distinct clone ids among the training cells
# (dropna=False so a missing id would still be counted as one value).
adata_train.obs["clone_id"].nunique(dropna=False)

169

#### Calinski-Harabasz score

In [5]:
# Calinski-Harabasz score of the training embeddings, with the clone ids
# used as the cluster assignment.
score = calinski_harabasz_score(X=train_embeddings, labels=train_labels)

print("Train Calinski-Harabasz Score:", score)


Train Calinski-Harabasz Score: 1895.2273223423206


In [6]:
# Calinski-Harabasz score on the filtered test subset (X_test_f / y_test_f),
# not on the full test set.
score = calinski_harabasz_score(X=X_test_f, labels=y_test_f)

print("Test Calinski-Harabasz Score:", score)


Test Calinski-Harabasz Score: 14.746171069823525


## KNN classifier

In [7]:
# Number of training cells per clone id, largest clones first.
train_clone_ids = adata_train.obs["clone_id"]
train_clone_ids.value_counts(sort=True, ascending=False)

clone_id
493.0     1178
2352.0     591
487.0      329
666.0      296
2721.0     263
          ... 
2902.0       5
2951.0       5
2863.0       5
2894.0       5
2367.0       5
Name: count, Length: 169, dtype: int64

### Test Accuracy

In [8]:

# 5-nearest-neighbour classifier fitted on every training embedding.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)

# Predicted clone ids for the filtered test cells, scored against their true ids.
y_pred = knn.predict(X_test_f)
accuracy = accuracy_score(y_true=y_test_f, y_pred=y_pred)

print(f"KNN classifier testing accuracy: {accuracy * 100:.2f}%")


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



KNN classifier testing accuracy: 54.06%


### Train Accuracy (held-out 20% split of the training data)

In [9]:
# Hold out 20% of the *training* embeddings (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Dedicated *_holdout names are used so that `knn`, `y_pred` and `accuracy`
# from the test-set evaluation are not overwritten by this cell.
knn_holdout = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_holdout = knn_holdout.predict(X_test)
accuracy_holdout = accuracy_score(y_test, y_pred_holdout)

# NOTE: although labelled "training accuracy", this number is measured on the
# 20% of the training data that the classifier was NOT fitted on.
print(f"KNN classifier training accuracy: {accuracy_holdout * 100:.2f}%")


KNN classifier training accuracy: 98.98%


In [None]:
# Per-lineage KNN accuracy on the filtered test set, plotted against lineage size.

# Predict clone ids for the filtered test cells with a KNN fitted on the full
# training set. The predictions are computed here under dedicated names so the
# cell does not depend on whichever cell last assigned `knn` / `y_pred`.
knn_test = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)
y_pred_test = knn_test.predict(X_test_f)
assert len(y_pred_test) == len(y_test_f), "predictions and labels must be row-aligned"

# One row per filtered test cell: true vs. predicted clone id.
df = pd.DataFrame({
    "true_clone": y_test_f,
    "pred_clone": y_pred_test,
})

# Group the per-cell correctness flag by true clone id.
correct_by_clone = (
    df.assign(correct=df["true_clone"] == df["pred_clone"])
      .groupby("true_clone")["correct"]
)

# Lineage size = number of filtered test cells with that true clone id.
lineage_size = correct_by_clone.size().rename("lineage_size")

# Per-lineage accuracy = fraction of those cells predicted correctly.
lineage_acc = correct_by_clone.mean().rename("accuracy")

# One row per lineage: true_clone, lineage_size, accuracy.
lineage_stats = pd.concat([lineage_size, lineage_acc], axis=1).reset_index()

fig, ax = plt.subplots(figsize=(6, 5), dpi=300)

sns.scatterplot(
    data=lineage_stats,
    x="lineage_size",
    y="accuracy",
    s=40,
    alpha=0.7,
    edgecolor=None,
    ax=ax,
)

ax.set_xscale("log")  # lineage sizes are plotted on a log axis
ax.set_xlabel("Lineage size (log scale)")
ax.set_ylabel("Per-lineage KNN accuracy")
ax.set_title("Per-lineage accuracy vs lineage size")

fig.tight_layout()
plt.show()