In [1]:
import anndata as ad
import pandas as pd
import numpy as np

import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import calinski_harabasz_score

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder that holds the Larry AnnData splits and the exported embedding arrays.
input_dir = "/Users/apple/Desktop/KB/data/"
# `input_dir` already ends with "/", so strip it once here; otherwise every
# joined path below would contain a doubled separator ("data//LarryData/...").
data_root = input_dir.rstrip("/")

# Train / test AnnData objects.
adata_train = ad.read_h5ad(f"{data_root}/LarryData/Larry_200_train.h5ad")
adata_test = ad.read_h5ad(f"{data_root}/LarryData/Larry_200_test.h5ad")

# Per-cell clone labels, read from the `clone_id` column of `.obs`.
train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()

# Embedding arrays saved as .npy files for this run.
embed_dir = f"{data_root}/feat_LCL_2025/Larry/larry_top200_lambda0.01_unlab15_bs150_testAsPenalty"
train_embeddings = np.load(f"{embed_dir}/train_proj_embed.npy")
test_embeddings = np.load(f"{embed_dir}/test_proj_embed.npy")

# Every metric below pairs row i of an embedding array with label i, so fail
# fast if an embedding file does not match its AnnData split in length.
# NOTE(review): equal lengths do not prove identical cell order - confirm the
# embeddings were exported in the same order as `adata_*.obs`.
assert train_embeddings.shape[0] == train_labels.shape[0], (
    f"train embeddings have {train_embeddings.shape[0]} rows "
    f"but there are {train_labels.shape[0]} train labels"
)
assert test_embeddings.shape[0] == test_labels.shape[0], (
    f"test embeddings have {test_embeddings.shape[0]} rows "
    f"but there are {test_labels.shape[0]} test labels"
)

print(train_labels.shape, test_labels.shape)
print(train_embeddings.shape, test_embeddings.shape)

(10148,) (1225,)
(10148, 32) (1225, 32)


In [3]:
# Number of distinct clone ids in the training split
# (dropna=False keeps a missing value, if any, counted as its own entry).
adata_train.obs["clone_id"].nunique(dropna=False)

200

## Calinski-Harabasz score

In [4]:
# Calinski-Harabasz score of the clone labels over the training embeddings.
score = calinski_harabasz_score(X=train_embeddings, labels=train_labels)
print("Train Calinski-Harabasz Score:", score)


Train Calinski-Harabasz Score: 114834.76691419636


In [5]:
# Calinski-Harabasz score of the clone labels over the test embeddings.
# Note: this rebinds `score`, replacing the training value computed earlier.
score = calinski_harabasz_score(X=test_embeddings, labels=test_labels)
print("Test Calinski-Harabasz Score:", score)


Test Calinski-Harabasz Score: 2.6215108388088586


## KNN classifier

In [6]:
# Cells per clone in the training split; value_counts sorts the largest clone first.
train_clone_sizes = adata_train.obs["clone_id"].value_counts()
train_clone_sizes

clone_id
1261    159
2370    148
5714    126
292     120
5209    116
       ... 
1691     31
2115     30
998      30
1998     30
922      30
Name: count, Length: 200, dtype: int64

### Test Accuracy

In [7]:

# Fit a 5-nearest-neighbour classifier on all training embeddings
# (`fit` returns the fitted estimator, so construction and fitting are chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)

# Predict clone ids for the separate test embeddings and compare with the true labels.
y_pred = knn.predict(test_embeddings)
accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)

print(f"KNN classifier testing accuracy: {accuracy * 100:.2f}%")


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



KNN classifier testing accuracy: 13.63%


### Train Accuracy (held-out 20% of the training embeddings)

In [8]:
# Hold out 20% of the *training* embeddings and measure how well a KNN fit on
# the remaining 80% recovers their clone ids.
# NOTE(review): the number printed below is accuracy on that held-out slice,
# not accuracy on the cells the classifier was fit on.
# The split is stratified on the clone label so each clone keeps the same
# 80/20 proportion in both partitions; clone sizes are imbalanced, so a plain
# random split can under-represent the small clones.
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

# Same classifier settings as the test-accuracy cell (5 neighbours).
# NOTE: this rebinds `knn`, `y_pred` and `accuracy`, replacing the values
# produced by the test-accuracy cell.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict clone ids for the held-out slice of the training embeddings.
y_pred = knn.predict(X_test)

# Fraction of held-out cells whose predicted clone id matches the true one.
accuracy = accuracy_score(y_test, y_pred)

print(f"KNN classifier training accuracy: {accuracy * 100:.2f}%")


KNN classifier training accuracy: 100.00%
