In [10]:
import os

import anndata as ad
import pandas as pd
import numpy as np

import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import calinski_harabasz_score

import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Root data directory. Set the KB_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original location.
input_dir = os.environ.get("KB_DATA_DIR", "/Users/apple/Desktop/KB/data")

# AnnData objects holding the train and test cells.
adata_train = ad.read_h5ad(input_dir+'/LarryData/train_test/Larry_200_train.h5ad')
adata_test = ad.read_h5ad(input_dir+'/LarryData/train_test/Larry_200_test.h5ad')

# Per-cell clone id, used as the class / cluster label in every cell below.
train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()

# Precomputed embeddings, one row per cell.
# NOTE(review): file names suggest scVI embeddings of the same cells, in the
# same order as the .h5ad files -- confirm against the export script.
train_embeddings = np.load(input_dir+'/feat_RECOMB/train_test/larry_top200_train_test/Larry_scvi_train_200_embeddings.npy')
test_embeddings = np.load(input_dir+'/feat_RECOMB/train_test/larry_top200_train_test/Larry_scvi_test_200_embeddings.npy')

# Row i of an embedding matrix is paired with label i by every metric below,
# so a row-count mismatch must fail here rather than produce a silent,
# meaningless score later.
for split_name, split_embeddings, split_labels in (
    ("train", train_embeddings, train_labels),
    ("test", test_embeddings, test_labels),
):
    if split_embeddings.shape[0] != split_labels.shape[0]:
        raise ValueError(
            f"{split_name}: {split_embeddings.shape[0]} embedding rows "
            f"but {split_labels.shape[0]} labels"
        )

print(train_labels.shape, test_labels.shape)
print(train_embeddings.shape, test_embeddings.shape)

(10148,) (1225,)
(10148, 10) (1225, 10)


#### Calinski-Harabasz score

In [12]:
# Calinski-Harabasz index of the training embeddings, using the clone ids as
# the cluster assignment.
score = calinski_harabasz_score(X=train_embeddings, labels=train_labels)
print("Calinski-Harabasz Score:", score)


Calinski-Harabasz Score: 30.430285075483706


In [13]:
# Calinski-Harabasz index of the test embeddings, using the clone ids as the
# cluster assignment.
score = calinski_harabasz_score(X=test_embeddings, labels=test_labels)
print("Test Calinski-Harabasz Score:", score)


Test Calinski-Harabasz Score: 4.809966098849501


## KNN classifier

In [14]:
# Number of training cells per clone, largest clones first.
adata_train.obs.loc[:, "clone_id"].value_counts()

clone_id
1261    159
2370    148
5714    126
292     120
5209    116
       ... 
1691     31
2115     30
998      30
1998     30
922      30
Name: count, Length: 200, dtype: int64

### Test Accuracy

In [24]:

# Fit a 5-nearest-neighbour classifier on all training embeddings, then score
# it on the separately loaded test embeddings.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)

# Predicted clone id for every test cell
y_pred = knn.predict(test_embeddings)

# Fraction of test cells whose predicted clone matches the true clone
accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)
print(f"KNN classifier testing accuracy: {accuracy * 100:.2f}%")


KNN classifier testing accuracy: 11.10%


### Train Accuracy (held-out 20% of the training set)

In [16]:
# Hold out 20% of the training cells so the classifier is scored on cells it
# was not fitted on; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_embeddings, train_labels, test_size=0.2, random_state=42)

# Same 5-neighbour setting as the test-set evaluation
knn = KNeighborsClassifier(n_neighbors=5)

# Fit on the 80% portion only
knn.fit(X_train, y_train)

# Predict clone ids for the held-out 20%
y_pred = knn.predict(X_test)

# Fraction of held-out cells whose predicted clone matches the true clone
accuracy = accuracy_score(y_test, y_pred)

# The score is measured on the held-out split (X_test), not on the samples the
# model was fitted on, so it is labelled as hold-out accuracy.
print(f"KNN classifier training hold-out accuracy: {accuracy * 100:.2f}%")


KNN classifier training accuracy: 12.66%
