In [1]:
import os

import anndata as ad
import pandas as pd
import numpy as np

import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import calinski_harabasz_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root data directory. Defaults to the original location, but can be redirected
# with the KB_DATA_DIR environment variable so the notebook can run on another
# machine without editing this cell. Kept as a plain string.
input_dir = os.environ.get("KB_DATA_DIR", "/Users/apple/Desktop/KB/data")

# Train / test AnnData objects.
split_dir = os.path.join(input_dir, 'LarryData', 'train_test')
adata_train = ad.read_h5ad(os.path.join(split_dir, 'Larry_train.h5ad'))
adata_test = ad.read_h5ad(os.path.join(split_dir, 'Larry_test.h5ad'))

# Per-cell clone identity, used below as the class / cluster label.
train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()

# Precomputed embeddings stored as .npy arrays
# (file names suggest they come from scVI -- TODO confirm).
embedding_dir = os.path.join(input_dir, 'feat_RECOMB', 'train_test', 'larry_train_test')
train_embeddings = np.load(os.path.join(embedding_dir, 'Larry_scvi_train_embeddings.npy'))
test_embeddings = np.load(os.path.join(embedding_dir, 'Larry_scvi_test_embeddings.npy'))

# Labels and embeddings are read from separate files and paired row by row in
# every metric below, so fail here with a clear message if the counts disagree.
# NOTE(review): only the row *count* is checked; matching row *order* between
# the .h5ad and .npy files is assumed -- confirm against how they were exported.
assert len(train_embeddings) == len(train_labels), (
    f"train embeddings ({len(train_embeddings)}) and labels ({len(train_labels)}) differ in length"
)
assert len(test_embeddings) == len(test_labels), (
    f"test embeddings ({len(test_embeddings)}) and labels ({len(test_labels)}) differ in length"
)

print(train_labels.shape, test_labels.shape)
print(train_embeddings.shape, test_embeddings.shape)

(37207,) (3886,)
(37207, 10) (3886, 10)


#### Calinski-Harabasz score

In [3]:
# Calinski-Harabasz index of the training embeddings, treating each clone ID
# as one cluster.
score = calinski_harabasz_score(X=train_embeddings, labels=train_labels)
print("Train Calinski-Harabasz Score:", score)


Train Calinski-Harabasz Score: 12.27941375716528


In [4]:
# Calinski-Harabasz index of the test embeddings, treating each clone ID
# as one cluster.
score = calinski_harabasz_score(X=test_embeddings, labels=test_labels)
print("Test Calinski-Harabasz Score:", score)


Test Calinski-Harabasz Score: 3.23932990680996


## KNN classifier

In [4]:
# Number of training cells per clone, largest clone first.
train_clone_sizes = adata_train.obs["clone_id"].value_counts()
train_clone_sizes

clone_id
1261    159
2370    148
5714    126
292     120
5209    116
       ... 
1935      5
2216      5
5467      5
4404      5
4329      5
Name: count, Length: 2813, dtype: int64

### Test Accuracy

In [18]:

# Fit a 5-nearest-neighbour classifier on all training embeddings
# (fit() returns the estimator itself, so `knn` is the fitted model).
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)

# Predict a clone ID for every test cell and compare with the true clone IDs.
y_pred = knn.predict(test_embeddings)
accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)

print(f"KNN classifier testing accuracy: {accuracy * 100:.2f}%")


KNN classifier testing accuracy: 4.22%


### Train Accuracy (held-out 20% split of the training set)

In [8]:
# Hold out 20% of the *training* cells. The classifier is fitted on the other
# 80% and scored on this slice, so the number below is a held-out accuracy
# within the training set, not accuracy on the points the model was fitted to.
# NOTE(review): the split is not stratified by clone; with many small clones,
# passing stratify=train_labels may be worth considering -- it would change the
# reported number, so it is left as-is here.
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings, train_labels, test_size=0.2, random_state=42
)

# Same k as the test-set evaluation.
# NOTE: this rebinds `knn`, `y_pred` and `accuracy`, which the test-accuracy
# cell also assigns; whichever cell ran last determines their values.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict clone IDs for the held-out slice of the training set
# (X_test here is NOT the separate test set loaded earlier).
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Label corrected: the value is measured on held-out training cells.
print(f"KNN classifier held-out accuracy on the training set: {accuracy * 100:.2f}%")


KNN classifier training accuracy: 3.32%
