In [2]:
import os

import anndata as ad
import pandas as pd
import numpy as np

import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import calinski_harabasz_score

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Data root. Defaults to the original local path, but can be redirected with the
# KB_DATA_DIR environment variable so the notebook is runnable on other machines.
# Kept as a plain string so any code that concatenates onto `input_dir` still works.
input_dir = os.environ.get("KB_DATA_DIR", "/Users/apple/Desktop/KB/data")
split_dir = os.path.join(input_dir, "LarryData", "train_test")

# Pre-split train / test AnnData objects.
adata_train = ad.read_h5ad(os.path.join(split_dir, "Larry_train.h5ad"))
adata_test = ad.read_h5ad(os.path.join(split_dir, "Larry_test.h5ad"))

# One clone id per observation; these are the supervision labels for UMAP and
# the class labels for the KNN classifier further down.
train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()
print(train_labels.shape, test_labels.shape)

(37207,) (3886,)


## Supervised UMAP (supUMAP) embedding

In [3]:
# Data matrices (observations x variables) of the train and test splits.
train_data = adata_train.X
test_data = adata_test.X

# UMAP is stochastic: without a fixed seed the embedding, and every score
# computed from it, changes on each run. Pin the seed so results are reproducible.
# Trade-off: umap-learn runs single-threaded when random_state is set, so the
# fit may be slower.
UMAP_SEED = 42

# n_neighbors=15 is the umap-learn default; n_components=10 is the dimensionality
# of the embedding the downstream scores and KNN classifier operate on.
# The supervision itself is supplied at fit time by passing the labels as `y`.
reducer = umap.UMAP(n_neighbors=15, n_components=10, random_state=UMAP_SEED)



In [4]:
# Fit UMAP on the training matrix and return the training embedding.
# Passing the clone labels as `y` is what makes this fit supervised.
train_embeddings = reducer.fit_transform(train_data, y=train_labels)

In [5]:
# Project the test data with the reducer fitted on the training set.
# No labels are passed here, so the test clone ids are not used for the projection.
test_embeddings = reducer.transform(test_data)

In [6]:
# Sanity check: the row counts should match the label arrays printed earlier,
# and the column count should equal the reducer's n_components (10).
test_embeddings.shape, train_embeddings.shape

((3886, 10), (37207, 10))

In [4]:
# On-disk cache for the embeddings, so the UMAP fit does not have to be repeated
# in every session.
TEST_EMBEDDING_PATH = "supUMAP_larry_top500_test_embedding.npy"
TRAIN_EMBEDDING_PATH = "supUMAP_larry_top500_train_embedding.npy"

try:
    # Load into temporaries first: if only one of the two files is present, the
    # in-memory embeddings must not be partially overwritten.
    cached_test = np.load(TEST_EMBEDDING_PATH)
    cached_train = np.load(TRAIN_EMBEDDING_PATH)
except FileNotFoundError:
    # No complete cache yet: persist the embeddings computed in this session.
    # (Raises NameError if the UMAP cells were skipped -- there is then nothing
    # to save and nothing to load.)
    np.save(TEST_EMBEDDING_PATH, test_embeddings)
    np.save(TRAIN_EMBEDDING_PATH, train_embeddings)
else:
    # A complete cache exists and takes precedence over anything computed in
    # this session, as in the original cell. Delete the .npy files to refresh it.
    test_embeddings, train_embeddings = cached_test, cached_train

# Guard against a stale cache produced from a different train/test split.
assert len(train_embeddings) == len(train_labels), "train embeddings do not match train labels"
assert len(test_embeddings) == len(test_labels), "test embeddings do not match test labels"


#### Calinski-Harabasz score

In [8]:
# Cluster-separation metric on the training embedding, with each clone id
# treated as one cluster.
score = calinski_harabasz_score(X=train_embeddings, labels=train_labels)
print("Train Calinski-Harabasz Score:", score)


Train Calinski-Harabasz Score: 35.993721217977935


## KNN classifier

In [9]:
# Number of training observations per clone (value_counts sorts largest first).
# The recorded output shows 2813 clones with between 5 and 159 members each,
# i.e. the KNN classifier below faces many small classes.
adata_train.obs["clone_id"].value_counts()

clone_id
1261    159
2370    148
5714    126
292     120
5209    116
       ... 
1935      5
2216      5
5467      5
4404      5
4329      5
Name: count, Length: 2813, dtype: int64

### Test Accuracy

In [10]:

# 5-nearest-neighbour classifier fitted on the full training embedding
# (sklearn's fit() returns the estimator itself, so construction and fitting
# can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, train_labels)

# Predicted clone ids for the projected test observations.
y_pred = knn.predict(test_embeddings)

# Fraction of test observations whose predicted clone id is exactly right.
accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)
print(f"KNN classifier testing accuracy: {accuracy * 100:.2f}%")


KNN classifier testing accuracy: 1.60%


### Train Accuracy (held-out 20% split of the training embeddings)

In [5]:
# Hold out 20% of the *training* embeddings (fixed seed) for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# 5-nearest-neighbour classifier fitted on the remaining 80%
# (fit() returns the estimator, so it can be chained onto the constructor).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted clone ids for the held-out 20%.
y_pred = knn.predict(X_test)

# NOTE(review): the message below says "training accuracy", but the score is
# measured on the held-out 20% of the training embeddings, not on the points
# the classifier was fitted on. If these embeddings come from the supervised
# UMAP fit, that fit saw the labels of the held-out points too -- confirm this
# is the intended metric.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"KNN classifier training accuracy: {accuracy * 100:.2f}%")


KNN classifier training accuracy: 3.99%
