In [1]:
import numpy as np
import anndata as ad
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
import scanpy as sc
import scipy

In [2]:
# Directory holding the LARRY input file. This is a machine-specific absolute
# path; keeping it in one constant means only this line needs editing when the
# notebook is run elsewhere.
DATA_DIR = "/Users/apple/Desktop/KB/data/LarryData"

# Load the AnnData object that every later cell subsets and splits.
adata = ad.read_h5ad(f"{DATA_DIR}/Larry_41093_2000_norm_log.h5ad")

In [3]:
# Sanity check: dimensions of the loaded object, shown as (n_obs, n_vars).
adata.shape

(41093, 2000)

In [4]:
# Number of cells per distinct clone_id, sorted from the largest clone down.
adata.obs["clone_id"].value_counts()

clone_id
1261    177
2370    165
5714    141
292     134
5209    129
       ... 
513       5
5629      5
1014      5
3998      5
4329      5
Name: count, Length: 2813, dtype: int64

In [5]:
# Rank clones by size once; both subsets below are cut from this same ranking
# (value_counts() sorts by descending count).
clone_sizes = adata.obs["clone_id"].value_counts()

# Labels of the 200 and of the 500 most frequent clone_ids
top_clone_ids_200 = clone_sizes.head(200).index
top_clone_ids_500 = clone_sizes.head(500).index

# Keep only the cells whose clone_id is in the respective top-N set.
# .copy() gives each subset its own data instead of a view onto `adata`.
adata_200 = adata[adata.obs["clone_id"].isin(top_clone_ids_200)].copy()
adata_500 = adata[adata.obs["clone_id"].isin(top_clone_ids_500)].copy()

# Cell output: shapes of the two subsets
adata_200.shape, adata_500.shape

((11373, 2000), (19231, 2000))

In [6]:
# Cells per clone in the top-200 subset, largest clone first.
adata_200.obs["clone_id"].value_counts()

clone_id
1261    177
2370    165
5714    141
292     134
5209    129
       ... 
4870     35
2115     34
998      34
1998     34
922      34
Name: count, Length: 200, dtype: int64

In [7]:
# Cells per clone in the top-500 subset, largest clone first.
adata_500.obs["clone_id"].value_counts()

clone_id
1261    177
2370    165
5714    141
292     134
5209    129
       ... 
4371     21
2693     21
4846     21
151      21
1547     21
Name: count, Length: 500, dtype: int64

In [16]:
# Save both clone-filtered subsets as .h5ad files. The paths are relative, so
# the files are written to the kernel's current working directory.
# NOTE(review): this cell's execution count (16) is out of sequence with its
# neighbours - confirm the notebook still reproduces under Restart & Run All.
adata_200.write_h5ad('Larry_200.h5ad')
adata_500.write_h5ad('Larry_500.h5ad')

In [8]:
def split_adata_by_clone_id(adata, test_fraction=0.1, min_count=10, random_state=None):
    """
    Splits an AnnData object into training and test sets, clone by clone.

    For every clone with at least `min_count` cells, ceil(test_fraction * count)
    cells are drawn at random without replacement for the test set and the
    remaining cells go to the training set. Smaller clones go entirely to the
    training set. Cells whose 'clone_id' is missing are placed in neither set,
    because value_counts() and groupby() both skip missing values.

    Cells are selected by row position rather than by obs name, so the split
    also works when obs names are not unique. Both outputs are grouped by clone
    (largest clone first) and keep the original row order within each clone.

    Parameters:
        adata (AnnData): The AnnData object to split; `.obs` must contain a
            'clone_id' column.
        test_fraction (float): Fraction of each sufficiently large clone that is
            sent to the test set. Must lie strictly between 0 and 1.
        min_count (int): Smallest clone size from which test cells are drawn.
        random_state (int or None): Seed for a private random generator. With
            None (the default) NumPy's global random state is used, so a prior
            np.random.seed(...) call still controls the split.

    Returns:
        adata_train (AnnData): The training set.
        adata_test (AnnData): The test set.

    Raises:
        ValueError: If `test_fraction` is not strictly between 0 and 1.
    """
    if not 0 < test_fraction < 1:
        raise ValueError(
            f"test_fraction must be strictly between 0 and 1, got {test_fraction!r}"
        )

    # np.random (module) and RandomState both expose .choice(); falling back to
    # the module keeps the historical "global seed" behaviour when no seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Step 1: Clone sizes, largest first (this fixes the iteration order below)
    clone_column = adata.obs['clone_id']
    clone_id_counts = clone_column.value_counts()

    # Step 2: Row positions of every clone, gathered in a single pass instead of
    # re-scanning .obs once per clone. Grouping on a plain array avoids
    # categorical-specific groupby behaviour; sort=False because order is
    # dictated by clone_id_counts, not by the group keys.
    positions_by_clone = (
        pd.Series(np.arange(len(clone_column)))
        .groupby(clone_column.to_numpy(), sort=False)
        .indices
    )
    no_cells = np.empty(0, dtype=int)

    # Step 3: Decide, per clone, which row positions go to which set
    train_parts = []
    test_parts = []
    for clone_id, count in clone_id_counts.items():
        # .get() covers categories that are declared but have zero cells
        clone_positions = np.sort(positions_by_clone.get(clone_id, no_cells))

        if count >= min_count:
            # Randomly pick the test share of this clone
            test_size = int(np.ceil(test_fraction * count))
            test_positions = np.sort(
                rng.choice(clone_positions, size=test_size, replace=False)
            )
            test_parts.append(test_positions)

            # Everything not drawn stays in training (setdiff1d returns sorted output)
            train_parts.append(np.setdiff1d(clone_positions, test_positions))
        else:
            # Clone too small to spare test cells: all of it goes to training
            train_parts.append(clone_positions)

    train_positions = np.concatenate(train_parts) if train_parts else no_cells
    test_positions = np.concatenate(test_parts) if test_parts else no_cells

    # Step 4: Subset the original adata by position to create the two sets
    adata_train = adata[train_positions, :].copy()
    adata_test = adata[test_positions, :].copy()

    return adata_train, adata_test


In [9]:
# split_adata_by_clone_id draws its test cells from NumPy's global random
# state, so seed it here to make the train/test assignment repeatable after a
# kernel restart. Changing SPLIT_SEED changes which cells land in each set, but
# not how many.
SPLIT_SEED = 0
np.random.seed(SPLIT_SEED)

adata_200_train, adata_200_test = split_adata_by_clone_id(adata_200)
adata_500_train, adata_500_test = split_adata_by_clone_id(adata_500)

# Report (n_obs, n_vars) of each train/test pair
print(adata_200_train.shape, adata_200_test.shape)
print(adata_500_train.shape, adata_500_test.shape)

(10148, 2000) (1225, 2000)
(17054, 2000) (2177, 2000)


In [10]:
# Save the train/test splits of both subsets as .h5ad files. The paths are
# relative, so the files are written to the kernel's current working directory.
adata_200_train.write_h5ad('Larry_200_train.h5ad')
adata_200_test.write_h5ad('Larry_200_test.h5ad')

adata_500_train.write_h5ad('Larry_500_train.h5ad')
adata_500_test.write_h5ad('Larry_500_test.h5ad')

In [12]:
import math

In [15]:
# Natural logarithm of 30000.
# NOTE(review): nothing else in the visible notebook uses this value; it looks
# like a scratch calculation - confirm its purpose or remove the cell.
math.log(30000)

10.308952660644293