## Split the data into day2, 4, 6

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

# Load your AnnData object
file_path = "/Users/apple/Desktop/KB/data/LarryData/Larry_41201_2000.h5ad"
adata = sc.read_h5ad(file_path)

# Filter AnnData based on a condition in a column in adata.obs
# For example, to filter cells where 'Time point' is 2
adata_day2 = adata[adata.obs['Time point'] == 2].copy()
adata_day4 = adata[adata.obs['Time point'] == 4].copy()
adata_day6 = adata[adata.obs['Time point'] == 6].copy()
# filtered_adata now contains only the subset of cells where 'Time point' equals 2


In [2]:
adata.obs["clone_id"].value_counts()

clone_id
1261    177
2370    165
5714    142
292     134
5209    130
       ... 
5194      5
2320      5
5170      5
3007      5
4329      5
Name: count, Length: 2817, dtype: int64

In [3]:
clone_id_counts = adata_day2.obs['clone_id'].value_counts()

# Find the 'clone_id's that occur more than once
clone_ids_to_keep = clone_id_counts[clone_id_counts > 1].index

# Filter the AnnData object to keep only rows with 'clone_id' that occur more than once
adata_day2_filtered = adata_day2[adata_day2.obs['clone_id'].isin(clone_ids_to_keep)]

# adata_day2_filtered.obs


In [4]:
adata_day2.obs.shape, adata_day2_filtered.obs.shape

((1756, 9), (1076, 9))

In [6]:
clone_id_counts_4 = adata_day4.obs['clone_id'].value_counts()

# Find the 'clone_id's that occur more than once
clone_ids_to_keep_4 = clone_id_counts_4[clone_id_counts_4 > 1].index

# Filter the AnnData object to keep only rows with 'clone_id' that occur more than once
adata_day4_filtered = adata_day4[adata_day4.obs['clone_id'].isin(clone_ids_to_keep_4)]

# adata_day4_filtered.obs


In [7]:
adata_day4.obs.shape, adata_day4_filtered.obs.shape

((12284, 9), (11874, 9))

In [8]:
clone_id_counts_6 = adata_day6.obs['clone_id'].value_counts()

# Find the 'clone_id's that occur more than once
clone_ids_to_keep_6 = clone_id_counts_6[clone_id_counts_6 > 1].index

# Filter the AnnData object to keep only rows with 'clone_id' that occur more than once
adata_day6_filtered = adata_day6[adata_day6.obs['clone_id'].isin(clone_ids_to_keep_6)]

# adata_day6_filtered.obs


In [9]:
adata_day6.obs.shape, adata_day6_filtered.obs.shape

((27161, 9), (26981, 9))

In [10]:
adata_day2.write('LarrayData_day2.h5ad')
adata_day2_filtered.write('LarrayData_day2_filitered.h5ad')
adata_day4.write('LarrayData_day4.h5ad')
adata_day4_filtered.write('LarrayData_day4_filitered.h5ad')
adata_day6.write('LarrayData_day6.h5ad')
adata_day6_filtered.write('LarrayData_day6_filitered.h5ad')

## Simulated Data

### Setting A (No information)

2000 lineages (with each lineage having same proportion of cell types)

In [8]:
# Group by 'Cell type annotation' and count the number of cells in each group
cell_type_counts = adata.obs.groupby('Cell type annotation').size()
print(cell_type_counts)


Cell type annotation
Baso                 5092
Ccr7_DC                39
Eos                   149
Erythroid             316
Lymphoid               78
Mast                 1255
Meg                   831
Monocyte             7356
Neutrophil           7582
Undifferentiated    18472
pDC                    31
dtype: int64


  cell_type_counts = adata.obs.groupby('Cell type annotation').size()


In [9]:
# Make a copy of the original AnnData object
adata_copy = adata.copy()

# Reset the index of adata_copy.obs to ensure it's a simple range index
adata_copy.obs.reset_index(drop=True, inplace=True)

# Convert the sparse matrix adata_copy.X to a dense numpy array
dense_X = adata_copy.X.toarray()

# Extract unique cell types
cell_types = adata_copy.obs['Cell type annotation'].unique()
n_groups = 2000

# Create a DataFrame to store 'Cell type annotation' and 'clone_id'
obs_data = {'Cell type annotation': [], 'clone_id': [], 'original_index': []}

# Randomly assign each cell to one of 2000 groups ensuring similar proportions
np.random.seed(42)  # For reproducibility
group_assignments = np.arange(n_groups) + 1  # Groups numbered from 1 to 2000

for cell_type in cell_types:
    cell_indices = adata_copy.obs.index[adata_copy.obs['Cell type annotation'] == cell_type].tolist()
    # Calculate number of cells per group for this cell type
    cells_per_group = np.ones(n_groups, dtype=int) * (len(cell_indices) // n_groups)
    cells_per_group[:len(cell_indices) % n_groups] += 1
    np.random.shuffle(cells_per_group)
    
    # Assign cells to groups
    start_idx = 0
    for group, n_cells in zip(group_assignments, cells_per_group):
        end_idx = start_idx + n_cells
        obs_data['Cell type annotation'].extend([cell_type] * n_cells)
        obs_data['clone_id'].extend([group] * n_cells)
        obs_data['original_index'].extend(cell_indices[start_idx:end_idx])
        start_idx = end_idx

# Shuffle the entire dataset to mix up the groups
shuffled_indices = np.random.permutation(len(obs_data['Cell type annotation']))
shuffled_cell_types = np.array(obs_data['Cell type annotation'])[shuffled_indices]
shuffled_clone_ids = np.array(obs_data['clone_id'])[shuffled_indices]
original_indices = np.array(obs_data['original_index'])[shuffled_indices].astype(int)

# Create the new AnnData object with the shuffled data
adata_setA = ad.AnnData(
    obs=pd.DataFrame({
        'Cell type annotation': shuffled_cell_types,
        'clone_id': shuffled_clone_ids
    }),
    X=dense_X[original_indices]  # Index into the dense array to align with shuffled obs
)



In [10]:
# Group the data by 'clone_id' and count the number of cells in each group
group_sizes = adata_setA.obs.groupby('clone_id').size()

# Calculate the mean and standard deviation of the group sizes
mean_cells_per_group = group_sizes.mean()
std_dev_cells_per_group = group_sizes.std()

print("Mean number of cells per group:", mean_cells_per_group)
print("Standard deviation of cells per group:", std_dev_cells_per_group)


Mean number of cells per group: 20.6005
Standard deviation of cells per group: 1.2580505139094273


In [11]:
adata_setA.write('LarryData_setA.h5ad')

### Setting B (lineage = cell type)

In [50]:
from sklearn.cluster import KMeans

adata_copy = adata.copy()
adata_copy.obs.reset_index(drop=True, inplace=True)  

dense_X = adata_copy.X


# Prepare data frame for new AnnData
obs_data = {'Cell type annotation': [], 'clone_id': [], 'indices': []}

# Filter cell types with more than 20 cells
cell_counts = adata_copy.obs['Cell type annotation'].value_counts()
valid_types = cell_counts[cell_counts >= 20].index

# Initialize clone_id
clone_id = 0

for cell_type in valid_types:
    # Get indices for current cell type
    indices = adata_copy.obs[adata_copy.obs['Cell type annotation'] == cell_type].index
    # Data subset for K-Means
    data_subset = dense_X[indices]
    
    # Calculate the number of clusters
    n_clusters = max(1, len(indices) // 20)  # Ensure at least one cluster

    # K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(data_subset)

    # Resample clusters to have exactly 20 cells and assign new cell type annotation
    for i in range(n_clusters):
        cluster_indices = indices[cluster_labels == i]
        new_cell_type = f"{cell_type}_{i+1}"  # Unique cell type annotation for each cluster

        if len(cluster_indices) != 20:
            if len(cluster_indices) < 20:
                # If less than 20, randomly sample with replacement
                resampled_indices = np.random.choice(cluster_indices, size=20, replace=True)
            else:
                # If more than 20, randomly sample without replacement
                resampled_indices = np.random.choice(cluster_indices, size=20, replace=False)
        else:
            resampled_indices = cluster_indices

        for idx in resampled_indices:
            obs_data['Cell type annotation'].append(new_cell_type)
            obs_data['clone_id'].append(clone_id)
            obs_data['indices'].append(idx)  # Store the actual data indices

        clone_id += 1  # Increment clone_id for the next cluster

    # Stop if we reach 2000 groups
    if clone_id >= 2000:
        break

# Create the new AnnData object using the correct indices
new_indices = [int(i) for i in obs_data['indices']]  # Use actual indices
adata_setB = ad.AnnData(
    obs=pd.DataFrame({'Cell type annotation': obs_data['Cell type annotation'], 'clone_id': obs_data['clone_id']}),
    X=dense_X[new_indices]  # Use the correct indices to set X
)




In [53]:
adata_setB.obs

Unnamed: 0,Cell type annotation,clone_id
0,Undifferentiated_1,0
1,Undifferentiated_1,0
2,Undifferentiated_1,0
3,Undifferentiated_1,0
4,Undifferentiated_1,0
...,...,...
40515,Meg_41,2025
40516,Meg_41,2025
40517,Meg_41,2025
40518,Meg_41,2025


In [56]:
# Group the data by 'clone_id' and count the number of cells in each group
group_sizes = adata_setB.obs.groupby('clone_id').size()

# Calculate the mean and standard deviation of the group sizes
mean_cells_per_group = group_sizes.mean()
std_dev_cells_per_group = group_sizes.std()

print("Mean number of cells per group:", mean_cells_per_group)
print("Standard deviation of cells per group:", std_dev_cells_per_group)


Mean number of cells per group: 20.0
Standard deviation of cells per group: 0.0


In [57]:
adata_setB.write('LarryData_setB.h5ad')

### Generate the simulated data with full info ()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

# Assuming you have already loaded your AnnData object and converted the obs and X as shown previously
file_path = "/Users/apple/Desktop/KB/SCSeq_LineageBarcoding2/SCSeq_LineageBarcoding/SCLineage_ConstrativeLearning/out/data/Larry_41201_2000.h5ad"
adata = sc.read_h5ad(file_path)
adata_df = pd.DataFrame(adata.obs)
count_matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X

# Ensure that the DataFrame index aligns with the count matrix rows
adata_df['index'] = range(adata_df.shape[0])

# Group the indices by 'clone_id'
grouped_indices = adata_df.groupby('clone_id')['index'].apply(list)

# Compute the centroid for each group
centroids = {}
for clone_id, indices in grouped_indices.items():
    # Select the rows corresponding to the current group's indices
    group_data = count_matrix[indices]
    # Compute the mean of these rows to get the centroid
    centroid = np.mean(group_data, axis=0)
    centroids[clone_id] = centroid

# Optionally, convert centroids to a DataFrame for easier manipulation and viewing
centroids_df = pd.DataFrame.from_dict(centroids, orient='index')
centroids_df.shape


In [None]:
adata_df['Time point']

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

# Assuming `centroids` and noise generation from previous examples
num_new_points = 20
noise_scale = 0.003
simulate_data = []

# Metadata storage
obs_data = []

for clone_id, centroid in centroids.items():
    # Generate noisy data points
    repeated_centroids = np.tile(centroid, (num_new_points, 1))
    noise = np.random.normal(loc=0.0, scale=noise_scale, size=repeated_centroids.shape)
    noisy_data = repeated_centroids + noise
    simulate_data.append(noisy_data)

    # Generate metadata for each noisy data point
    for _ in range(num_new_points):
        obs_data.append({'clone_id': clone_id, 'Cell type annotation': np.random.randint(9), 'fake_data2': np.random.randint(100)})

# Convert simulate data into a numpy array
simulate_data_array = np.vstack(simulate_data)

# Create a DataFrame from the metadata list
obs_df = pd.DataFrame(obs_data)

# Create an AnnData object
adata_simulate = AnnData(X=simulate_data_array, obs=obs_df)

# Print information about the new AnnData object



In [None]:
adata_simulate.obs.head()

In [None]:
simulate_data_array.shape

In [None]:
adata_simulate.write('simulated_data_fullInfo.h5ad')

In [None]:
type(adata_simulate)