#### Adding the clone ID (lineage) from the L1-normalized LARRY data to the raw LARRY data

In [1]:
import numpy as np
import anndata as ad
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
import scanpy as sc
import scipy

In [2]:
# Directory holding the LARRY .h5ad inputs. Both reads below go through this one
# constant, so pointing the notebook at another copy of the data is a one-line edit.
LARRY_DATA_DIR = "/Users/apple/Desktop/KB/data/LarryData"

# Dataset that will receive the clone ids.
adata_raw = ad.read_h5ad(f"{LARRY_DATA_DIR}/larry_invitro_adata_sub_raw.h5ad")
# Store an integer copy of `time_info` under the "Time point" column name.
adata_raw.obs["Time point"] = adata_raw.obs["time_info"].astype(int)
adata_raw.obs["Starting population"] = adata_raw.obs["Starting population"].astype('category')
adata_raw.obs['Well'] = adata_raw.obs['Well'].astype(int)

# L1-normalized dataset; its obs table is the source of `clone_id` in the merge below.
adata_lineage = ad.read_h5ad(f"{LARRY_DATA_DIR}/larry_L1norm.h5ad")

In [3]:
# Show (n_obs, n_vars) for both datasets side by side.
(adata_raw.shape, adata_lineage.shape)

((49302, 23420), (130887, 25289))

In [4]:
# List the obs (per-cell annotation) column names of the raw dataset.
adata_raw.obs_keys()

['Library',
 'Cell barcode',
 'time_info',
 'Starting population',
 'state_info',
 'Well',
 'SPRING-x',
 'SPRING-y',
 'Time point']

In [5]:
# List the obs (per-cell annotation) column names of the lineage dataset.
adata_lineage.obs_keys()

['Library',
 'Cell barcode',
 'Time point',
 'Starting population',
 'Cell type annotation',
 'Well',
 'SPRING-x',
 'SPRING-y',
 'clone_id']

In [6]:
# Show the dtype of every obs column in both datasets, so that columns used as
# join keys can be compared before merging.
print("Data types in adata_raw.obs:", adata_raw.obs.dtypes, sep="\n")
print("\nData types in adata_lineage.obs:", adata_lineage.obs.dtypes, sep="\n")


Data types in adata_raw.obs:
Library                category
Cell barcode           category
time_info               float64
Starting population    category
state_info             category
Well                      int64
SPRING-x                float64
SPRING-y                float64
Time point                int64
dtype: object

Data types in adata_lineage.obs:
Library                 category
Cell barcode            category
Time point                 int64
Starting population     category
Cell type annotation    category
Well                       int64
SPRING-x                 float64
SPRING-y                 float64
clone_id                   int64
dtype: object


In [7]:
def _spring_pairs(obs):
    """Return the set of distinct (SPRING-x, SPRING-y) tuples in an obs table."""
    return set(zip(obs['SPRING-x'], obs['SPRING-y']))


# Distinct coordinate pairs in each dataset.
spring_raw = _spring_pairs(adata_raw.obs)
spring_lineage = _spring_pairs(adata_lineage.obs)

# Coordinate pairs that occur in both datasets.
intersection = spring_raw & spring_lineage

# Report how many distinct pairs each dataset has and how many they share.
print(f"Number of unique (SPRING-x, SPRING-y) pairs in adata_raw: {len(spring_raw)}")
print(f"Number of unique (SPRING-x, SPRING-y) pairs in adata_lineage: {len(spring_lineage)}")
print(f"Number of intersecting pairs: {len(intersection)}")


Number of unique (SPRING-x, SPRING-y) pairs in adata_raw: 49289
Number of unique (SPRING-x, SPRING-y) pairs in adata_lineage: 130830
Number of intersecting pairs: 49289


In [8]:
# Count the distinct ('Cell barcode', 'SPRING-x', 'SPRING-y') triples among the
# raw cells; a count equal to the number of cells means the triple identifies
# each raw cell uniquely.
_raw_obs = adata_raw.obs
barcode_spring_set = set(
    zip(_raw_obs['Cell barcode'], _raw_obs['SPRING-x'], _raw_obs['SPRING-y'])
)

print(f"Number of unique ('Cell barcode', 'SPRING-x', 'SPRING-y') tuples: {len(barcode_spring_set)}")


Number of unique ('Cell barcode', 'SPRING-x', 'SPRING-y') tuples: 49302


In [9]:
# Round the SPRING coordinates of both datasets to 4 decimals, in place.
# Order of updates: raw x, raw y, lineage x, lineage y.
for _adata in (adata_raw, adata_lineage):
    for _coord in ('SPRING-x', 'SPRING-y'):
        _adata.obs[_coord] = _adata.obs[_coord].round(4)


In [10]:
# Attach `clone_id` from the lineage dataset to the raw cells, matching cells on
# barcode plus SPRING coordinates.
# how='left' keeps every raw cell in its original order; a raw cell with no
# match in the lineage table gets NaN in `clone_id`.
# The result has a fresh 0..n-1 index rather than the obs index of adata_raw.
merged_df = pd.merge(
    adata_raw.obs,
    adata_lineage.obs[['Cell barcode', 'SPRING-x', 'SPRING-y', 'clone_id']],
    on=['Cell barcode', 'SPRING-x', 'SPRING-y'],
    how='left',
)

# A left merge emits one extra row for every additional right-hand row that
# shares a key with a raw cell. Because the merged index no longer matches
# adata_raw.obs, `clone_id` can only be copied back by position, so the row
# count must be exactly the number of raw cells. Fail here with a clear message
# instead of letting a later cell break or misalign.
if len(merged_df) != len(adata_raw.obs):
    raise ValueError(
        f"Merge produced {len(merged_df)} rows for {len(adata_raw.obs)} raw cells; "
        "the join key ('Cell barcode', 'SPRING-x', 'SPRING-y') matches multiple "
        "rows in adata_lineage.obs."
    )

In [11]:
# Display the merged table to inspect the `clone_id` column that was joined in.
merged_df

Unnamed: 0,Library,Cell barcode,time_info,Starting population,state_info,Well,SPRING-x,SPRING-y,Time point,clone_id
0,d6_2_2,GCGTGCAA-AGAAGTTA,6.0,Lin-Kit+Sca1-,Undifferentiated,2,411.496,-96.190,6,573
1,d6_2_2,AAGGGACC-CTCGATGC,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-587.462,-306.925,6,1440
2,d6_2_2,CGTACCGA-AGCGCCTT,6.0,Lin-Kit+Sca1-,Monocyte,2,1429.805,-429.300,6,394
3,d6_2_2,CCGTAGCT-AGGCAGTT,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-1169.594,362.010,6,1972
4,d6_2_2,TAGGCTTT-CGTGGATA,6.0,Lin-Kit+Sca1-,Neutrophil,2,1236.039,-2113.013,6,1450
...,...,...,...,...,...,...,...,...,...,...
49297,LSK_d6_1_3,AGGCGTTT-CCTATTCA,6.0,Lin-Kit+Sca1+,Undifferentiated,1,-1074.917,441.981,6,12
49298,LSK_d6_1_3,CGTGCAAC-TGGCCTGT,6.0,Lin-Kit+Sca1+,Undifferentiated,1,247.634,281.262,6,267
49299,LSK_d6_1_3,CCCTTGCA-AGTGAAAG,6.0,Lin-Kit+Sca1+,Neutrophil,1,353.734,-980.246,6,3534
49300,LSK_d6_1_3,TCCGCTGT-GACAGATA,6.0,Lin-Kit+Sca1+,Monocyte,1,2513.013,-324.061,6,4945


In [12]:
# Copy the merged clone ids into a plain NumPy array, dropping the merge's
# index so the values can be assigned to adata_raw.obs by position.
clone_id = merged_df["clone_id"].to_numpy(copy=True)
# Length check: should equal the number of cells in adata_raw.
clone_id.shape

(49302,)

In [13]:
# Store the clone ids on the raw dataset. The array is assigned by position,
# which relies on the left merge having kept the raw cell order.
adata_raw.obs["clone_id"] = clone_id
# Display the updated obs table.
adata_raw.obs

Unnamed: 0,Library,Cell barcode,time_info,Starting population,state_info,Well,SPRING-x,SPRING-y,Time point,clone_id
d6_2_2:GCGTGCAAAGAAGTTA,d6_2_2,GCGTGCAA-AGAAGTTA,6.0,Lin-Kit+Sca1-,Undifferentiated,2,411.496,-96.190,6,573
d6_2_2:AAGGGACCCTCGATGC,d6_2_2,AAGGGACC-CTCGATGC,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-587.462,-306.925,6,1440
d6_2_2:CGTACCGAAGCGCCTT,d6_2_2,CGTACCGA-AGCGCCTT,6.0,Lin-Kit+Sca1-,Monocyte,2,1429.805,-429.300,6,394
d6_2_2:CCGTAGCTAGGCAGTT,d6_2_2,CCGTAGCT-AGGCAGTT,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-1169.594,362.010,6,1972
d6_2_2:TAGGCTTTCGTGGATA,d6_2_2,TAGGCTTT-CGTGGATA,6.0,Lin-Kit+Sca1-,Neutrophil,2,1236.039,-2113.013,6,1450
...,...,...,...,...,...,...,...,...,...,...
LSK_d6_1_3:AGGCGTTTCCTATTCA,LSK_d6_1_3,AGGCGTTT-CCTATTCA,6.0,Lin-Kit+Sca1+,Undifferentiated,1,-1074.917,441.981,6,12
LSK_d6_1_3:CGTGCAACTGGCCTGT,LSK_d6_1_3,CGTGCAAC-TGGCCTGT,6.0,Lin-Kit+Sca1+,Undifferentiated,1,247.634,281.262,6,267
LSK_d6_1_3:CCCTTGCAAGTGAAAG,LSK_d6_1_3,CCCTTGCA-AGTGAAAG,6.0,Lin-Kit+Sca1+,Neutrophil,1,353.734,-980.246,6,3534
LSK_d6_1_3:TCCGCTGTGACAGATA,LSK_d6_1_3,TCCGCTGT-GACAGATA,6.0,Lin-Kit+Sca1+,Monocyte,1,2513.013,-324.061,6,4945


In [14]:
# Save the raw dataset, now carrying `clone_id`, as an .h5ad file in the
# current working directory.
adata_raw.write_h5ad("Larry_Raw_w_Clone_id.h5ad")