In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from itertools import product

In [2]:
adata=sc.read_h5ad("level3_cp_ctrl_pred_tissue_filtered.h5ad")
# adata
df=pd.DataFrame(adata.obs)
# df
cell_counts=df['cell'].value_counts()
cmap_counts=df['cmap_name'].value_counts()
filtered_cells = cell_counts[cell_counts >= 600].index.tolist()
filtered_cmaps = cmap_counts[cmap_counts >= 1].index.tolist()
print(len(filtered_cells))
print(len(filtered_cmaps))

combinations = list(product(filtered_cells, filtered_cmaps))
print(len(combinations))

26
4445
115570


In [4]:
cmap_counts

cmap_name
ibrutinib             96
masitinib             91
PD-0325901            90
tozasertib            88
gefitinib             87
                      ..
BG-1027                1
androstanol            1
SA-25547               1
BG-1025                1
piperonyl-butoxide     1
Name: count, Length: 4445, dtype: int64

In [5]:
unpert_df=pd.DataFrame(adata.layers['unpert_expr'],index=adata.obs['cell'],columns=adata.var_names)

In [6]:

if unpert_df.index.duplicated().any():
    unpert_df = unpert_df.groupby(unpert_df.index).mean()

combinations = list(product(filtered_cells, filtered_cmaps))

cell_drug_names = [f"{cell}_{drug}" for cell, drug in combinations]

unpert_expand_df = pd.DataFrame(index=cell_drug_names, columns=unpert_df.columns)


for cell_drug in cell_drug_names:
    parts = cell_drug.split("_", 1)
    
    if len(parts) == 2:
        cell, drug = parts
        if cell in unpert_df.index:
            unpert_expand_df.loc[cell_drug] = unpert_df.loc[cell].values
    else:
        print(f"Skipping invalid cell_drug format: {cell_drug}")


unpert_expand_df.reset_index(inplace=True)
unpert_expand_df.rename(columns={"index": "cell_drug"}, inplace=True)

print(unpert_expand_df.head())


        cell_drug   GNPDA1      CDH3     HDAC6     PARP2    MAMLD1     DNAJB6  \
0   PC3_ibrutinib  9.74503  8.127631  8.394389  9.760157  4.900449  10.497885   
1   PC3_masitinib  9.74503  8.127631  8.394389  9.760157  4.900449  10.497885   
2  PC3_PD-0325901  9.74503  8.127631  8.394389  9.760157  4.900449  10.497885   
3  PC3_tozasertib  9.74503  8.127631  8.394389  9.760157  4.900449  10.497885   
4   PC3_gefitinib  9.74503  8.127631  8.394389  9.760157  4.900449  10.497885   

       SMC4     ABCC5    ABCB6  ...    NCAPD2      PAN2   LPGAT1     KIF14  \
0  9.990736  5.072751  7.42981  ...  8.773877  5.008732  8.99127  7.927429   
1  9.990736  5.072751  7.42981  ...  8.773877  5.008732  8.99127  7.927429   
2  9.990736  5.072751  7.42981  ...  8.773877  5.008732  8.99127  7.927429   
3  9.990736  5.072751  7.42981  ...  8.773877  5.008732  8.99127  7.927429   
4  9.990736  5.072751  7.42981  ...  8.773877  5.008732  8.99127  7.927429   

     CDC25A    CDC25B     OXSR1       MVP   

In [7]:
import anndata as ad

In [8]:
split_cols = unpert_expand_df['cell_drug'].str.split('_', n=1, expand=True)

unpert_expand_df[['cell', 'drug']] = split_cols

data_matrix = unpert_expand_df.drop(columns=['cell_drug'])

In [9]:
bdata = ad.AnnData(X=data_matrix.iloc[:, :-2].values) 
bdata.obs = data_matrix[['cell', 'drug']].reset_index(drop=True)

data_matrix['cell'] = data_matrix['cell'].astype(str)
data_matrix['drug'] = data_matrix['drug'].astype(str)

In [10]:

print(bdata.X.dtype)


bdata.X = bdata.X.astype(float)

object


In [11]:
import pandas as pd

compound_file = r'/path/to/l1000/compoundinfo_beta.txt'  
compound_df = pd.read_csv(compound_file, sep='\t', header=0)  


moa_mapping = dict(zip(compound_df['cmap_name'], compound_df['moa']))  
target_mapping = dict(zip(compound_df['cmap_name'], compound_df['target']))  
pert_id_mapping = dict(zip(compound_df['cmap_name'], compound_df['pert_id']))  
smiles_mapping = dict(zip(compound_df['cmap_name'], compound_df['canonical_smiles']))  
compound_aliases_mapping = dict(zip(compound_df['cmap_name'], compound_df['compound_aliases']))  
inchi_key_mapping = dict(zip(compound_df['cmap_name'], compound_df['inchi_key']))  


bdata.obs['moa'] = bdata.obs['drug'].map(moa_mapping)
bdata.obs['target'] = bdata.obs['drug'].map(target_mapping)
bdata.obs['pert_id'] = bdata.obs['drug'].map(pert_id_mapping)
bdata.obs['smiles'] = bdata.obs['drug'].map(smiles_mapping)
bdata.obs['compound_aliases'] = bdata.obs['drug'].map(compound_aliases_mapping)
bdata.obs['inchi_key'] = bdata.obs['drug'].map(inchi_key_mapping)


bdata.layers['unpert_expr'] = bdata.X.copy()


bdata = bdata[~bdata.obs['smiles'].isna()]


In [None]:
bdata.write_h5ad('test_expand_cp.h5ad')