In [None]:
!pip install scanpy
!pip install datasets anndata scipy pandas pubchempy

Collecting pubchempy
  Downloading pubchempy-1.0.5-py3-none-any.whl.metadata (4.3 kB)
Downloading pubchempy-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.5


In [None]:
from datasets import load_dataset
from scipy.sparse import csr_matrix
import anndata
import pandas as pd
import pubchempy as pcp

## Generate Data from Tahoe-100M

In [None]:
def create_anndata_from_generator(generator, gene_vocab, sample_size=None):
    """Build an AnnData object from an iterable of per-cell records.

    Parameters
    ----------
    generator : iterable of dict
        Each record must provide ``'genes'`` (token ids) and ``'expressions'``
        (values aligned with ``'genes'``). Every other key of the record is
        copied into ``obs``.
    gene_vocab : dict
        Maps token id -> gene name. The columns of ``X`` follow the sorted
        token ids, and ``var`` is indexed by the matching names.
    sample_size : int, optional
        Maximum number of records to read; ``None`` reads the whole iterable.

    Returns
    -------
    anndata.AnnData
        ``X`` is a CSR matrix of shape ``(records read, len(gene_vocab))``,
        ``obs`` holds the remaining record fields, and ``var`` is indexed by
        gene name (index name ``'ensembl_id'``).

    Raises
    ------
    ValueError
        If ``gene_vocab`` is empty.
    """
    if not gene_vocab:
        raise ValueError("gene_vocab is empty; cannot build the gene axis")

    sorted_vocab_items = sorted(gene_vocab.items())
    token_ids, gene_names = zip(*sorted_vocab_items)
    token_id_to_col_idx = {token_id: idx for idx, token_id in enumerate(token_ids)}

    # CSR components are accumulated row by row.
    data, indices, indptr = [], [], [0]
    obs_data = []

    for i, cell in enumerate(generator):
        if sample_size is not None and i >= sample_size:
            break
        genes = cell['genes']
        expressions = cell['expressions']
        # A negative first expression marks a leading entry that is not a gene
        # measurement (presumably a special token); drop it. The length guard
        # keeps cells with no expressions from raising IndexError.
        if len(expressions) > 0 and expressions[0] < 0:
            genes = genes[1:]
            expressions = expressions[1:]

        # Single pass so column indices and values always stay paired;
        # tokens missing from the vocabulary are skipped.
        for gene, expr in zip(genes, expressions):
            col_idx = token_id_to_col_idx.get(gene)
            if col_idx is not None:
                indices.append(col_idx)
                data.append(expr)
        indptr.append(len(data))

        obs_entry = {k: v for k, v in cell.items() if k not in ['genes', 'expressions']}
        obs_data.append(obs_entry)

    expr_matrix = csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(gene_names)))
    obs_df = pd.DataFrame(obs_data)

    adata = anndata.AnnData(X=expr_matrix, obs=obs_df)
    adata.var.index = pd.Index(gene_names, name='ensembl_id')

    return adata

In [None]:
# Open the Tahoe-100M training split from the Hugging Face Hub in streaming mode,
# so the result can be iterated over as a generator of cells.
tahoe_100m_ds = load_dataset(
    path='vevotx/Tahoe-100M',
    split='train',
    streaming=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

In [None]:
# Display the streaming dataset summary (feature names and number of shards).
tahoe_100m_ds

IterableDataset({
    features: ['genes', 'expressions', 'drug', 'sample', 'BARCODE_SUB_LIB_ID', 'cell_line_id', 'moa-fine', 'canonical_smiles', 'pubchem_cid', 'plate'],
    num_shards: 3388
})

In [None]:
# Fetch the "sample_metadata" configuration of the dataset (train split).
sample_metadata = load_dataset(
    path="vevotx/Tahoe-100M",
    name="sample_metadata",
    split="train",
)

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

metadata/sample_metadata.parquet:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Fetch the gene metadata table, then build a lookup from token id to Ensembl id.
gene_metadata = load_dataset(
    path="vevotx/Tahoe-100M",
    name="gene_metadata",
    split="train",
)
gene_vocab = dict(
    (row["token_id"], row["ensembl_id"]) for row in gene_metadata
)

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

metadata/gene_metadata.parquet:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Build an AnnData object from the first 100,000 cells yielded by the stream
# (the stream is read in order, so this is not a random sample).
# Don't exceed this size, otherwise Colab will crash.
adata = create_anndata_from_generator(
    generator=tahoe_100m_ds,
    gene_vocab=gene_vocab,
    sample_size=100_000,
)
adata

  return dispatch(args[0].__class__)(*args, **kw)


AnnData object with n_obs × n_vars = 100000 × 62710
    obs: 'drug', 'sample', 'BARCODE_SUB_LIB_ID', 'cell_line_id', 'moa-fine', 'canonical_smiles', 'pubchem_cid', 'plate'

In [None]:
# Check the dimensions of the per-cell metadata table (rows, columns).
adata.obs.shape

(100000, 8)

In [None]:
# Attach per-sample metadata to every cell.
# A left join keeps exactly one output row per cell, in the original order; the
# default inner join would drop cells whose sample is missing from the metadata,
# and the obs table assigned back to AnnData must keep one row per cell.
# "drug" and "plate" are dropped from the metadata because obs already has them.
sample_metadata = load_dataset("vevotx/Tahoe-100M","sample_metadata", split="train").to_pandas()
obs_with_sample = pd.merge(
    adata.obs,
    sample_metadata.drop(columns=["drug", "plate"]),
    on="sample",
    how="left",
)
# Duplicate sample ids in the metadata would multiply rows; fail with a clear message.
assert len(obs_with_sample) == adata.n_obs, "sample_metadata merge changed the number of cells"
adata.obs = obs_with_sample
adata.obs.head()

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drugname_drugconc
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"


In [None]:
# Attach per-drug annotations to each cell, matching on the drug name.
# Columns that obs already carries (SMILES, PubChem CID, fine-grained MoA) are
# removed from the drug table first so the join does not duplicate them.
drug_metadata = load_dataset(
    path="vevotx/Tahoe-100M",
    name="drug_metadata",
    split="train",
).to_pandas()
drug_annotations = drug_metadata.drop(columns=["canonical_smiles", "pubchem_cid", "moa-fine"])
adata.obs = adata.obs.merge(drug_annotations, on="drug", how="left")
adata.obs.head()

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drugname_drugconc,targets,moa-broad,human-approved,clinical-trials,gpt-notes-approval
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."


In [None]:
# Load the cell line annotation table and preview its first row.
# (This cell only loads the table; nothing is merged into adata.obs here.)
cell_line_metadata = load_dataset(
    path="vevotx/Tahoe-100M",
    name="cell_line_metadata",
    split="train",
).to_pandas()
cell_line_metadata.head(1)

Resolving data files:   0%|          | 0/3388 [00:00<?, ?it/s]

Unnamed: 0,cell_name,Cell_ID_DepMap,Cell_ID_Cellosaur,Organ,Driver_Gene_Symbol,Driver_VarZyg,Driver_VarType,Driver_ProtEffect_or_CdnaEffect,Driver_Mech_InferDM,Driver_GeneType_DM
0,A549,ACH-000681,CVCL_0023,Lung,CDKN2A,Hom,Deletion,DEL,LoF,Suppressor


In [None]:
# Attach cell line annotations to each cell.
# Only the first metadata row per Cellosaurus id is kept, so the left join yields
# one row per cell. Presumably the table lists several driver genes per cell
# line, in which case only the first listed driver is retained — TODO confirm.
cell_line_metadata_unique = cell_line_metadata.drop_duplicates(subset=['Cell_ID_Cellosaur'])
adata.obs = adata.obs.merge(
    cell_line_metadata_unique,
    how='left',
    left_on='cell_line_id',
    right_on='Cell_ID_Cellosaur',
)
adata.obs.head()

  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count,mean_tscp_count,...,cell_name,Cell_ID_DepMap,Cell_ID_Cellosaur,Organ,Driver_Gene_Symbol,Driver_VarZyg,Driver_VarType,Driver_ProtEffect_or_CdnaEffect,Driver_Mech_InferDM,Driver_GeneType_DM
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,PANC-1,ACH-000164,CVCL_0480,Pancreas,AKT2,,Gain,GAIN,GoF,Oncogene
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,SW480,ACH-000842,CVCL_0546,Bowel,APC,Hom,Stopgain,p.Q1338*,LoF,Suppressor
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,SW1417,ACH-000236,CVCL_1717,Bowel,APC,Hom,Stopgain,p.R1450*,LoF,Suppressor
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,SW1417,ACH-000236,CVCL_1717,Bowel,APC,Hom,Stopgain,p.R1450*,LoF,Suppressor
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,A498,ACH-000555,CVCL_1056,Kidney,APC,Het,Missense,p.C1270R,LoF,Suppressor
