In [1]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

import scanpy as sc
import numpy as np
from scipy import sparse
import random
import numpy as np
import scipy.sparse as sp
import pandas as pd
import anndata as ad
import json
import argparse

In [None]:
# Parse cell_lines and drugs from json file
parser = argparse.ArgumentParser()
parser.add_argument("-f", type=str, help="Path to dataset_config.json file")
args = parser.parse_args()

# Read dataset configuration
config_path = args.f
with open(config_path_path, "r") as f:
    arguments = json.load(f)

lines_select = arguments["cell_lines"]
control_name = "DMSO_TF"
drugs_select = arguments["drugs"]
drugs_select.append(control_name)

export_name = arguments["name"]

In [3]:
data_dir = "/cluster/work/bewi/data/tahoe100/h5ad/"
metadata_path = "/cluster/work/bewi/members/rquiles/experiments/datasets/obs_metadata.parquet"

print("Inspecting metadata...")
metadata = pd.read_parquet(metadata_path)

select = metadata[(metadata["cell_name"].isin(lines_select)) & (metadata["drug"].isin(drugs_select))]

# Determine which plates to inspect
plates = np.unique(select["plate"])

# Loop through plates and append the matching rows
final_adata = None

for plate in plates:
    print(f"Loading {plate}...")
    data_path = f"/cluster/work/bewi/data/tahoe100/h5ad/{plate}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"
    adata = sc.read_h5ad(data_path, backed="r")
    print(f"Subsetting {plate}...")
    
    ids = select[select["plate"] == plate]["BARCODE_SUB_LIB_ID"].values
    subset = adata[adata.obs_names.isin(ids), :].to_memory()
    # Initialize or append final AnnData
    if final_adata is None:
        final_adata = subset
    else:
        final_adata = ad.concat([final_adata, subset], join="outer")

Inspecting metadata...
Subsetting plate1...
Subsetting plate10...
Subsetting plate11...
Subsetting plate12...


KeyboardInterrupt: 

In [4]:
final_adata

AnnData object with n_obs × n_vars = 37067 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'

In [5]:
# Randomize the rows in final_adata
n = final_adata.n_obs  # number of cells
perm = np.random.permutation(n)
adata = final_adata[perm, :]

In [6]:
# Typical downstream processing of the AnnData to match the FCR method code
file_name = ""
export_path = f"/cluster/work/bewi/members/rquiles/experiments/datasets/{file_name}.h5ad"

change_col_names = {"drug":"Agg_Treatment", "cell_line": "covariates"}

## UPDATE COLUMNS NEW DATASET
adata.obs = adata.obs.rename(columns=change_col_names)
adata.obs["control"] = adata.obs[change_col_names["drug"]] == control_name
adata.obs["control"] = adata.obs["control"].astype(int)
adata.uns["fields"] = []
adata.obs["dose"] = adata.obs["drugname_drugconc"].str.split(",").str[1].astype(float)

In [7]:
## SUBSET TO BALANCE THE NUMBER OF CONTROLS
idx = []
keep_rows = []

# First, find how many rows each cell_line–treatment pair has
for cell_line in lines_select:
    mask = (
        (adata.obs["cell_name"] == cell_line)
        & (adata.obs[change_col_names["drug"]] != control_name)
    )
    row_indexes = adata.obs[mask].index

    # Keep all treated samples
    n_treated = len(row_indexes)
    if len(row_indexes) > 0:
        keep_rows.extend(row_indexes)

    # Keep number of control samples = 1/3 treated samples
    n_controls = n_treated//3
    mask = (
        (adata.obs["cell_name"] == cell_line)
        & (adata.obs[change_col_names["drug"]] == control_name)
    )
    row_indexes = adata.obs[mask].index
    row_indexes = row_indexes[:n_controls]
    if len(row_indexes) > 0:
        keep_rows.extend(row_indexes)

# Randomize row order
random.shuffle(keep_rows)

# Subset the AnnData object
filtered_adata = adata[keep_rows, :]

In [8]:
filtered_adata

View of AnnData object with n_obs × n_vars = 14922 × 62710
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'Agg_Treatment', 'covariates', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate', 'control', 'dose'
    uns: 'fields'

In [None]:
## PREPROCESS AND EXPORT
export_path = f"/cluster/work/bewi/members/rquiles/experiments/datasets/{export_name}"

sc.pp.normalize_total(filtered_adata, target_sum=1e4)
sc.pp.log1p(filtered_adata)
sc.pp.highly_variable_genes(filtered_adata, n_top_genes=2000, subset=True)
sc.pp.scale(filtered_adata, max_value=10)
filtered_adata = filtered_adata[:, filtered_adata.var.highly_variable]
filtered_adata.write(export_path)