In [7]:
import scanpy as sc
import pandas as pd
import numpy as np
import pickle


def get_perts(x):
    if x == ["ctrl"]:
        return x[0]
    elif len(x) == 2:
        return [i for i in x if i != "ctrl"][0]
    else:
        raise ValueError(f"Invalid condition: {x}")

# Dixit

In [2]:
dixit_single = sc.read("data/Data_biolord/dixit/dixit_single_biolord.h5ad")
embd_path = "data/Data_GeneEmbd/GenePT_V1.pickle"
with open(embd_path, "rb") as f:
    embd = pd.DataFrame(pickle.load(f)).T
ctrl_row = pd.DataFrame([np.zeros(embd.shape[1])], columns=embd.columns, index=["ctrl"])
embd = pd.concat([ctrl_row, embd])

perts = dixit_single.obs.condition.str.split("+").apply(get_perts)
in_idx = list(np.where(perts.isin(embd.index))[0])
in_perts = perts[in_idx]
in_bcodes = dixit_single.obs_names[in_idx]

dixit_single = dixit_single[in_bcodes].copy()
dixit_single.obsm["perturbation_neighbors"] = embd.loc[in_perts].values
dixit_single.write_h5ad("data/Data_biolord/dixit/dixit_single_biolord_embedding.h5ad")

  in_perts = perts[in_idx]


## Adamson

In [None]:
adamson_single = sc.read("data/Data_biolord/adamson/adamson_single_biolord.h5ad")
embd_path = "data/Data_GeneEmbd/GenePT_V1.pickle"
with open(embd_path, "rb") as f:
    embd = pd.DataFrame(pickle.load(f)).T
ctrl_row = pd.DataFrame([np.zeros(embd.shape[1])], columns=embd.columns, index=["ctrl"])
embd = pd.concat([ctrl_row, embd])
embd.rename(
    index={
        "SARS1": "SARS",
        "DARS1": "DARS",
        "QARS1": "QARS",
        "TARS1": "TARS",
        "HARS1": "HARS",
        "CARS1": "CARS",
        "SRPRA": "SRPR",
        "MARS1": "MARS",
        "AARS1": "AARS",
        "PRELID3B": "SLMO2",
    },
    inplace=True,
)
perts = adamson_single.obs.condition.str.split("+").apply(get_perts)
in_idx = list(np.where(perts.isin(embd.index))[0])
in_perts = perts[in_idx]
in_bcodes = adamson_single.obs_names[in_idx]

adamson_single = adamson_single[in_bcodes].copy()
adamson_single.obsm["perturbation_neighbors"] = embd.loc[in_perts].values
adamson_single.write_h5ad("data/Data_biolord/adamson/adamson_single_biolord_embedding.h5ad")

  in_perts = perts[in_idx]


## Norman

In [4]:
norman_single = sc.read("data/Data_biolord/norman/norman2019_single_biolord.h5ad")
embd_path = "data/Data_GeneEmbd/GenePT_V1.pickle"
with open(embd_path, "rb") as f:
    embd = pd.DataFrame(pickle.load(f)).T
ctrl_row = pd.DataFrame([np.zeros(embd.shape[1])], columns=embd.columns, index=["ctrl"])
embd = pd.concat([ctrl_row, embd])
embd.rename(
    index={
        "MAP3K21": "KIAA1804",
        "FOXL2NB": "C3orf72",
        "RHOXF2B": "RHOXF2BB",
        "MIDEAS": "ELMSAN1",
        "CBARP": "C19orf26",
    },
    inplace=True,
)
perts = norman_single.obs.condition.str.split("+").apply(get_perts)
in_idx = list(np.where(perts.isin(embd.index))[0])
in_perts = perts[in_idx]
in_bcodes = norman_single.obs_names[in_idx]

norman_single = norman_single[in_bcodes].copy()
norman_single.obsm["perturbation_neighbors"] = embd.loc[in_perts].values
norman_single.write_h5ad(
    "data/Data_biolord/norman/norman2019_single_biolord_embedding.h5ad"
)

  in_perts = perts[in_idx]


## Replogle K562

In [5]:
repk562_single = sc.read("data/Data_biolord/replogle_k562_essential/k562_single_biolord.h5ad")
repk562 = sc.read("data/Data_biolord/replogle_k562_essential/k562_biolord.h5ad")
embd_path = "data/Data_GeneEmbd/GenePT_V1.pickle"
with open(embd_path, "rb") as f:
    embd = pd.DataFrame(pickle.load(f)).T
ctrl_row = pd.DataFrame([np.zeros(embd.shape[1])], columns=embd.columns, index=["ctrl"])
embd = pd.concat([ctrl_row, embd])
embd.rename(
    index={
        "AARS1": "AARS",
        "CENATAC": "CCDC84",
        "POLR1G": "CD3EAP",
        "DARS1": "DARS",
        "EPRS1": "EPRS",
        "HARS1": "HARS",
        "IARS1": "IARS",
        "KARS1": "KARS",
        "LARS1": "LARS",
        "MARS1": "MARS",
        "QARS1": "QARS",
        "RARS1": "RARS",
        "SARS1": "SARS",
        "TARS1": "TARS",
        "POLR1F": "TWISTNB",
        "VARS1": "VARS",
        "POLR1H": "ZNRD1",
    },
    inplace=True,
)
perts = repk562_single.obs.condition.str.split("+").apply(get_perts)
in_idx = list(np.where(perts.isin(embd.index))[0])
in_perts = perts[in_idx]
in_bcodes = repk562_single.obs_names[in_idx]

repk562_single = repk562_single[in_bcodes].copy()
repk562_single.obsm["perturbation_neighbors"] = embd.loc[in_perts].values
repk562_single.write_h5ad("data/Data_biolord/replogle_k562_essential/k562_single_biolord_embedding.h5ad")

repk562 = repk562[repk562.obs.condition.isin(in_perts.index)].copy()
repk562.write_h5ad("data/Data_biolord/replogle_k562_essential/k562_biolord_embedding.h5ad")

  in_perts = perts[in_idx]


## Replogle RPE1

In [6]:
reprpe1_single = sc.read("data/Data_biolord/replogle_rpe1_essential/rpe1_single_biolord.h5ad")
reprpe1 = sc.read("data/Data_biolord/replogle_rpe1_essential/rpe1_biolord.h5ad")
embd_path = "data/Data_GeneEmbd/GenePT_V1.pickle"
with open(embd_path, "rb") as f:
    embd = pd.DataFrame(pickle.load(f)).T
ctrl_row = pd.DataFrame([np.zeros(embd.shape[1])], columns=embd.columns, index=["ctrl"])
embd = pd.concat([ctrl_row, embd])
embd.rename(index={"ZZZ3": "AC118549.1"}, inplace=True)
perts = reprpe1_single.obs.condition.str.split("+").apply(get_perts)
in_idx = list(np.where(perts.isin(embd.index))[0])
in_perts = perts[in_idx]
in_bcodes = reprpe1_single.obs_names[in_idx]

reprpe1_single = reprpe1_single[in_bcodes].copy()
reprpe1_single.obsm["perturbation_neighbors"] = embd.loc[in_perts].values
reprpe1_single.write_h5ad(
    "data/Data_biolord/replogle_rpe1_essential/rpe1_single_biolord_embedding.h5ad"
)

reprpe1 = reprpe1[reprpe1.obs.condition.isin(in_perts.index)].copy()
reprpe1.write_h5ad("data/Data_biolord/replogle_rpe1_essential/rpe1_biolord_embedding.h5ad")

  in_perts = perts[in_idx]
