In [2]:
import sys
import os
sys.path.append('../')

from utils.path import get_data_dir_path
import anndata as ad
import numpy as np
import torch
import pandas as pd

In [3]:
data = ad.read_h5ad(os.path.join(get_data_dir_path(), "original", "norman", "perturb_processed.h5ad"))

In [None]:
data

AnnData object with n_obs × n_vars = 91205 × 5045
    obs: 'condition', 'cell_type', 'dose_val', 'control', 'condition_name'
    var: 'gene_name'
    uns: 'non_dropout_gene_idx', 'non_zeros_gene_idx', 'rank_genes_groups_cov_all', 'top_non_dropout_de_20', 'top_non_zero_de_20'
    layers: 'counts'

In [5]:
obs = data.obs
print(f"Unique Cell Types: {obs['cell_type'].unique()}")

assert all(
    obs["condition_name"]
    == obs["cell_type"].astype(str)
    + "_"
    + obs["condition"].astype(str)
    + "_"
    + obs["dose_val"].astype(str)
)

print(f"Unique Control values: {pd.unique(obs['control'])}")
print(f"Unique Dose Val values: {pd.unique(obs['dose_val'])}")
assert all((obs["control"] == 1) == (obs["dose_val"] == "1"))
assert all((obs["control"] == 0) == (obs["dose_val"] == "1+1"))

assert all((obs["control"] == 1) == (obs["condition"] == "ctrl"))


obs = obs.drop(["condition_name", "cell_type", "dose_val", "control"], axis=1)

Unique Cell Types: ['A549']
Categories (1, object): ['A549']
Unique Control values: [0 1]
Unique Dose Val values: ['1+1', '1']
Categories (2, object): ['1', '1+1']


Cell Type is the same for all, A549. The condition name is only a concatenation of all other columns. 
The Dose_val contains the same information as the control flag.
And the control flag is always set, when the condition is "ctrl". It therefore also does not contain any additional information.


In [None]:
one_data_point = data.X[0].toarray().T
one_data_point_count = data.layers["counts"][0].reshape(-1, 1)

print(one_data_point_count.shape, one_data_point.shape)
difference = (one_data_point - (one_data_point_count / np.sum(one_data_point)))
print(np.sum(one_data_point), np.sum(one_data_point_count))

(5045, 1) (5045, 1)
536.24414 2988.0


In [47]:
var = data.var
var.head()

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000239945,RP11-34P13.8
ENSG00000223764,RP11-54O7.3
ENSG00000187634,SAMD11
ENSG00000187642,PERM1
ENSG00000188290,HES4


In [None]:
uns = data.uns
print(f"Type: {type(uns)}")
print(uns.keys())
uns["non_dropout_gene_idx"]

Type: <class 'dict'>
dict_keys(['non_dropout_gene_idx', 'non_zeros_gene_idx', 'rank_genes_groups_cov_all', 'top_non_dropout_de_20', 'top_non_zero_de_20'])
                        condition
cell_barcode                     
AAACCTGAGGCATGTG-1   TSC22D1+ctrl
AAACCTGAGGCCCTTG-1    KLF1+MAP2K6
AAACCTGCACGAAGCA-1           ctrl
AAACCTGCAGACGTAG-1  CEBPE+RUNX1T1
AAACCTGCAGCCTTGG-1     MAML2+ctrl
...                           ...
CGGACTGTCGTACGGC-1   MAP2K3+IKZF3
CGGACTGTCGTGGTCG-1     KMT2A+ctrl
CGGAGCTAGACTTTCG-1       JUN+ctrl
CGGAGCTAGATCGATA-1      ctrl+DLX2
CGGAGCTAGCTGATAA-1    TGFBR2+ETS2

[3384 rows x 1 columns]


{'A549_AHR+FEV_1+1': array([   0,    2,    3, ..., 5042, 5043, 5044]),
 'A549_AHR+KLF1_1+1': array([   0,    1,    2, ..., 5042, 5043, 5044]),
 'A549_AHR+ctrl_1+1': array([   0,    1,    2, ..., 5041, 5042, 5044]),
 'A549_ARID1A+ctrl_1+1': array([   0,    2,    3, ..., 5041, 5042, 5044]),
 'A549_ARRDC3+ctrl_1+1': array([   0,    1,    2, ..., 5041, 5042, 5044]),
 'A549_ATL1+ctrl_1+1': array([   0,    1,    2, ..., 5042, 5043, 5044]),
 'A549_BAK1+ctrl_1+1': array([   0,    1,    2, ..., 5042, 5043, 5044]),
 'A549_BCL2L11+BAK1_1+1': array([   0,    1,    3, ..., 5041, 5042, 5044]),
 'A549_BCL2L11+TGFBR2_1+1': array([   0,    1,    2, ..., 5041, 5042, 5044]),
 'A549_BCL2L11+ctrl_1+1': array([   0,    1,    2, ..., 5041, 5042, 5044]),
 'A549_BCORL1+ctrl_1+1': array([   0,    1,    2, ..., 5042, 5043, 5044]),
 'A549_BPGM+SAMD1_1+1': array([   0,    1,    3, ..., 5041, 5042, 5044]),
 'A549_BPGM+ZBTB1_1+1': array([   0,    1,    2, ..., 5041, 5042, 5044]),
 'A549_BPGM+ctrl_1+1': array([   0, 

In [None]:
layers = data.layers

(1651,)

This results in the following data information with the respective contents:

- obs: RNA Sequences ("cell_barcode") with certain condition ("condition")
- var: gene IDs ("gene_id") with their respective names ("gene_name")
- layers: the counts of certain 
- X: The actual data, 91205 × 5045 variables (genes)