# Create anndata file from .tsv files & annotation

In [1]:
import pandas as pd
import numpy as np

import scanpy as sc
from anndata import AnnData

from scipy.sparse import csr_matrix, save_npz

Load total and allele-specific counts produced in the original analysis:

In [2]:
counts_total = pd.read_csv('../data/complete_ase_phased.total_counts.all_leads.tsv', sep='\t')
counts_allelic = pd.read_csv('../data/complete_ase_phased.allelic_counts.all_leads.tsv', sep='\t')

In [16]:
row_data = counts_total[['ensembl_gene_id', 'snp_id']]
col_names = counts_total.columns.drop(['ensembl_gene_id', 'snp_id'])

In [17]:
(row_data != counts_allelic[['ensembl_gene_id', 'snp_id']]).sum(), (col_names != counts_allelic.columns.drop(['ensembl_gene_id', 'snp_id'])).sum()

(ensembl_gene_id    0
 snp_id             0
 dtype: int64,
 0)

In [18]:
row_data.index = rows.apply(lambda x: '-'.join(x), 1)

In [24]:
col_data = pd.read_csv('../data/cell_metadata_cols.tsv', sep='\t').loc[col_names]

In [39]:
adata = AnnData(
    X=csr_matrix(counts_allelic[col_names].replace(np.nan, 0).to_numpy()), 
    obs=row_data, var=col_data)
adata.layers['allelic_total'] = csr_matrix(counts_total[col_names].replace(np.nan, 0).to_numpy())
adata = adata.T
adata

AnnData object with n_obs × n_vars = 36042 × 4470
    obs: 'assigned', 'auxDir', 'cell_filter', 'cell_name', 'compatible_fragment_ratio', 'day', 'donor', 'expected_format', 'experiment', 'frag_dist_length', 'gc_bias_correct', 'is_cell_control', 'is_cell_control_bulk', 'is_cell_control_control', 'library_types', 'libType', 'log10_total_counts', 'log10_total_counts_endogenous', 'log10_total_counts_ERCC', 'log10_total_counts_feature_control', 'log10_total_counts_MT', 'log10_total_features', 'log10_total_features_endogenous', 'log10_total_features_ERCC', 'log10_total_features_feature_control', 'log10_total_features_MT', 'mapping_type', 'mates1', 'mates2', 'n_alt_reads', 'n_total_reads', 'num_assigned_fragments', 'num_bias_bins', 'num_bootstraps', 'num_compatible_fragments', 'num_consistent_mappings', 'num_inconsistent_mappings', 'num_libraries', 'num_mapped', 'num_processed', 'num_targets', 'nvars_used', 'pct_counts_endogenous', 'pct_counts_ERCC', 'pct_counts_feature_control', 'pct_counts_

Add PCs:

In [40]:
pcs = pd.read_csv('../data/endodiff_100PCs.csv.zip', index_col=0)

cell_ids = adata.obs_names.intersection(pcs.index)
adata = adata[cell_ids, :]
adata.obsm['X_pca'] = pcs.loc[cell_ids].to_numpy()

  if not is_categorical(df_full[k]):


Filter:

In [41]:
# atleast 50 cells per region
adata = adata[:, adata.layers['allelic_total'].A.astype(bool).sum(0) > 50]
adata

View of AnnData object with n_obs × n_vars = 34254 × 3966
    obs: 'assigned', 'auxDir', 'cell_filter', 'cell_name', 'compatible_fragment_ratio', 'day', 'donor', 'expected_format', 'experiment', 'frag_dist_length', 'gc_bias_correct', 'is_cell_control', 'is_cell_control_bulk', 'is_cell_control_control', 'library_types', 'libType', 'log10_total_counts', 'log10_total_counts_endogenous', 'log10_total_counts_ERCC', 'log10_total_counts_feature_control', 'log10_total_counts_MT', 'log10_total_features', 'log10_total_features_endogenous', 'log10_total_features_ERCC', 'log10_total_features_feature_control', 'log10_total_features_MT', 'mapping_type', 'mates1', 'mates2', 'n_alt_reads', 'n_total_reads', 'num_assigned_fragments', 'num_bias_bins', 'num_bootstraps', 'num_compatible_fragments', 'num_consistent_mappings', 'num_inconsistent_mappings', 'num_libraries', 'num_mapped', 'num_processed', 'num_targets', 'nvars_used', 'pct_counts_endogenous', 'pct_counts_ERCC', 'pct_counts_feature_control', 'pct

Add published p-values:

In [52]:
pval_pub = pd.read_csv('../data/41467_2020_14457_MOESM10_ESM.txt', sep='\t', index_col=[0, 1])
pval_pub.index = ['-'.join(x) for x in pval_pub.index]
adata.var[['pval_orig_pub', 'qval_orig_pub']] = pval_pub.loc[adata.var_names, ['pval', 'qval']]

Trying to set attribute `.var` of view, copying.


In [55]:
sc.write('../data/endoderm_ase.h5ad', adata)

  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'auxDir' as categorical
... storing 'day' as categorical
... storing 'donor' as categorical
... storing 'expected_format' as categorical
... storing 'experiment' as categorical
... storing 'library_types' as categorical
... storing 'libType' as categorical
... storing 'mapping_type' as categorical
... storing 'salmon_version' as categorical
... storing 'samp_type' as categorical
... storing 'start_time' as categorical
... storing 'well_id' as categorical
... storing 'well_type' as categorical
... storing 'donor_short_id' as categorical
... storing 'donor_long_id' as categorical
... storing 'ensembl_gene_id' as categorical
... storing 'snp_id' as categorical
