In [66]:
%matplotlib inline
import matplotlib
import scanpy as sc
import pandas as pd
import anndata
import convert_data_to_scp as conv
import os
import ipywidgets as widgets
import requests
import upload_helpers as uh
import MetadataAdder as ma

# Alexandria metadata annotation for scanpy files

For this notebook, you are expected to have the following inputs:


* an AnnData (scanpy) object saved as an h5ad file with the following attributes: 
    * containing few enough cells that you are willing to save it in dense format
    * with some dimensionality reduction already run (ex. UMAP or tSNE)
    * with some metadata/clustering already done
    * at least one metadata column with a value describing the sample of each cell such that all cells from that sample have an identical value in that column
    
* (optional) a per sample csv, txt, excel, or tsv file with the following attributes:
    * one row per sample
    * the values in at least one column are sample names, and these sample names exactly match the values in the sample column in the scanpy metadata
    
* (optional) a per donor csv, txt, excel, or tsv file with the following attributes:
    * This is useful if you have multiple samples per donor and some metadata that is the same for every sample from the same donor
    * there should be a column in this file with values as donor identifiers that maps to a column in the sample csv or the scanpy object
    
    
    
    
    
## Troubleshooting:

If you are not seeing graphical output try running ``jupyter labextension install @jupyter-widgets/jupyterlab-manager``

## USER INPUT: Paths to files

In [67]:

# Path to the AnnData (scanpy) .h5ad file; it is loaded with sc.read_h5ad in the next cell.
anndata_path = "epithelial_cell_clustering.h5ad"

# Path to the optional per-sample metadata file. Leave as "" if you are not using it:
# an empty string is skipped when the mapping options are built further down.
sample_metadata_path = ""

# Path to the optional per-donor metadata file. Leave as "" if you are not using it.
donor_metadata_path = ""

In [68]:
# Load the AnnData object from the path configured above.
adata = sc.read_h5ad(anndata_path)

# Print the object summary so the available obs columns and obsm entries can be checked.
print(adata)

AnnData object with n_obs × n_vars = 4447 × 2259 
    obs: 'sample', 'samplename_seurat_obj', 'incude for analysis', 'baby born', 'date processed', 'time post partum (days)', 'time post partum (weeks)', 'milk stage', 'donor ', 'milk processing ', 'bead lot ', 'array lot', 'cell count live (per mL)', 'cell count dead (per mL)', 'sample volume (mL)', 'time to processing (hrs)', 'time pumped', 'infant sick (yes or no)', 'weening (yes or no)', 'mastisis (yes or no)', 'other soreness in breast (yes or no)', 'other health issues reported', 'directly breast feeding (yes or no)', 'any formula (yes or no)', 'sample from right, left or both breasts? ', 'favored breast?', 'sample provided in', 'sequencing run ', 'FASTQ Location', 'Age', 'Week Delivered ', 'Labor Induced ', 'Antibiotics During Delivery ', 'Pregnancy Number', 'Miscarrage ', 'Delivery Mode', 'Race', 'doublet_scores', 'predicted_doublet', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_count

In [69]:
# Metadata sources offered for mapping: the loaded AnnData under the key "anndata",
# plus every optional file path that was filled in (each keyed by the path itself).
mapping_options = {"anndata": adata}
mapping_options.update(
    {path: path for path in (sample_metadata_path, donor_metadata_path) if len(path) > 0}
)

In [None]:
# Directory the generated files are written to. With "" the os.path.join call used when
# saving the expression matrix yields a bare file name, i.e. a path relative to the
# current working directory.
output_dir = ""

## Step 1: Dimensionality reduction 'cluster files'

The SCP expects one file per dimensionality reduction visualization, so the following code chunk will save a file for each dimensionality reduction stored in the ``obsm`` attribute of your ``AnnData`` object. The ``print(adata)`` command above should show you the available dimensionality reduction values. 


The upload interface also requires a minimum and maximum for each axis of each dimensionality reduction. The next code chunk also prints those values so you can enter them in the upload interface.

In [None]:
# Write the cluster files for adata into output_dir.
# NOTE(review): per the text above this should save one file per adata.obsm entry and
# print the axis ranges; the helper lives in convert_data_to_scp and is not shown here — confirm.
conv.save_cluster_dfs(adata, output_dir)

## Step 2: expression matrices

Next you will save your expression matrix in the SCP format



In [None]:
# Build the expression matrix as a DataFrame and write it gzip-compressed into output_dir.
# make_expression_df is neither defined nor imported in this notebook, so the bare call
# raised NameError. It is assumed to live in convert_data_to_scp (imported as conv),
# next to save_cluster_dfs — TODO confirm against that module.
out_df = conv.make_expression_df(adata)
out_df.to_csv(os.path.join(output_dir, "expression_file.txt.gz"), compression='gzip')

## Step 3: Set up metadata

Now we will convert metadata in the scanpy object to the correct metadata format and naming scheme for Alexandria

In [70]:
# Empty per-cell table, indexed by the AnnData obs names, that the following cells
# fill with Alexandria metadata columns.
cell_level_metadata = pd.DataFrame(index=adata.obs_names)

### Starting with global metadata

These metadata attributes will likely be the same for all of your cells.

In [71]:
# Attribute name -> value for metadata shared by every cell; each entry is later copied
# into cell_level_metadata as a constant column.
global_attributes = {}

### Select species for this data

Type a search term in the text box below (ex. 'homo' for human or 'mus' for mouse)

then run the next code section and select the value from that drop-down

If you do not see the value you were looking for and would like to search again, type in a new value in the text box and *rerun the code block that generates the species*

In [100]:
# Free-text search box for the species name; the query cell below reads s.value.
s = widgets.Text(
    description='Species:',
    placeholder='Species search',
    value='Species',
    disabled=False,
)
display(s)

Text(value='Species', description='Species:', placeholder='Species search')

In [105]:
# Query the 'ncbitaxon' ontology with the text currently typed into the species box.
# NOTE(review): the second return value is indexed with the dropdown's selected value
# further down, so it is presumably an ID -> label lookup — confirm in upload_helpers.
list_for_dropdown, name_id_dict=uh.query_search_term('ncbitaxon',s.value)
# Selection widget for the species. NOTE: m, list_for_dropdown and name_id_dict are
# reassigned by the library-preparation cell further down, so save the species
# selection before running that cell.
m=uh.choose_metadata_name_dropdown(list_for_dropdown,"species")
display(m)

Dropdown(description='species', options=(('Mus musculoides: ', 'NCBITaxon_60742'), ('Mus fragilicauda: ', 'NCB…

### Once you are happy with your selection above, run the code block below!!

In [72]:
# Store the selected species value and the matching entry from the search lookup.
# This reads whatever m and name_id_dict currently hold, so run it right after making
# the species selection, before another cell reassigns those names.
global_attributes['species']=m.value
global_attributes['species__ontology_label'] = name_id_dict[m.value]

AttributeError: 'HBox' object has no attribute 'value'

### Select library preparation protocol for this data

Just select from the dropdown below then run the code block under it

In [125]:
# Fetch the terms under the EFO root "EFO_0001457" as library-preparation options.
# NOTE(review): presumably this root is the EFO node for library preparation methods — confirm.
# This reassigns list_for_dropdown, name_id_dict and m, replacing the species versions.
list_for_dropdown, name_id_dict = uh.query_all_values_under_root("efo","EFO_0001457")
m=uh.choose_metadata_name_dropdown(list_for_dropdown,"experimental method")
display(m)

11


Dropdown(description='experimental method', options=(('RARseq: Restriction site associated RNA sequencing', 'E…

In [126]:
# Store the selected protocol value and the matching entry from the lookup built in the
# previous cell (reads the current m and name_id_dict).
global_attributes['library_preparation_protocol']=m.value
global_attributes['library_preparation_protocol__ontology_label'] = name_id_dict[m.value]

In [73]:
# Broadcast every global attribute to all cells as a constant column.
for attribute_name, attribute_value in global_attributes.items():
    cell_level_metadata[attribute_name] = attribute_value

# Expose the row index as an explicit "CellID" column.
cell_level_metadata["CellID"] = cell_level_metadata.index

In [74]:
# Load the metadata convention table; the first column of the TSV becomes the index.
metadata_info = pd.read_csv("metadata_name_type_info.tsv", sep="\t", index_col=0)

# Flag every row whose index name contains the substring "unit".
metadata_info["is unit"] = ["unit" in metadata_name for metadata_name in metadata_info.index]

# Names offered for mapping: everything that is neither a label class nor a unit row.
available_metadata = metadata_info[
    ~(metadata_info["class"].isin(["unit_label", "ontology_label"]) | metadata_info["is unit"])
].index

# These two are collected through global_attributes, so they are not offered again.
available_metadata = available_metadata.drop(["species", 'library_preparation_protocol'])

## Renaming metadata from your dataframe and files

Now we will facilitate mapping any of your metadata to the Alexandria metadata convention.

If there is required (or optional) metadata that you would like to add that is not already in one of your files AND is an ontology or controlled list metadata type AND follows the same pattern as some other metadata in your files (ex. sample level or donor level), you can do so by mapping the metadata from some unrelated metadata.

For example, if you recorded the organ of each sample but did not save it as an explicit column, you can choose the sample column and map each sample name to its organ.


You should run this group of cells once per metadata source (as in sample level file, your cell level dataframe, etc)

In [139]:
# Start the interactive mapping session for the configured metadata sources.
# NOTE(review): MetadataAdder is defined in MetadataAdder.py (not shown here); judging by
# the output below, constructing it renders the widgets — confirm.
meta_addr = ma.MetadataAdder(mapping_options, available_metadata, metadata_info, cell_level_metadata)

Dropdown(description='table with metadata to map', options=('anndata',), value='anndata')

Button(description='select table', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Dropdown(description='my metadata column', options=('sample', 'samplename_seurat_obj', 'incude for analysis', …

Dropdown(description='Alexandria metadata column', options=('donor_id', 'ethnicity', 'race', 'mouse_strain', '…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

This metadata column is a non-controlled string type. It will be directly mapped from the values in your column.


Dropdown(description='my metadata column', options=('sample', 'samplename_seurat_obj', 'incude for analysis', …

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

Each value in this column will need to be mapped to a controlled vocabulary. Please select the unique value you would like to map first


HBox(children=(Text(value='BM01_12dpp', description='BM01_12dpp', placeholder='search term'), Button(descripti…

HBox(children=(Text(value='BM01_7wkpp', description='BM01_7wkpp', placeholder='search term'), Button(descripti…

HBox(children=(Text(value='BM01_16dpp_r3', description='BM01_16dpp_r3', placeholder='search term'), Button(des…

HBox(children=(Text(value='BM01_5dpp_r1', description='BM01_5dpp_r1', placeholder='search term'), Button(descr…

HBox(children=(Text(value='BM01_13wkpp_r2', description='BM01_13wkpp_r2', placeholder='search term'), Button(d…

Button(description='save', style=ButtonStyle())

In [276]:
# Pull the updated per-cell table back out of the MetadataAdder created above.
# The adder is bound to `meta_addr`; the earlier `meta_attr` spelling was never defined
# and raised NameError on a fresh kernel.
cell_level_metadata = meta_addr.cell_level_metadata

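Now if you have any more files to add, go back to the "Renaming metadata from your dataframe and files" section and rerun its cells for the next file. If not, go to the next cell.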

## Check that all required keys are mapped

In [279]:
# Index of the metadata names whose "required" entry is exactly "Yes".
required_keys = metadata_info[metadata_info["required"].eq("Yes")].index

In [290]:
# Debug check: show the Python type of one stored default value (the loop in the next
# code cell decides on that type whether a default can be applied).
type(metadata_info.loc["disease","default"])

str

In [291]:
# Make sure every required metadata column exists: apply the default from metadata_info
# when there is one, otherwise tell the user which value is still missing.
for k in required_keys:
    if k in cell_level_metadata.columns:
        continue  # already mapped, nothing to do
    default_val = metadata_info.loc[k, "default"]
    # Only a string is treated as a usable default; presumably an empty cell in the TSV
    # is read as NaN (a float) and therefore falls through to the message below.
    if isinstance(default_val, str):
        cell_level_metadata[k] = default_val
    else:
        print("You need to add a value for "+k+" before proceeding!")

You need to add a value for is_living before proceeding!
You need to add a value for sample_type before proceeding!
You need to add a value for disease__ontology_label before proceeding!
You need to add a value for CellID before proceeding!


In [293]:
# Debug check: list the columns available in metadata_info.
metadata_info.columns

Index(['required', 'default', 'type', 'array', 'class', 'ontology',
       'ontology_root', 'controlled_list_entries', 'dependency',
       'dependency_condition', 'dependent', 'attribute_description',
       'is unit'],
      dtype='object')

## All numeric metadata needs units, so now add units...

In [303]:
# make a list of unit metadata that you have
my_metadata_info = metadata_info.loc[cell_level_metadata.columns]
unit_dropdowns = {}
name_id_dicts = {}
for dep_metadata in my_metadata_info.loc[~my_metadata_info["dependent"].isna()].index:
    m_name = metadata_info.loc[dep_metadata,"dependent"]
    
    if metadata_info.loc[m_name, "class"] == "ontology":
        ont = metadata_info.loc[m_name, "ontology"].split("/")[-1]
        list_for_dropdown, name_id_dict = uh.query_all_values_under_root(ont,metadata_info.loc[m_name, "ontology_root"])
        unit_dropdowns[m_name] = uh.choose_metadata_name_dropdown(list_for_dropdown,m_name)
        name_id_dicts[m_name] = name_id_dict
    else:
        unit_dropdowns[m_name] = widgets.Text(
                    value="type unit here",
                    placeholder='type unit here',
                    description=m_name,
                    disabled=False
                )
    
for n,v in unit_dropdowns.items():
    display(v)

1


Dropdown(description='organism_age__unit', options=(("month: A time unit which is approximately equal to the l…

In [305]:
unit_values = {}
# Write each chosen unit (and, for ontology units, its label) onto the cells that
# actually have a value in the column the unit belongs to.
for unit_name, unit_widget in unit_dropdowns.items():
    dependency_column = metadata_info.loc[unit_name, "dependency"]
    cells_with_dependency = cell_level_metadata[dependency_column].dropna().index
    selected_unit = unit_widget.value
    if unit_name in name_id_dicts:
        unit_label = name_id_dicts[unit_name][selected_unit]
        cell_level_metadata.loc[cells_with_dependency, unit_name + "__ontology_label"] = unit_label
    cell_level_metadata.loc[cells_with_dependency, unit_name] = selected_unit



(13341, 17)

(4447, 8)

## Add in any unstructured metadata that you want to add (and their types)

## Map the structured Alexandria metadata types