In [1]:
%matplotlib inline
import matplotlib
import scanpy as sc
import pandas as pd
import anndata
import convert_adata_to_scp as conv
import os
import ipywidgets as widgets
import requests
import widget_helpers as uh
import MetadataAdder as ma

  from ._conv import register_converters as _register_converters


In [2]:
importlib.reload(ma)

NameError: name 'importlib' is not defined

# Alexandria metadata annotation for scanpy or Seurat files

For this notebook, you are expected to have the following inputs:



* ONE cell level metadata text or csv file:
    * cells as rows, metadata as columns
    * this can come from seurat_obs@meta.data containing cell-level metadata, or a scanpy_obs.obs
    * all cell names must be unique and only include alphanumeric characters and underscores
    * the cell names must exactly match the cells names in your expression files
    * all cells must be in one metadata file


---AND---
* (optional) a per sample csv, txt, excel, or tsv file with the following attributes:
    * one row per sample
    * the values in at least one column are sample names, and these sample names exactly match the values in the sample column in the scanpy/seurat metadata
    
* (optional) a per donor csv, txt, excel, or tsv file with the following attributes:
    * This is useful if you have multiple samples per donor and some metadata that is the same each sample from the same donor
    * there should be a column in this file with values as donor identifiers that maps to a column in the sample csv or the scanpy/seurat object
    
    
    
    
    
## Troubleshooting:

If you are not seeing graphical output try running ``jupyter labextension install @jupyter-widgets/jupyterlab-manager``

## USER INPUT: Paths to files

In [65]:

# complete path to cell level metadata file, 
# these should be: cells as rows and metadata/clusters as columns
# this is required!
#cell_metadata_path = "/Users/nyquist/Dropbox (MIT)/Breast Milk Study/zz_Analysis/Sarah/all_cells_clustering/october_analysis/immune_cells_for_britt/immune_cells_metadata.csv"
cell_metadata_path = "/Users/nyquist/Dropbox (MIT)/CD8_depletion/all_cells_metadata.csv"
#sample_metadata_path = "/Users/nyquist/Dropbox (MIT)/shalek_data/Projects/Alexandria/test_metadata_files/sample_metadata.txt"

#donor_metadata_path = "/Users/nyquist/Dropbox (MIT)/shalek_data/Projects/Alexandria/test_metadata_files/donor_metadata.txt"


# desired output directory full path (leave as "" if you want to output to the directory containing this notebook)
output_dir = "/Users/nyquist/Dropbox (MIT)/CD8_depletion/"

output_file_names = {}

In [3]:
# no editing required for this cell, just run it
mapping_options = {}
if ".txt" in cell_metadata_path or ".tsv" in cell_metadata_path:
    cell_df = pd.read_csv(cell_metadata_path, index_col=0,sep="\t")
elif ".csv" in cell_metadata_path:
    cell_df = pd.read_csv(cell_metadata_path, index_col=0)
else:
    print("cell_metadata_path must be a .tsv, .txt, or .csv file")
mapping_options["cell level dataframe"] = cell_df

if len(sample_metadata_path) > 0:
    mapping_options["sample_metadata_path"] = sample_metadata_path
    
if len(donor_metadata_path) > 0:
    mapping_options["donor_metadata_path"] = donor_metadata_path



  interactivity=interactivity, compiler=compiler, result=result)


NameError: name 'sample_metadata_path' is not defined

# Set up metadata

Now we will convert metadata in your object to the correct metadata format and naming scheme for Alexandria


This includes 3 parts:

* Global Metadata

* Alexandria structured metadata to be mapped from your metadata tables

* Units for numeric metadata

* Unstructured metadata mapped from your metadata tables


In [4]:
# TODO: adjust this if you are not looking for cell level and are doing sample level instead
cell_level_metadata = pd.DataFrame(index=mapping_options["cell level dataframe"].index)

### Step 3a: Starting with global metadata

These metadata attributes must be the same for all of your columns

In [5]:
global_attributes = {}

### Global Metadata: Select species for this data

Type a search term in the drop down below (ex. 'homo' for human or 'mus' for mouse)

then run the next code section and select the value from that drop-down

If you do not see the value you were looking for and would like to search again, type in a new value in the text box and *rerun the code block that generates the species*

In [6]:
s =widgets.Text(
    value='Species',
    placeholder='Species search',
    description='Species:',
    disabled=False
)
display(s)
print("enter a search term for the species in the box above then run the next notebook cell to see a dropdown of search results")

Text(value='Species', description='Species:', placeholder='Species search')

enter a search term for the species in the box above then run the next notebook cell to see a dropdown of search results


In [7]:
list_for_dropdown, name_id_dict=uh.query_search_term('ncbitaxon',s.value)
m=uh.choose_metadata_name_dropdown(list_for_dropdown,"species")
display(m)

Dropdown(description='species', options=(('Macaca mulatta: ', 'NCBITaxon_9544'), ('Macaca mulatta vestita: ', …

### Once you are happy with your selection above, run the code block below!!

In [8]:
global_attributes['species']=m.value
global_attributes['species__ontology_label'] = name_id_dict[m.value]

### Global Metadata: Select library preparation protocol for this data

Just select from the dropdown below then run the code block under it

In [9]:
print("this takes a second if your internet connection isn't super fast")
list_for_dropdown, name_id_dict = uh.query_all_values_under_root("efo","EFO_0001457")
exp_method_dpdn=uh.choose_metadata_name_dropdown(list_for_dropdown,"experimental method")
display(exp_method_dpdn)
print("select from dropdown above then run the next notebook cell. If you want to change the selection, you need to re-run the cell below")

this takes a second if your internet connection isn't super fast
11


Dropdown(description='experimental method', options=(('RARseq: Restriction site associated RNA sequencing', 'E…

select from dropdown above then run the next notebook cell. If you want to change the selection, you need to re-run the cell below


In [10]:
global_attributes['library_preparation_protocol']=exp_method_dpdn.value
global_attributes['library_preparation_protocol__ontology_label'] = name_id_dict[exp_method_dpdn.value]

print(name_id_dict[exp_method_dpdn.value] +" saved as library preparation protocol")

Seq-Well saved as library preparation protocol


In [11]:
# explicitly saving the cellID column 
for k,v in global_attributes.items():
    cell_level_metadata[k] = v

cell_level_metadata["CellID"] = cell_level_metadata.index

In [12]:
# reading in all the metadata in the convention
#TODO: make this work with JSON, for now it is just a table copy and pasted from the google sheets
metadata_info = pd.read_csv("metadata_name_type_info.tsv",sep="\t",index_col=0)
metadata_info["is unit"] = ["unit" in i for i in metadata_info.index] # removing units from this so people are less confused
# removing the "label" types because we will add those automatically
available_metadata = metadata_info[~metadata_info["class"].isin(["unit_label", "ontology_label"]) & ~metadata_info["is unit"]].index

# we already added species and library prep so drop those as well
available_metadata=available_metadata.drop("species")
available_metadata=available_metadata.drop('library_preparation_protocol')

## Step 3b: Renaming metadata from your dataframe and files

Now we will facilitate mapping any of your metadata to the Alexandria metadata convention.

If there is required (or optional) metadata that you would like to add that is not already in one of your files AND is an ontology or controlled list metadata type AND follows the same pattern as some other metadata in your files (ex. sample level or donor level), you can do so by mapping the metadata from some unrelated metadata.

For example, if you recorded the organ of each sample but did not save it as an explicit column, you can choose the sample column and map from that


You should run this group of cells once per metadata source (as in sample level file, your cell level dataframe, etc)

In [13]:
metadata_info.columns

Index(['required', 'default', 'type', 'array', 'class', 'ontology',
       'ontology_root', 'controlled_list_entries', 'dependency',
       'dependency_condition', 'dependent', 'attribute_description',
       'is unit'],
      dtype='object')

### The following metadata are required, make sure you add them:

In [14]:
for k in metadata_info.loc[metadata_info["required"]=="Yes"].index:
    print(k+": "+str(metadata_info.loc[k,"attribute_description"]))

species: The scientific binomial name for the species of the organism.
species__ontology_label: species__ontology_label
sex: Biological sex
is_living: Whether organism was alive at time of biomaterial collection
biosample_id: Biosample ID
organ: The organ that the biomaterial came from
organ__ontology_label: organ__ontology_label
sample_type: one of: cell line, organoid, direct from donor (fresh), direct from donor (frozen), cultured primary cells
disease: The disease state(s) of the individual donating the sample at the time of donation
disease__ontology_label: disease__ontology_label
library_preparation_protocol: The single cell RNA-sequencing protocol used for Library preparation
library_preparation_protocol__ontology_label: library_preparation_protocol__ontology_label
CellID: Cell ID
donor_id: Donor ID


In [47]:
import importlib
importlib.reload(ma)

<module 'MetadataAdder' from '/Users/nyquist/Dropbox (MIT)/shalek_data/Projects/Alexandria/alexandria/uploadHelpers/MetadataAdder.py'>

In [45]:
importlib.reload(uh)

<module 'widget_helpers' from '/Users/nyquist/Dropbox (MIT)/shalek_data/Projects/Alexandria/alexandria/uploadHelpers/widget_helpers.py'>

In [35]:
float("6.0")

6.0

In [48]:
# this is the workhorse cell of the metadata adder!
# TODO: give indication of which metadata has been added
# TODO: array type metadata is not handled at all yet
# Searching celltypes is crazy slow, maybe stop loading all of the children as search results?
meta_addr = ma.MetadataAdder(mapping_options, available_metadata, metadata_info, cell_level_metadata)

Dropdown(description='table with metadata to map', options=('cell level dataframe',), style=DescriptionStyle(d…

Button(description='select table', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Output()

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

FloatText(value=0.0, description='Old', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.0, description='No gran', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.0, description='New', style=DescriptionStyle(description_width='initial'))

Button(description='save numbers', style=ButtonStyle())

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

Dropdown(description='Old', options=('yes', 'no', 'unknown', ''), style=DescriptionStyle(description_width='in…

Dropdown(description='No gran', options=('yes', 'no', 'unknown', ''), style=DescriptionStyle(description_width…

Dropdown(description='New', options=('yes', 'no', 'unknown', ''), style=DescriptionStyle(description_width='in…

Button(description='map', style=ButtonStyle())

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

Dropdown(description='0.0', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor …

Dropdown(description='nan', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor …

Dropdown(description='1.0', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor …

Dropdown(description='0', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - …

Dropdown(description='0?', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor -…

Dropdown(description='1', options=('cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - …

Button(description='map', style=ButtonStyle())

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

HBox(children=(Text(value='Old', description='Old', placeholder='search term', style=DescriptionStyle(descript…

HBox(children=(Text(value='No gran', description='No gran', placeholder='search term', style=DescriptionStyle(…

HBox(children=(Text(value='New', description='New', placeholder='search term', style=DescriptionStyle(descript…

Button(description='save', style=ButtonStyle())

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

Dropdown(description='Old', options=('male', 'female', 'mixed', 'unknown', ''), style=DescriptionStyle(descrip…

Dropdown(description='No gran', options=('male', 'female', 'mixed', 'unknown', ''), style=DescriptionStyle(des…

Dropdown(description='New', options=('male', 'female', 'mixed', 'unknown', ''), style=DescriptionStyle(descrip…

Button(description='map', style=ButtonStyle())

Dropdown(description='my metadata column', options=('Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMA…

Dropdown(description='Alexandria metadata column', options=('ethnicity', 'race', 'mouse_strain', 'vaccination'…

Button(description='map ', layout=Layout(height='40px', width='auto'), style=ButtonStyle())

Button(description='save my maps, I am done with this matrix', layout=Layout(height='40px', width='auto'), sty…

In [49]:
cell_level_metadata = meta_addr.cell_level_metadata


In [50]:
cell_level_metadata

Unnamed: 0,species,species__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,CellID,donor_id,biosample_id,cell_type,cell_type__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,disease__time_since_onset,is_living,sample_type,sequencing_instrument_manufacturer_model,sequencing_instrument_manufacturer_model__ontology_label,sex
Array1_28818_AAAAAAGCGGTC,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAAAGCGGTC,28818,Array1_28818,CL_0000084,T cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAAATACGAC,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAAATACGAC,28818,Array1_28818,CL_0000097,mast cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAACAAGGAG,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAACAAGGAG,28818,Array1_28818,CL_0000097,mast cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAACCCTTCT,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAACCCTTCT,28818,Array1_28818,CL_0000235,macrophage,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAACCTGGGA,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAACCTGGGA,28818,Array1_28818,CL_0000084,T cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAACTGTGGG,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAACTGTGGG,28818,Array1_28818,CL_0000097,mast cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAAGGGATTG,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAAGGGATTG,28818,Array1_28818,CL_0000235,macrophage,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAAGTACACA,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAAGTACACA,28818,Array1_28818,CL_0000084,T cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAAGTGTGAC,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAAGTGTGAC,28818,Array1_28818,CL_0000236,B cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown
Array1_28818_AAAAATAAACAG,NCBITaxon_9544,Macaca mulatta,EFO_0008919,Seq-Well,Array1_28818_AAAAATAAACAG,28818,Array1_28818,CL_0000084,T cell,MONDO_0018076,tuberculosis,UBERON_0002170,upper lobe of right lung,6.0,no,direct from donor - fresh,EFO_0008637,Illumina NovaSeq 6000,unknown


In [23]:
cell_level_metadata["donor_id"].unique()

array(['BM01', 'BM02', 'BM03', 'BM04', 'BM05', 'BM06', 'BM07', 'BM08',
       'BM10', 'poop', 'BM11'], dtype=object)

Now if you have any more files to add, go back to _Step 3b_. If not go to the next cell

## Step 3c: Check that all required keys are mapped

In [51]:
required_keys = metadata_info.loc[metadata_info["required"]=="Yes"].index

In [52]:
for k in required_keys:
    if k not in cell_level_metadata.columns:
        #default?
        default_val = metadata_info.loc[k,"default"]
        if type(default_val) is str: # this is sketchy but right now all the defaults are strings so whatevs
            cell_level_metadata[k] = default_val
        else:
            print("You need to add a value for "+k+" before proceeding!")

To add these required values you should go back to Step 3b. 

## Step 3d: All numeric metadata needs units, so now add units...

In [53]:
# make a list of unit metadata that you have
my_metadata_info = metadata_info.loc[cell_level_metadata.columns]
unit_dropdowns = {}
name_id_dicts = {}
for dep_metadata in my_metadata_info.loc[~my_metadata_info["dependent"].isna()].index:
    m_name = metadata_info.loc[dep_metadata,"dependent"]
    
    if metadata_info.loc[m_name, "class"] == "ontology":
        ont = metadata_info.loc[m_name, "ontology"].split("/")[-1]
        list_for_dropdown, name_id_dict = uh.query_all_values_under_root(ont,metadata_info.loc[m_name, "ontology_root"])
        unit_dropdowns[m_name] = uh.choose_metadata_name_dropdown(list_for_dropdown,m_name)
        name_id_dicts[m_name] = name_id_dict
    else:
        unit_dropdowns[m_name] = widgets.Text(
                    value="type unit here",
                    placeholder='type unit here',
                    description=m_name,
                    disabled=False
                )
    
for n,v in unit_dropdowns.items():
    display(v)

1


Dropdown(description='disease__time_since_onset__unit', options=(("month: A time unit which is approximately e…

In [54]:
unit_values = {}
for n,v in unit_dropdowns.items():
    cells_with_dep = cell_level_metadata.loc[~cell_level_metadata[metadata_info.loc[n,"dependency"]].isna()].index
    val = v.value
    if n in name_id_dicts:
        cell_level_metadata.loc[cells_with_dep, n+"_label"] = name_id_dicts[n][val]
    cell_level_metadata.loc[cells_with_dep,n] = val



(13341, 17)

(4447, 8)

## Step 3e: Add in any unstructured metadata that you want to add (and their types)

In [55]:
print("here are all the metadata columns in your cell level files:")

print(mapping_options["cell level dataframe"].columns)



here are all the metadata columns in your cell level files:
Index(['Animal', 'Array', 'CellID', 'CellType', 'Granuloma', 'UMAP1', 'UMAP2',
       'batch', 'louvain', 'n_counts', 'n_genes', 'sample', 'leiden',
       'mapped_celltype', 'Date', 'treatment', 'infection dose', 'Tissue name',
       'CFU/granuloma', 'total thoracic CFU', 'lung region', 'celltypes all',
       'log1p_n_genes_by_counts', 'log1p_total_counts', 'n_genes_by_counts',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'pct_counts_in_top_50_genes',
       'percent_mito', 'total_counts', 'New', 'Gran State', 'sample_name'],
      dtype='object')


Metadata names need to have no spaces and only have alphanumeric and underscore characters. If some of the names you want to map do not meet this criteria, map them to acceptable values in the cell below

In [57]:
mapping_options["cell level dataframe"].rename(columns={"n_genes_by_counts":"num_genes","treatment":"depletion_type","CFU/granuloma":"CFU_per_granuloma",
                                                       "total thoracic CFU":"total_thoracic_CFU","infection dose":"infection_dose","lung region":"lung_region",
                                                       "Gran State":"granuloma_state"}, inplace=True)

In the dictionary below, replace the keys with column names in the choices printed above and replace the values with the type: group or numeric, of the column. Add as many as you would like

In [58]:
unstructured_metadata_types = {"total_thoracic_CFU":"numeric","infection_dose": "group",
                              "depletion_type":"group","CFU_per_granuloma":"numeric",
                              "num_genes":"numeric","lung_region":"group","percent_mito":"numeric",
                              "granuloma_state":"group","total_counts":"numeric"}

In [59]:
for m in unstructured_metadata_types:
    cell_level_metadata[m] = mapping_options["cell level dataframe"].loc[cell_level_metadata.index, m]

## Map the structured Alexandria metadata types

In [63]:
metadata_info['alexandria type']=metadata_info['type'].map({"string":"group","number":"numeric","boolean":"group"})

types_row = pd.DataFrame(index= ["TYPE"], columns = cell_level_metadata.columns)

for column in cell_level_metadata.columns:
    if column in metadata_info.index:
        types_row.loc["TYPE",column] = metadata_info.loc[column,'alexandria type']
        
    else:
        types_row.loc["TYPE", column] = unstructured_metadata_types[column]


final_metadata_dataframe = pd.concat([types_row, cell_level_metadata])

final_metadata_dataframe.index.name = "CELL"


# Write file to Alexandria format

In [66]:
final_metadata_dataframe.to_csv(output_dir+"/alexandria_structured_metadata.csv")

# Now what?

This is a description of how to figure out how to upload these files and a like to SCP (and how to get it in the alexandria namespace)