# Describe study metadata

Each screen contains an experiment with different parameters and conditions.

For each screen, use its IDR ID to extract this information from the IDR API and save the details to disk.

In [1]:
import pathlib
import time
import os
import requests
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import multiprocessing

In [2]:
def extract_study_info(session, screen_id):
    """Fetch the study-level metadata of one screen as a single-row table.

    Parameters
    ----------
    session: Requests.session()
        Requests session providing access to IDR API
    screen_id: int
        ID of the screen data set

    Returns
    -------
    pandas.DataFrame() with one row: one column per key/value pair of the
    screen's study-info map annotation, plus the columns internal_id,
    upload_date, idr_name and screen_id
    """
    endpoint = "https://idr.openmicroscopy.org/webclient/api/annotations/"
    query = f"?type=map&screen={screen_id}"
    payload = session.get(f"{endpoint}{query}").json()

    # Pick the first annotation filed under the study-info namespace;
    # list.index raises ValueError when the screen has none.
    annotations = payload["annotations"]
    namespaces = [annotation["ns"] for annotation in annotations]
    study_info = annotations[
        namespaces.index('idr.openmicroscopy.org/study/info')]

    annotation_id = study_info["id"]
    annotation_date = study_info["date"]
    parent_name = study_info["link"]["parent"]["name"]

    # Each entry of "values" is a (key, value) pair; later duplicates win.
    key_value_pairs = {pair[0]: pair[1] for pair in study_info["values"]}

    details_ = pd.DataFrame(key_value_pairs, index=[0])
    details_ = details_.assign(
        internal_id=annotation_id,
        upload_date=annotation_date,
        idr_name=parent_name,
        screen_id=screen_id,
    )
    return details_


def _annotation_value(annotations, namespace, key, default):
    """Return the value under `key` in the first annotation of `namespace`.

    `default` is returned when no annotation carries that namespace or when
    the matching annotation does not define `key`.
    """
    for annotation in annotations:
        if annotation.get("ns") == namespace:
            pairs = {pair[0]: pair[1]
                     for pair in annotation.get("values", [])}
            return pairs.get(key, default)
    return default


def _split_channels(channels):
    """Split a "stain:target;stain:target" string into (stains, targets) sets."""
    stains = set()
    targets = set()
    for entry in channels.split(";"):
        parts = entry.split(":")
        stains.add(parts[0])
        # An entry without ":" names a stain but no target.
        if len(parts) > 1:
            targets.add(parts[1])
    return stains, targets


def describe_screen(screen_id):
    """Pull additional metadata info per image, given screen id

    Queries the IDR API for every plate of the screen and, for each image
    found in the plate grid, for that image's map annotations. Prints the
    number of plates found.

    Parameters
    ----------
    screen_id: int
        ID of the screen data set

    Returns
    -------
    pandas.DataFrame() with one row per image and the following columns:

        screen_id: IDR ID for the screen
        plate_id: IDR ID for each plate
        plate_name: Names given to each plate
        image_id: IDR ID for each image
        cell_line: Cell line used in the screen experiment
        strain: Strain of the cell line (if specified)
        gene_identifier: Accession code for the gene being perturbed in a well
        phenotype_identifier: Accession code for the phenotype perturbed in a well
        stain: Set of the stains used in the screen
        stain_target: The target protein or media of the stain
        pixel_size_x: Width of the image
        pixel_size_y: Height of the image

    A plate whose grid cannot be read contributes a single row holding only
    screen_id, plate_id and plate_name; every other column is None.
    Annotation values that are missing are reported as "Not listed"
    ("Not Listed" for the gene and phenotype identifiers).
    """
    columns = [
        "screen_id",
        "plate_id",
        "plate_name",
        "image_id",
        "cell_line",
        "strain",
        "gene_identifier",
        "phenotype_identifier",
        "stain",
        "stain_target",
        "pixel_size_x",
        "pixel_size_y"
    ]
    bulk_ns = 'openmicroscopy.org/omero/bulk_annotations'
    plate_results = []

    # The context manager releases pooled connections when the screen is done.
    with requests.Session() as session:
        # Map plate id -> plate name for every plate of the screen
        plates_url = f"https://idr.openmicroscopy.org/webclient/api/plates/?id={screen_id}"
        all_plates = session.get(plates_url).json()["plates"]
        study_plates = {x["id"]: x["name"] for x in all_plates}
        print(f"Number of plates found in screen {screen_id}: ", len(study_plates))

        for plate, plate_name in study_plates.items():
            # The plate grid JSON carries the image size and, per well, a
            # thumbnail URL whose last path component is the image id.
            wells_images_url = f"https://idr.openmicroscopy.org/webgateway/plate/{plate}/"
            grid = session.get(wells_images_url).json()

            try:
                pixel_size_x = grid["image_sizes"][0]["x"]
                pixel_size_y = grid["image_sizes"][0]["y"]

                # NOTE(review): only the first grid row is read, as in the
                # original code - confirm that this sampling is intended.
                # Entries that are None (presumably empty wells) are skipped.
                image_ids = [
                    well["thumb_url"].rstrip("/").split("/")[-1]
                    for well in grid["grid"][0]
                    if well is not None
                ]
            except (ValueError, KeyError, IndexError):
                # Keep the plate in the output, padded to the full column
                # count so the row lines up with `columns`.
                plate_results.append(
                    [screen_id, plate, plate_name]
                    + [None] * (len(columns) - 3))
                continue

            # Get image details from each image id for each plate
            for image_id in image_ids:
                map_url = f"https://idr.openmicroscopy.org/webclient/api/annotations/?type=map&image={image_id}"
                annotations = session.get(map_url).json()["annotations"]

                # Stain and stain target come from the bulk annotation's
                # "Channels" entry.
                channels = _annotation_value(
                    annotations, bulk_ns, "Channels", None)
                if isinstance(channels, str):
                    stain, stain_target = _split_channels(channels)
                else:
                    stain = "Not listed"
                    stain_target = "Not listed"

                cell_line = _annotation_value(
                    annotations, 'openmicroscopy.org/mapr/cell_line',
                    "Cell Line", "Not listed")

                # Looked up on this image's own bulk annotation, never on an
                # index left over from a previous image.
                strain = _annotation_value(
                    annotations, bulk_ns, "Strain", "Not listed")

                gene_identifier = _annotation_value(
                    annotations, 'openmicroscopy.org/mapr/gene',
                    "Gene Identifier", "Not Listed")

                phenotype_identifier = _annotation_value(
                    annotations, 'openmicroscopy.org/mapr/phenotype',
                    "Phenotype Term Accession", "Not Listed")

                plate_results.append([
                    screen_id,
                    plate,
                    plate_name,
                    image_id,
                    cell_line,
                    strain,
                    gene_identifier,
                    phenotype_identifier,
                    stain,
                    stain_target,
                    pixel_size_x,
                    pixel_size_y
                ])

    # Output results
    plate_results_df = pd.DataFrame(plate_results, columns=columns)
    return plate_results_df

In [3]:
# Data directory, given relative to the current working directory
data_dir = pathlib.Path() / "data"

In [4]:
# Load IDR ids
# pathlib keeps "~" as a literal path component, so expand it explicitly
# instead of relying on each downstream reader/writer to do so.
data_dir = pathlib.Path(
    "~/Documents/publicly-available-microscopy-data/IDR/data").expanduser()
id_file = data_dir / "idr_ids.tsv"
id_df = pd.read_csv(id_file, sep="\t")

print(id_df.shape)
id_df.head(10)

(198, 5)


Unnamed: 0,id,name,title,description,category
0,3,idr0001-graml-sysgro/screenA,A genomic Multiprocess survey of machineries t...,Primary screen of fission yeast knock out muta...,Screen
1,102,idr0002-heriche-condensation/screenA,Integration of biological data by kernels on g...,Screen of 100 candidate genes predicted to be ...,Screen
2,51,idr0003-breker-plasticity/screenA,A novel single-cell screening platform reveals...,Screen to characterize yeast stress responses ...,Screen
3,202,idr0004-thorpe-rad52/screenA,Bringing Rad52 foci into focus.,This screen assesses the proportion of cells c...,Screen
4,597,idr0005-toret-adhesion/screenA,A genome-wide screen identifies conserved prot...,A genome-wide RNAi screen for loss of DE- cadh...,Screen
5,751,idr0005-toret-adhesion/screenB,A genome-wide screen identifies conserved prot...,"Re-screen of 803 hits from the first screen, u...",Screen
6,253,idr0006-fong-nuclearbodies/screenA,Whole-genome screening identifies proteins loc...,A whole-genome screening for proteins localize...,Screen
7,201,idr0007-srikumar-sumo/screenA,Global analysis of SUMO chain function reveals...,In this screen two mutant yeast strains unable...,Screen
8,154,idr0008-rohn-actinome/screenA,Comparative RNAi screening identifies a conser...,"A genome-wide, high-content RNAi screen in the...",Screen
9,206,idr0008-rohn-actinome/screenB,Comparative RNAi screening identifies a conser...,This screen consists of siRNAs targetting 516 ...,Screen


In [5]:
# Create http session
INDEX_PAGE = "https://idr.openmicroscopy.org/webclient/?experimenter=-1"

# The session is reused by later cells, so it must stay open here; wrapping
# it in a `with` block would close it before those cells run. Call
# session.close() once all requests are done.
session = requests.Session()
response = session.get(INDEX_PAGE)

# Raises requests.HTTPError for 4xx/5xx responses, does nothing otherwise
response.raise_for_status()

In [6]:
# Extract summary details for all screens
screen_ids = id_df.loc[id_df["category"] == "Screen", "id"].tolist()

# One single-row frame per screen, stacked into one table with a fresh index
per_screen_details = [
    extract_study_info(session=session, screen_id=sid) for sid in screen_ids
]
screen_details_df = pd.concat(per_screen_details, axis=0, ignore_index=True)

output_file = data_dir / "screen_details.tsv"
screen_details_df.to_csv(output_file, sep="\t", index=False)

print(screen_details_df.shape)
screen_details_df.head(3)

There are a total of 80 screens
(80, 24)


Unnamed: 0,Sample Type,Organism,Study Type,Screen Type,Screen Technology Type,Imaging Method,Publication Title,Publication Authors,PubMed ID,PMC ID,...,External URL,Annotation File,internal_id,upload_date,idr_name,screen_id,Study Title,Data Publisher,Data DOI,BioStudies Accession
0,cell,Schizosaccharomyces pombe,high content screen,primary screen,gene deletion screen,spinning disk confocal microscopy,A genomic Multiprocess survey of machineries t...,"Graml V, Studera X, Lawson JLD, Chessel A, Gey...",25373780 https://www.ncbi.nlm.nih.gov/pubmed/2...,PMC4648281 https://www.ncbi.nlm.nih.gov/pmc/ar...,...,www.sysgro.org,idr0001-screenA-annotation.csv https://github....,20516150,2019-06-04T10:43:11+01:00,idr0001-graml-sysgro/screenA,3,,,,
1,cell,Homo sapiens,high content screen,primary screen,RNAi screen,fluorescence microscopy,Integration of biological data by kernels on g...,"Hériché JK, Lees JG, Morilla I, Walter T, Petr...",24943848 https://www.ncbi.nlm.nih.gov/pubmed/2...,PMC4142622 https://www.ncbi.nlm.nih.gov/pmc/ar...,...,,idr0002-screenA-annotation.csv https://github....,20516151,2019-06-04T10:43:19+01:00,idr0002-heriche-condensation/screenA,102,Focused mitotic chromsome condensaton screen u...,,,
2,cell,Saccharomyces cerevisiae,high content screen,primary screen,protein screen,fluorescence microscopy,A novel single-cell screening platform reveals...,"Breker M, Gymrek M, Schuldiner M",23509072 https://www.ncbi.nlm.nih.gov/pubmed/2...,PMC3601363 https://www.ncbi.nlm.nih.gov/pmc/ar...,...,http://www.weizmann.ac.il/molgen/loqate/,idr0003-screenA-annotation.csv https://github....,20516152,2019-06-04T10:43:22+01:00,idr0003-breker-plasticity/screenA,51,,,,


In [7]:
# Subset the data
test_indices = [0, 1, 2, 3]
test_screen_ids = [screen_ids[i] for i in test_indices]

# Start the clock used for the run-time report
start = time.time()

# os.sched_getaffinity is not available on every platform; fall back to the
# total CPU count where it is missing.
try:
    available_cores = len(os.sched_getaffinity(0))
except AttributeError:
    available_cores = os.cpu_count() or 1

# Initialize Pool object for multiprocessing (one worker process per core)
pool = multiprocessing.Pool(processes=available_cores)
print(f"\nNow processing {len(test_screen_ids)} screens with {available_cores} cpu cores.\n")

try:
    # Pull pertinent details about the screen (plates, wells, channels, cell line, etc.)
    plate_results_dfs = pool.map(describe_screen, test_screen_ids)
finally:
    # Shut the workers down even when a screen fails
    pool.close()
    pool.join()

Now processing screen: 3
192 plates found. Done

Now processing screen: 102
12 plates found. Done

Now processing screen: 51
85 plates found. Done

Now processing screen: 202
47 plates found. Done

Now processing screen: 597
141 plates found. Done

Now processing screen: 751
18 plates found. Done

Now processing screen: 253
169 plates found. Done

Now processing screen: 201
12 plates found. Done

Now processing screen: 154
58 plates found. Done

Now processing screen: 206
11 plates found. Done

Now processing screen: 251
500 plates found. Done

Now processing screen: 803
70 plates found. Done

Now processing screen: 1351
148 plates found. Done

Now processing screen: 1501
129 plates found. Done

Now processing screen: 1551
40 plates found. Done

Now processing screen: 1601
4 plates found. Done

Now processing screen: 1602
8 plates found. Done

Now processing screen: 1603
1 plates found. Done

Now processing screen: 1202
68 plates found. Done

Now processing screen: 1101
500 plates foun

In [8]:
# Combine to create full dataframe
all_plate_results_df = pd.concat(plate_results_dfs, ignore_index=True)

# Collect imaging method metadata for each screen. Columns are addressed by
# name rather than by position, so the lookup keeps working if the column
# order of screen_details_df changes.
img_screen_index = dict(
    zip(screen_details_df["screen_id"], screen_details_df["Imaging Method"])
)

# Map imaging method per screen to the final data frame
all_plate_results_df["imaging_method"] = all_plate_results_df["screen_id"].map(
    img_screen_index)

print(f'Metadata collected. Running cost is {(time.time()-start)/60:.1f} min. ', 'Now saving file.')

# Save data frame as a single parquet file
# NOTE(review): the stain / stain_target columns may hold Python set objects,
# which pyarrow may not be able to serialize - confirm before relying on this.
output_file = pathlib.Path(data_dir, "plate_details_per_screen.parquet")
pq_table = pa.Table.from_pandas(all_plate_results_df)
pq.write_table(pq_table, output_file)

print(all_plate_results_df.shape)
all_plate_results_df.head(10)

(5217, 8)


Unnamed: 0,screen_id,plate_id,plate_name,n_wells,cell_line,channels,pixel_size_x,pixel_size_y
0,3,2551,JL_120731_S6A,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
1,3,2552,JL_120731_S6B,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
2,3,2554,JL_120801_S7A,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
3,3,2553,JL_120801_S7B,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
4,3,2555,JL_120802_S8A,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
5,3,2556,JL_120802_S8B,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
6,3,2557,JL_120803_S9A,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
7,3,2558,JL_120803_S9B,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
8,3,2559,JL_120804_S10A,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
9,3,2560,JL_120804_S10B,96,Not listed,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,1376,1040
