# Annotate merged single cells with metadata from platemap file for each plate

## Import libraries

In [1]:
import sys
import pathlib
import os
import yaml
import json

import pandas as pd
from pycytominer import annotate
from pycytominer.cyto_utils import output

sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# directory where the annotated single-cell parquet files are written
output_dir = pathlib.Path("./data/annotated_data")
# create the directory (and any missing parents); no error is raised if it already exists
output_dir.mkdir(parents=True, exist_ok=True)

# load in the plate info dictionary from the yaml file
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with dictionary_path.open() as yaml_file:
    plate_info_dictionary = yaml.load(yaml_file, Loader=yaml.FullLoader)

# platemap file used to annotate each plate
# (plates 3 and 3 prime share the same platemap file because they have the same metadata)
platemap_files = {
    "Plate_1": "../0.download_data/metadata/platemap_NF1_plate1.csv",
    "Plate_2": "../0.download_data/metadata/platemap_NF1_plate2.csv",
    "Plate_3": "../0.download_data/metadata/platemap_NF1_plate3.csv",
    "Plate_3_prime": "../0.download_data/metadata/platemap_NF1_plate3.csv",
}

# add the platemap paths to the dictionary so they can be used for annotation
for plate_name, platemap_file in platemap_files.items():
    plate_info_dictionary[plate_name]["platemap_path"] = str(pathlib.Path(platemap_file))

# view the dictionary to assess that all info is added correctly
print(json.dumps(plate_info_dictionary, indent=4))

{
    "Plate_1": {
        "dest_path": "data/converted_data/Plate_1.parquet",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate1.csv"
    },
    "Plate_2": {
        "dest_path": "data/converted_data/Plate_2.parquet",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate2.csv"
    },
    "Plate_3": {
        "dest_path": "data/converted_data/Plate_3.parquet",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate3.csv"
    },
    "Plate_3_prime": {
        "dest_path": "data/converted_data/Plate_3_prime.parquet",
        "source_path": "../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite",
        "platemap_path": "../0.download_data/metadata/platemap_NF1_plate3.csv"

## Annotate merged single cells

In [3]:
# metadata columns moved to the front of each annotated df (for easy visualization in python)
front_columns = ["Metadata_Well", "Metadata_number_of_singlecells"]

for plate, info in plate_info_dictionary.items():
    # merged single cells loaded from the converted parquet file, plus the platemap for this plate
    single_cell_df = pd.read_parquet(info["dest_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    # destination of the annotated file; also stored in the dictionary for downstream use
    # (`info` is the same nested dict as `plate_info_dictionary[plate]`)
    output_file = str(output_dir / f"{plate}_sc.parquet")
    info["annotated_path"] = output_file
    print(f"Adding annotations to merged single cells for {plate}!")

    # add metadata from the platemap file to the extracted single cell features;
    # join_on pairs the platemap well column with the well column of the profiles
    well_join_columns = ["Metadata_well_position", "Image_Metadata_Well"]
    annotated_df = annotate(
        profiles=single_cell_df,
        platemap=platemap_df,
        join_on=well_join_columns,
    )

    # remove both columns first, then re-insert them at positions 1 and 2
    # (directly after the first column of the dataframe)
    removed_columns = {name: annotated_df.pop(name) for name in front_columns}
    for position, name in enumerate(front_columns, start=1):
        annotated_df.insert(position, name, removed_columns[name])

    # save the annotated df as a parquet file
    output(
        df=annotated_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Annotations have been added to {plate} and saved!")

Adding annotations to merged single cells for Plate_1!
Annotations have been added to Plate_1 and saved!
Adding annotations to merged single cells for Plate_2!
Annotations have been added to Plate_2 and saved!
Adding annotations to merged single cells for Plate_3!
Annotations have been added to Plate_3 and saved!
Adding annotations to merged single cells for Plate_3_prime!
Annotations have been added to Plate_3_prime and saved!


In [4]:
# sanity check that annotation occurred: `annotated_df` still holds the dataframe from the
# final iteration of the annotation loop, so print its (rows, columns) shape and
# display its first rows (the bare expression on the last line is rendered by the notebook)
print(annotated_df.shape)
annotated_df.head()

(4098, 1597)


Unnamed: 0,Metadata_WellRow,Metadata_Well,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_ImageNumber,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,G,G4,388,4,NF1,WT,4000,1651,1,1,...,1192.902866,1094.22489,84.848068,75.654841,82.65422,81.654833,219.422496,219.431461,215.112282,213.774707
1,G,G4,388,4,NF1,WT,4000,1651,2,2,...,1359.546002,1322.896817,46.111026,47.824555,54.34909,47.782485,163.445508,164.258567,160.110226,156.69313
2,G,G4,388,4,NF1,WT,4000,1651,3,3,...,864.58548,817.911062,25.834279,26.309631,29.205057,26.448667,158.885066,156.440878,161.265018,162.068418
3,G,G4,388,4,NF1,WT,4000,1651,4,4,...,706.547653,679.70237,17.301069,16.879489,18.67935,17.938121,201.177069,213.167484,207.48202,190.636553
4,G,G4,388,4,NF1,WT,4000,1651,5,5,...,941.04943,908.335782,52.4727,64.080576,52.398347,50.52007,183.724946,187.010835,183.114493,173.291387


## Write updated dictionary to yaml file for use in downstream steps

In [5]:
# overwrite the original yaml file with the updated dictionary, which now also
# contains the platemap and annotated file paths added in this notebook
with open(dictionary_path, mode="w") as yaml_file:
    yaml.dump(plate_info_dictionary, yaml_file)