# Extract image features

**Note:** This does not include any processing of the features (e.g. normalization/feature selection/etc.)

## Import libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import annotate
from pycytominer.cyto_utils import output

import sys
sys.path.append("../../utils")
import extract_image_features_utils as extract_utils

## Set paths and constants

In [2]:
# Directories and files used throughout the notebook
cp_output_dir = pathlib.Path("../../3.cellprofiler_analysis/analysis_output")
features_output_dir = pathlib.Path("./data")
platemap_path = pathlib.Path("../../../metadata/Interstellar_plate2_platemap.csv")
platemap_df = pd.read_csv(platemap_path)

# Image-level measurement categories to extract from the Per_Image table
image_feature_categories = [
    "Image_Correlation",
    "Image_Granularity",
    "Image_Texture",
    "Image_Intensity",
]
image_cols = "ImageNumber"
# Strata columns could be used to group by and/or aggregate; here they are kept so
# that every row of features can be traced back to the exact image it came from
strata = ["Image_Metadata_Well", "Image_Metadata_Plate", "Image_Metadata_Site"]

# Directory holding the SQLite files (strict resolution raises if it does not exist)
sqlite_dir = pathlib.Path("/projects/mlippincott@xsede.org/").resolve(strict=True)

# Suffix appended to each batch's destination parquet file name, in batch order.
# NOTE(review): batches 1-2 carry an "_image_quality" suffix while batches 3-7 do
# not; the values are reproduced unchanged here -- confirm which naming is intended.
dest_suffix_per_batch = ["_image_quality", "_image_quality", "", "", "", "", ""]

# One entry per batch run: the SQLite file CellProfiler wrote ("source_path") and
# the parquet file associated with that batch ("dest_path"), both as strings
run_info_dictionary = {
    f"batch_{batch_number}": {
        "source_path": str(sqlite_dir / f"PBMC_batch_{batch_number}.sqlite"),
        "dest_path": str(
            features_output_dir / f"PBMC_batch_{batch_number}{dest_suffix}.parquet"
        ),
    }
    for batch_number, dest_suffix in enumerate(dest_suffix_per_batch, start=1)
}


## Load in the `Per_Image` table as a df for each SQLite file (one per batch run) and combine into one df

In [3]:
# Read the Per_Image table of every batch run into its own dataframe.
# Iterating over `run_info_dictionary` (insertion order: batch_1 ... batch_7) replaces
# one copy-pasted block per run, so adding or removing a batch only means editing
# the dictionary.
image_dfs_per_run = []
for run_info in run_info_dictionary.values():
    # `source_path` is already an absolute path, so gluing it onto `cp_output_dir`
    # with string formatting pointed at a location that does not hold the file.
    # pathlib's `/` returns an absolute right-hand operand unchanged and only
    # prefixes `cp_output_dir` when `source_path` is relative, so both layouts work.
    sqlite_path = cp_output_dir / run_info["source_path"]
    image_dfs_per_run.append(
        extract_utils.load_sqlite_as_df(
            # presumably a SQLAlchemy-style URL ("sqlite:///" + path) -- confirm
            # against extract_image_features_utils.load_sqlite_as_df
            sqlite_file_path=f"sqlite:///{sqlite_path}",
            image_table_name="Per_Image",
        )
    )

# merge the per-run dataframes together into one combined dataframe
PBMC_run_df = pd.concat(image_dfs_per_run, ignore_index=True)

print(PBMC_run_df.shape)
PBMC_run_df.head()

(2464, 630)


Unnamed: 0,ImageNumber,Image_Correlation_Correlation_CorrDNA_CorrER,Image_Correlation_Correlation_CorrDNA_CorrGasdermin,Image_Correlation_Correlation_CorrDNA_CorrMito,Image_Correlation_Correlation_CorrDNA_CorrPM,Image_Correlation_Correlation_CorrER_CorrGasdermin,Image_Correlation_Correlation_CorrER_CorrMito,Image_Correlation_Correlation_CorrER_CorrPM,Image_Correlation_Correlation_CorrGasdermin_CorrMito,Image_Correlation_Correlation_CorrGasdermin_CorrPM,...,Image_Width_IllumDNA,Image_Width_IllumER,Image_Width_IllumGasdermin,Image_Width_IllumMito,Image_Width_IllumPM,Image_Width_OrigDNA,Image_Width_OrigER,Image_Width_OrigGasdermin,Image_Width_OrigMito,Image_Width_OrigPM
0,1,0.571709,0.573797,0.539813,0.562183,0.995418,0.859627,0.829783,0.857063,0.834798,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
1,2,0.200815,0.192012,0.59992,0.543271,0.988046,0.292342,0.407301,0.281361,0.402754,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
2,3,0.284507,0.289991,0.526236,0.593636,0.991492,0.4349,0.498339,0.437262,0.508183,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
3,4,0.330648,0.327711,0.545466,0.55347,0.995623,0.490845,0.547498,0.486403,0.543832,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
4,5,0.244275,0.253517,0.553556,0.552547,0.99615,0.338766,0.428687,0.351297,0.443756,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160


## Extract the image features, annotate with metadata, and save as parquet file

In [4]:
# extract the image features (the categories listed in `image_feature_categories`)
# from the merged PBMC runs image table df
image_features_df = extract_utils.extract_image_features(
    image_feature_categories=image_feature_categories,
    image_df=PBMC_run_df,
    image_cols=image_cols,
    strata=strata
)

# annotate df with platemap file to include all metadata;
# `join_on` pairs the platemap well column with the profiles' well column
annotated_image_features_df = annotate(
    profiles=image_features_df,
    platemap=platemap_df,
    join_on=["Metadata_well_id", "Image_Metadata_Well"],
    output_file="none",
)

# make sure the output directory exists so writing the parquet file does not fail
# on a fresh checkout where it has not been created yet
features_output_dir.mkdir(parents=True, exist_ok=True)

# output df as parquet file
output(
    df=annotated_image_features_df,
    output_filename=pathlib.Path(f"{features_output_dir}/plate2_PBMC_image_features.parquet"),
    output_type='parquet',
)
print("The image features for the PBMC cells have been extracted and saved!")

The image features for the SHSY5Y cells have been extracted and saved!


## Confirm that the annotation worked

Check to see that data doesn't show NaNs when there should be values.

In [5]:
# Print the (rows, columns) shape of the annotated dataframe, then display its
# first rows so the metadata columns can be inspected for unexpected NaNs
print(annotated_image_features_df.shape)
annotated_image_features_df.head()

(2464, 520)


Unnamed: 0,Metadata_cell_type,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,Metadata_inducer2,Metadata_inducer2_concentration,...,Image_Texture_Variance_CorrGasdermin_3_02_256,Image_Texture_Variance_CorrGasdermin_3_03_256,Image_Texture_Variance_CorrMito_3_00_256,Image_Texture_Variance_CorrMito_3_01_256,Image_Texture_Variance_CorrMito_3_02_256,Image_Texture_Variance_CorrMito_3_03_256,Image_Texture_Variance_CorrPM_3_00_256,Image_Texture_Variance_CorrPM_3_01_256,Image_Texture_Variance_CorrPM_3_02_256,Image_Texture_Variance_CorrPM_3_03_256
0,SH-SY5Y,6,Media ctr,,,media ctr,,,,,...,3.32307,3.316185,6.186644,6.187249,6.188951,6.187248,4.08747,4.085558,4.087237,4.085559
1,SH-SY5Y,6,Media ctr,,,media ctr,,,,,...,66.882381,66.968605,5.845572,5.851207,5.848049,5.851201,7.755544,7.759332,7.758575,7.759326
2,SH-SY5Y,6,Media ctr,,,media ctr,,,,,...,15.367783,15.378558,4.992922,4.991233,4.996662,4.991235,5.752803,5.754091,5.758829,5.754094
3,SH-SY5Y,6,Media ctr,,,media ctr,,,,,...,14.033084,14.042549,7.349496,7.35547,7.360959,7.355476,6.226774,6.226449,6.229304,6.226446
4,SH-SY5Y,6,Media ctr,,,media ctr,,,,,...,22.451863,22.479333,7.304544,7.310332,7.306508,7.310332,5.554351,5.553939,5.55261,5.553932


In [6]:
# List the distinct values found in the Metadata_inhibitor column
annotated_image_features_df["Metadata_inhibitor"].unique()

array(['Media ctr', 'DMSO', 'Z-VAD-FMK', 'Disulfiram'], dtype=object)

In [7]:
# Display only the rows whose inhibitor is DMSO
is_dmso_row = annotated_image_features_df["Metadata_inhibitor"].eq("DMSO")
annotated_image_features_df.loc[is_dmso_row]

Unnamed: 0,Metadata_cell_type,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,Metadata_inducer2,Metadata_inducer2_concentration,...,Image_Texture_Variance_CorrGasdermin_3_02_256,Image_Texture_Variance_CorrGasdermin_3_03_256,Image_Texture_Variance_CorrMito_3_00_256,Image_Texture_Variance_CorrMito_3_01_256,Image_Texture_Variance_CorrMito_3_02_256,Image_Texture_Variance_CorrMito_3_03_256,Image_Texture_Variance_CorrPM_3_00_256,Image_Texture_Variance_CorrPM_3_01_256,Image_Texture_Variance_CorrPM_3_02_256,Image_Texture_Variance_CorrPM_3_03_256
16,SH-SY5Y,6,DMSO,1.000,%,DMSO,0.100,%,,,...,10.833351,10.844102,7.412987,7.418720,7.412234,7.418713,5.671746,5.671432,5.669221,5.671420
17,SH-SY5Y,6,DMSO,1.000,%,DMSO,0.100,%,,,...,11.982584,11.993703,5.794548,5.797689,5.793648,5.797691,6.019337,6.018265,6.018151,6.018261
18,SH-SY5Y,6,DMSO,1.000,%,DMSO,0.100,%,,,...,17.794940,17.816688,3.572679,3.574138,3.571166,3.574139,6.258081,6.205844,6.203565,6.205849
19,SH-SY5Y,6,DMSO,1.000,%,DMSO,0.100,%,,,...,5.068424,5.071049,2.378047,2.379838,2.378448,2.379838,4.323410,4.323182,4.324006,4.323182
20,SH-SY5Y,6,DMSO,1.000,%,DMSO,0.100,%,,,...,3.919985,3.917862,2.125729,2.124586,2.124367,2.124588,3.813297,3.810492,3.814835,3.810491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2427,SH-SY5Y,6,DMSO,0.025,%,Flagellin,1.000,µg/ml,,,...,36.101932,36.146832,7.878755,7.882810,7.880108,7.882778,5.727817,5.729555,5.727708,5.729527
2428,SH-SY5Y,6,DMSO,0.025,%,Flagellin,1.000,µg/ml,,,...,8.714428,8.721619,15.754548,15.770707,15.755178,15.770727,5.694403,5.695373,5.694790,5.695394
2429,SH-SY5Y,6,DMSO,0.025,%,Flagellin,1.000,µg/ml,,,...,9.797140,9.802990,6.886099,6.886494,6.885284,6.886481,5.805150,5.806995,5.805431,5.806999
2430,SH-SY5Y,6,DMSO,0.025,%,Flagellin,1.000,µg/ml,,,...,21.626081,21.643735,11.423734,11.431489,11.425014,11.431495,5.539756,5.540412,5.539653,5.540408
