# Extract image features

**Note:** This notebook only extracts the image features; it does not process them further (e.g., normalization or feature selection).

## Import libraries

In [1]:
import pathlib
import pandas as pd

from pycytominer import annotate
from pycytominer.cyto_utils import output

import sys
sys.path.append("../../utils")
import extract_image_features_utils as extract_utils

## Set paths and constants

In [2]:
# Directory constants: where the CellProfiler SQLite output is read from and
# where this notebook writes the extracted features
cp_output_dir = pathlib.Path("../3.cellprofiler_analysis/analysis_output")
features_output_dir = pathlib.Path("./data")

# Platemap metadata that is later joined onto the image features by well
platemap_df = pd.read_csv(pathlib.Path("../../metadata/Interstellar_plate2_platemap.csv"))

# Image categories/measurements to extract
image_feature_categories = [
    "Image_Correlation",
    "Image_Granularity",
    "Image_Texture",
    "Image_Intensity",
]
# Passed to the extraction helper as `image_cols` and `strata`, respectively
image_cols = "ImageNumber"
strata = ["Image_Metadata_Well", "Image_Metadata_Plate", "Image_Metadata_Site"]

# Per-run settings: the SQLite file produced by each CellProfiler run and a
# per-run output path.
# NOTE(review): `image_features_output_file` is not read anywhere in the cells
# of this notebook — confirm whether it is still needed.
run_info_dictionary = {
    "SHSY5Y_first_run": {
        "sql_file": "SHSY5Y_cells_incomplete_first_run.sqlite",
        "image_features_output_file": features_output_dir / "first_run_image_quality.csv.gz",
    },
    "SHSY5Y_second_run": {
        "sql_file": "SHSY5Y_cells_second_run.sqlite",
        "image_features_output_file": features_output_dir / "second_run_image_quality.csv.gz",
    },
}


## Load in the `Per_Image` table as df for both SQLite files (each run) and combine into one df

In [3]:
def load_per_image_table(run_name):
    """Load the `Per_Image` table of one run's SQLite file as a dataframe.

    Parameters
    ----------
    run_name : str
        Key of the run in `run_info_dictionary`; its "sql_file" entry names
        the SQLite file inside `cp_output_dir`.

    Returns
    -------
    The object returned by `extract_utils.load_sqlite_as_df` for the
    `Per_Image` table (used below as a dataframe in `pd.concat`).

    Raises
    ------
    KeyError
        If `run_name` is not a key of `run_info_dictionary`.
    FileNotFoundError
        If the SQLite file for the run does not exist.
    """
    sql_file = run_info_dictionary[run_name]["sql_file"]

    # Fail early with an explicit message instead of relying on whatever
    # error the SQLite loader raises for a missing file
    sqlite_path = cp_output_dir / sql_file
    if not sqlite_path.is_file():
        raise FileNotFoundError(
            f"SQLite file for run '{run_name}' not found: {sqlite_path}"
        )

    return extract_utils.load_sqlite_as_df(
        sqlite_file_path=f"sqlite:///{cp_output_dir}/{sql_file}",
        image_table_name="Per_Image",
    )


# read in SQLite Per_Image table as dataframe for each run
image_df_first_run = load_per_image_table("SHSY5Y_first_run")
image_df_second_run = load_per_image_table("SHSY5Y_second_run")

# stack the rows of both runs into one combined dataframe (fresh 0..n-1 index)
SHSY5Y_run_df = pd.concat([image_df_first_run, image_df_second_run], ignore_index=True)

print(SHSY5Y_run_df.shape)
SHSY5Y_run_df.head()

(2464, 630)


Unnamed: 0,ImageNumber,Image_Correlation_Correlation_CorrDNA_CorrER,Image_Correlation_Correlation_CorrDNA_CorrGasdermin,Image_Correlation_Correlation_CorrDNA_CorrMito,Image_Correlation_Correlation_CorrDNA_CorrPM,Image_Correlation_Correlation_CorrER_CorrGasdermin,Image_Correlation_Correlation_CorrER_CorrMito,Image_Correlation_Correlation_CorrER_CorrPM,Image_Correlation_Correlation_CorrGasdermin_CorrMito,Image_Correlation_Correlation_CorrGasdermin_CorrPM,...,Image_Width_IllumDNA,Image_Width_IllumER,Image_Width_IllumGasdermin,Image_Width_IllumMito,Image_Width_IllumPM,Image_Width_OrigDNA,Image_Width_OrigER,Image_Width_OrigGasdermin,Image_Width_OrigMito,Image_Width_OrigPM
0,1,0.571709,0.573797,0.539813,0.562183,0.995418,0.859627,0.829783,0.857063,0.834798,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
1,2,0.200815,0.192012,0.59992,0.543271,0.988046,0.292342,0.407301,0.281361,0.402754,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
2,3,0.284507,0.289991,0.526236,0.593636,0.991492,0.4349,0.498339,0.437262,0.508183,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
3,4,0.330648,0.327711,0.545466,0.55347,0.995623,0.490845,0.547498,0.486403,0.543832,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
4,5,0.244275,0.253517,0.553556,0.552547,0.99615,0.338766,0.428687,0.351297,0.443756,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160


## Extract the image features, annotate them with metadata, and save as a parquet file

In [4]:
# extract the selected image feature categories from the merged SHSY5Y runs
# image table df
image_features_df = extract_utils.extract_image_features(
    image_feature_categories=image_feature_categories,
    image_df=SHSY5Y_run_df,
    image_cols=image_cols,
    strata=strata,
)

# annotate df with platemap file to include all metadata
# `output_file` is left at its default so that annotate returns the annotated
# dataframe instead of writing it to disk; writing is done explicitly below
annotated_image_features_df = annotate(
    profiles=image_features_df,
    platemap=platemap_df,
    join_on=["Metadata_well_id", "Image_Metadata_Well"],
)

# make sure the output directory exists before writing (e.g. on a fresh clone)
image_features_output_path = features_output_dir / "plate2_SHSY5Y_image_features.parquet"
image_features_output_path.parent.mkdir(parents=True, exist_ok=True)

# output df as parquet file
output(
    df=annotated_image_features_df,
    output_filename=image_features_output_path,
    output_type="parquet",
)
print("The image features for the SHSY5Y cells have been extracted and saved!")

The image features for the SHSY5Y cells have been extracted and saved!
