In [1]:
import argparse
import glob
import pathlib
import sqlite3
import sys

import lancedb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from cytotable import convert, presets
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from pycytominer import aggregate, annotate, feature_select, normalize
from pycytominer.cyto_utils import output

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

  from .autonotebook import tqdm as notebook_tqdm


## Set paths and variables

In [2]:
sqlite_path = pathlib.Path(
    "../../4.cellprofiler_analysis/analysis_output/endpoint_whole_image"
).resolve()
# get the files in the children directories
sqlite_files = glob.glob(f"{sqlite_path}/**/*.sqlite", recursive=True)

In [3]:
# load in platemap file as a pandas dataframe
platemap_path = pathlib.Path("../../data/platemap_AnnexinV_2ch.csv").resolve()
platemap_df = pd.read_csv(platemap_path)

# directory where the annotated parquet files are saved to
output_dir = pathlib.Path("../data/endpoint_whole_image/").resolve()
output_dir.mkdir(exist_ok=True, parents=True)

normalized_data_dir = pathlib.Path(
    output_dir, "normalized_whole_image.parquet"
).resolve()
feature_selected_data_dir = pathlib.Path(
    output_dir, "feature_selected_whole_image.parquet"
).resolve()
aggregated_data_dir = pathlib.Path(
    output_dir, "aggregated_whole_image.parquet"
).resolve()

## Convert

In [4]:
preset = """SELECT * FROM Per_Image;"""

In [5]:
blacklist_keywords = [
    "Skeleton",
    "URL",
    "ExecutionTime",
    "Frame",
    "Group",
    "Height",
    "Width",
    "MD5",
    "Scaling",
    "Series",
]

In [6]:
list_of_dfs = []
for file in sqlite_files:
    source_path = pathlib.Path(file)
    output_file_dir = output_dir / source_path.stem
    # get the path to the sqlite file
    with sqlite3.connect(source_path) as conn:
        query = "SELECT * FROM Per_Image;"
        df = pd.read_sql_query(query, conn)
    list_of_dfs.append(df)

df = pd.concat(list_of_dfs, ignore_index=True)
df = df.drop_duplicates()
# Save the DataFrame to a Parquet file

# df.to_parquet(output_parquet_path, index=False)
list_of_col_to_remove = []
for col in df.columns:
    for keyword in blacklist_keywords:
        if keyword in col:
            list_of_col_to_remove.append(col)
df.drop(columns=list_of_col_to_remove, inplace=True)

for col in df.columns:
    if col.startswith("Image_"):
        df.rename(columns={col: col.replace("Image_", "")}, inplace=True)
print(df.shape)

(118, 137)


## Annotate

In [7]:
# add metadata from platemap file to extracted single cell features
annotated_df = annotate(
    profiles=df,
    platemap=platemap_df,
    join_on=["Metadata_well", "Metadata_Well"],
)
# drop duplicate columns
annotated_df.drop_duplicates(inplace=True)
columns_to_drop = [
    "ImageNumber",
    "FileName_AnnexinV",
    "FileName_DNA",
    "PathName_AnnexinV",
    "PathName_DNA",
]
annotated_df.drop(columns=columns_to_drop, inplace=True)
print(annotated_df.shape)
annotated_df.head()

(118, 136)


Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Texture_SumVariance_DNA_3_02_256,Texture_SumVariance_DNA_3_03_256,Texture_Variance_AnnexinV_3_00_256,Texture_Variance_AnnexinV_3_01_256,Texture_Variance_AnnexinV_3_02_256,Texture_Variance_AnnexinV_3_03_256,Texture_Variance_DNA_3_00_256,Texture_Variance_DNA_3_01_256,Texture_Variance_DNA_3_02_256,Texture_Variance_DNA_3_03_256
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,0.204565,0.197763,0.254029,0.2541,0.253996,0.2541,0.058763,0.058764,0.058709,0.058764
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,0.20865,0.200193,0.218253,0.218183,0.218197,0.218182,0.060559,0.060571,0.060572,0.06057
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,0.088057,0.08256,0.284062,0.284278,0.284222,0.284278,0.025187,0.025211,0.025692,0.025212
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,0.183499,0.176284,0.245752,0.24546,0.245324,0.24546,0.053496,0.053502,0.053459,0.053502
48,1,Staurosporine,2.44,test,,4,,14,E-05,1,...,,,,,,,,,,


## Normalize

In [8]:
metadata_columns = [x for x in annotated_df.columns if "Metadata_" in x]
feature_columns = [x for x in annotated_df.columns if "Metadata_" not in x]

In [9]:
normalized_df = normalize(
    # df with annotated raw merged single cell features
    profiles=annotated_df,
    # specify samples used as normalization reference (negative control)
    samples="Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0",
    # normalization method used
    method="standardize",
    features=feature_columns,
    meta_features=metadata_columns,
)
normalized_df = normalized_df.drop_duplicates()
normalized_df = normalized_df.reset_index(drop=True)
print(normalized_df.shape)
normalized_df.to_parquet(normalized_data_dir, index=False)

(118, 136)


## Feature selection

In [10]:
# define operations to be performed on the data
# list of operations for feature select function to use on input profile
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]

feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_ops,
    # specify features to be used for feature selection
    features=feature_columns,
)


print(f"Number of features before feature selection: {normalized_df.shape[1]}")
print(f"Number of features after feature selection: {feature_select_df.shape[1]}")
feature_select_df.to_parquet(
    feature_selected_data_dir,
    index=False,
)
print(feature_select_df.shape)
feature_select_df.head()

Number of features before feature selection: 136
Number of features after feature selection: 21
(118, 21)


Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Intensity_MADIntensity_AnnexinV,Intensity_MADIntensity_DNA,Intensity_MaxIntensity_AnnexinV,Intensity_MaxIntensity_DNA,Intensity_MeanIntensity_AnnexinV,Intensity_MeanIntensity_DNA,Intensity_StdIntensity_AnnexinV,Intensity_StdIntensity_DNA,Intensity_UpperQuartileIntensity_AnnexinV,Intensity_UpperQuartileIntensity_DNA
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,-0.151765,0.0,-0.686519,0.667825,-0.791621,1.135055,-1.146585,1.439476,-1.003397,1.802776
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,-0.316177,0.0,-0.805701,1.29068,-1.919608,1.281231,-2.254584,1.492178,-1.30675,1.802776
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,-1.795886,0.0,-0.658955,1.432735,-1.656662,-1.208505,0.061217,-1.1947,-1.761779,0.0
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,-1.467062,0.0,-0.338675,1.017498,-2.110191,0.887887,-1.351941,1.05004,-1.155074,1.802776
4,1,Staurosporine,2.44,test,,4,,14,E-05,1,...,0.505883,0.0,-0.151942,0.842662,-0.341616,1.34032,-0.660948,1.488995,-0.396692,0.0


## Aggregation

In [11]:
metadata_cols = feature_select_df.columns[
    feature_select_df.columns.str.contains("Metadata")
]
feature_cols = feature_select_df.columns[
    ~feature_select_df.columns.str.contains("Metadata")
].to_list()

aggregated_df = aggregate(
    feature_select_df,
    features=feature_cols,
    strata=["Metadata_Well", "Metadata_dose"],
    operation="median",
)

print(aggregated_df.shape)
aggregated_df.to_parquet(aggregated_data_dir)
print(aggregated_df.shape)
aggregated_df.head()

(30, 13)
(30, 13)


Unnamed: 0,Metadata_Well,Metadata_dose,Intensity_LowerQuartileIntensity_AnnexinV,Intensity_MADIntensity_AnnexinV,Intensity_MADIntensity_DNA,Intensity_MaxIntensity_AnnexinV,Intensity_MaxIntensity_DNA,Intensity_MeanIntensity_AnnexinV,Intensity_MeanIntensity_DNA,Intensity_StdIntensity_AnnexinV,Intensity_StdIntensity_DNA,Intensity_UpperQuartileIntensity_AnnexinV,Intensity_UpperQuartileIntensity_DNA
0,C-02,0.0,-0.545275,-0.480589,0.0,1.246418,0.266248,-0.921335,0.667563,-0.138017,0.144151,-1.003397,0.0
1,C-03,0.61,-0.545275,-0.151765,0.0,-0.453394,1.025694,-1.044321,0.626815,-1.099832,0.270661,-1.230912,0.901388
2,C-04,1.22,-0.545275,-0.89162,0.0,-0.672737,1.154089,-1.788135,1.011471,-1.249263,1.244758,-1.230912,1.802776
3,C-05,2.44,-0.545275,-1.138238,0.0,-0.712917,0.484793,-1.784165,-0.594062,-0.768595,0.250849,-1.685941,0.0
4,C-06,4.88,-0.545275,-0.645001,0.0,-0.739316,0.757975,-0.777688,0.087873,-0.609561,0.862966,-1.079236,0.0
