# Process single cell morphology features for CellProfiler readouts - `Plate 2`

## Import Libraries

In [1]:
import pandas as pd
import pathlib

import extraction_utils as sc_util

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory holding the CellProfiler pipelines (one level above this notebook)
cp_dir = pathlib.Path("..") / "CellProfiler_pipelines"
# Directory used for the Plate 2 CellProfiler single-cell outputs
output_dir = pathlib.Path("data", "Plate2", "CellProfiler")

## Set up paths to the SQLite files and the platemap

In [3]:
# All four databases sit in the same Plate 2 output folder, so build the
# shared SQLite URL prefix once and append each file name to it.
plate2_sqlite_prefix = f"sqlite:///{cp_dir}/Analysis_Output/Plate2_Output"

# Method 1: all CellProfiler
method1 = "all_cellprofiler"
sql_file1 = f"{method1}.sqlite"
single_cell_file1 = f"{plate2_sqlite_prefix}/{sql_file1}"

# Method 2: PyBaSiC + CellProfiler
method2 = "pybasic_cellprofiler"
sql_file2 = f"{method2}.sqlite"
single_cell_file2 = f"{plate2_sqlite_prefix}/{sql_file2}"

# Method 3: PyBaSiC + Cellpose
method3 = "pybasic_cellpose"
sql_file3 = f"{method3}.sqlite"
single_cell_file3 = f"{plate2_sqlite_prefix}/{sql_file3}"

# Method 4: CellProfiler + Cellpose
method4 = "cellprofiler_cellpose"
sql_file4 = f"{method4}.sqlite"
single_cell_file4 = f"{plate2_sqlite_prefix}/{sql_file4}"

# Platemap CSV describing the Plate 2 layout
platemap_file = cp_dir / "Metadata" / "platemap_NF1_CP_Plate2.csv"

## Set up names for linking columns between tables in the database file

In [4]:
# Define custom linking columns between compartments
# Custom linking columns between the compartment tables in the database.
# Outer key: a compartment table; inner key: the table it is linked to;
# value: the column named as the link between the two.
linking_cols = {
    # Cytoplasm objects reference their parent cell and parent nucleus
    "Per_Cytoplasm": {
        "Per_Cells": "Cytoplasm_Parent_Cells",
        "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
    },
    # Cells link back to the cytoplasm through their object number
    "Per_Cells": {
        "Per_Cytoplasm": "Cells_Number_Object_Number",
    },
    # Nuclei link back to the cytoplasm through their object number
    "Per_Nuclei": {
        "Per_Cytoplasm": "Nuclei_Number_Object_Number",
    },
}

## Load in platemap

In [5]:
# Read the platemap CSV into a DataFrame
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
# Preview the first five rows
platemap_df.head(n=5)

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,A,1,A1,NF1,WT
1,A,6,A6,NF1,WT
2,A,7,A7,NF1,Null
3,A,12,A12,NF1,Null
4,B,1,B1,NF1,WT


## Perform extraction with All CellProfiler method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [6]:
# Run the extraction helper on the All CellProfiler database.
# NOTE(review): per the notebook notes, norm_feature_select=True is expected to
# add normalization and feature selection after the merge; confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method1,
    single_cell_file=single_cell_file1,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    output_folder=output_dir,
    norm_feature_select=True,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


## Perform extraction with PyBaSiC CellProfiler method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [7]:
# Run the extraction helper on the PyBaSiC CellProfiler database.
# NOTE(review): per the notebook notes, norm_feature_select=True is expected to
# add normalization and feature selection after the merge; confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method2,
    single_cell_file=single_cell_file2,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    output_folder=output_dir,
    norm_feature_select=True,
)



## Perform extraction with PyBaSiC Cellpose method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [8]:
# Run the extraction helper on the PyBaSiC Cellpose database.
# NOTE(review): per the notebook notes, norm_feature_select=True is expected to
# add normalization and feature selection after the merge; confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method3,
    single_cell_file=single_cell_file3,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    output_folder=output_dir,
    norm_feature_select=True,
)



## Perform extraction with CellProfiler Cellpose method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [9]:
# Run the extraction helper on the CellProfiler Cellpose database.
# NOTE(review): per the notebook notes, norm_feature_select=True is expected to
# add normalization and feature selection after the merge; confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method4,
    single_cell_file=single_cell_file4,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    output_folder=output_dir,
    norm_feature_select=True,
)



---
## Visualize count statistics

### All CellProfiler

In [10]:
# Load the gzipped single-cell CSV for the All CellProfiler method
data_path = f"{output_dir}/nf1_sc_{method1}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

# Tally the rows carrying each genotype label
data_df["Metadata_genotype"].value_counts()

Null    994
WT      687
Name: Metadata_genotype, dtype: int64

In [11]:
# Cross-tabulate single-cell counts: one row per genotype, one column per well
pd.crosstab(
    index=data_df["Metadata_genotype"],
    columns=data_df["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,52,0,47,0,61,0,62,...,0,74,0,53,0,73,0,52,0,58
WT,47,0,52,0,48,0,47,0,54,0,...,45,0,54,0,32,0,27,0,29,0


### PyBaSiC CellProfiler

In [12]:
# Load the gzipped single-cell CSV for the PyBaSiC CellProfiler method
data_path = f"{output_dir}/nf1_sc_{method2}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

# Tally the rows carrying each genotype label
data_df["Metadata_genotype"].value_counts()

Null    1000
WT       693
Name: Metadata_genotype, dtype: int64

In [13]:
# Cross-tabulate single-cell counts: one row per genotype, one column per well
pd.crosstab(
    index=data_df["Metadata_genotype"],
    columns=data_df["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,44,0,53,0,48,0,59,0,64,...,0,77,0,54,0,72,0,53,0,59
WT,48,0,51,0,49,0,48,0,55,0,...,44,0,55,0,33,0,26,0,29,0


### PyBaSiC Cellpose

In [14]:
# Load the gzipped single-cell CSV for the PyBaSiC Cellpose method
data_path = f"{output_dir}/nf1_sc_{method3}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

# Tally the rows carrying each genotype label
data_df["Metadata_genotype"].value_counts()

Null    1024
WT       717
Name: Metadata_genotype, dtype: int64

In [15]:
# Cross-tabulate single-cell counts: one row per genotype, one column per well
pd.crosstab(
    index=data_df["Metadata_genotype"],
    columns=data_df["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,51,0,58,0,50,0,62,0,66,...,0,78,0,54,0,68,0,52,0,62
WT,53,0,51,0,53,0,51,0,56,0,...,41,0,58,0,38,0,30,0,28,0


### CellProfiler Cellpose

In [16]:
# Load the gzipped single-cell CSV for the CellProfiler Cellpose method
data_path = f"{output_dir}/nf1_sc_{method4}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

# Tally the rows carrying each genotype label
data_df["Metadata_genotype"].value_counts()

Null    1030
WT       727
Name: Metadata_genotype, dtype: int64

In [17]:
# Cross-tabulate single-cell counts: one row per genotype, one column per well
pd.crosstab(
    index=data_df["Metadata_genotype"],
    columns=data_df["Metadata_Well"],
)

Metadata_Well,A1,A12,A6,A7,B1,B12,B6,B7,C1,C12,...,F6,F7,G1,G12,G6,G7,H1,H12,H6,H7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Null,0,51,0,59,0,50,0,65,0,66,...,0,79,0,54,0,68,0,53,0,62
WT,54,0,52,0,54,0,52,0,57,0,...,41,0,59,0,38,0,30,0,28,0
