# Process single cell morphology features for CellProfiler readouts - `Plate 1`

## Import Libraries

In [1]:
import pandas as pd
import pathlib

import extraction_utils as sc_util

## Set up paths to CellProfiler directory and outputs

In [2]:
# Directory constants, both given as relative paths:
#   cp_dir     - root folder of the CellProfiler pipelines
#   output_dir - folder handed to the extraction calls as their output location
cp_dir = pathlib.Path("..") / "CellProfiler_pipelines"
output_dir = pathlib.Path("data", "Plate1", "CellProfiler")

## Set up paths to sqlite files and outputs

In [3]:
# Method identifiers; each one is also the stem of that method's SQLite file name.
method1 = "all_cellprofiler"  # all CellProfiler
method2 = "pybasic_cellprofiler"  # PyBaSiC + CellProfiler
method3 = "pybasic_cellpose"  # PyBaSiC + Cellpose
method4 = "cellprofiler_cellpose"  # CellProfiler + Cellpose

# SQLite file name for each method
sql_file1 = f"{method1}.sqlite"
sql_file2 = f"{method2}.sqlite"
sql_file3 = f"{method3}.sqlite"
sql_file4 = f"{method4}.sqlite"

# Database locations as "sqlite:///" URL strings (plain str, not Path objects);
# every file sits in Analysis_Output/Plate1_Output under cp_dir.
single_cell_file1 = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file1}"
single_cell_file2 = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file2}"
single_cell_file3 = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file3}"
single_cell_file4 = f"sqlite:///{cp_dir}/Analysis_Output/Plate1_Output/{sql_file4}"

# Platemap CSV for plate 1
platemap_file = cp_dir / "Metadata/platemap_NF1_CP.csv"

## Set up names for linking columns between tables in the database file

In [4]:
# Custom linking columns between the compartment tables.
# Presumably: outer key = table, inner key = the table it is joined to,
# value = the column used for that join — confirm against extraction_utils.
linking_cols = {}
linking_cols["Per_Cytoplasm"] = {
    "Per_Cells": "Cytoplasm_Parent_Cells",
    "Per_Nuclei": "Cytoplasm_Parent_Nuclei",
}
linking_cols["Per_Cells"] = {"Per_Cytoplasm": "Cells_Number_Object_Number"}
linking_cols["Per_Nuclei"] = {"Per_Cytoplasm": "Nuclei_Number_Object_Number"}

## Load in platemap

In [5]:
# Read the plate 1 platemap CSV into a DataFrame and preview its first five rows.
# NOTE(review): the rendered preview contains an "Unnamed: 0" column, which
# suggests the CSV was written with its index — confirm whether that column is
# wanted downstream before changing how the file is read.
platemap_df = pd.read_csv(filepath_or_buffer=platemap_file)
platemap_df.head(n=5)

Unnamed: 0,WellRow,WellCol,well_position,gene_name,genotype
0,C,6,C6,NF1,WT
1,C,7,C7,NF1,Null
2,D,6,D6,NF1,WT
3,D,7,D7,NF1,Null
4,E,6,E6,NF1,WT


## Perform extraction with All CellProfiler method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [6]:
# All CellProfiler method: run the shared extraction helper on its SQLite database.
# norm_feature_select=True presumably turns on the normalize + feature-selection
# steps listed for this section — confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method1,
    single_cell_file=single_cell_file1,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    norm_feature_select=True,
    output_folder=output_dir,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


## Perform extraction with PyBaSiC CellProfiler method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [7]:
# PyBaSiC CellProfiler method: run the shared extraction helper on its SQLite database.
# norm_feature_select=True presumably turns on the normalize + feature-selection
# steps listed for this section — confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method2,
    single_cell_file=single_cell_file2,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    norm_feature_select=True,
    output_folder=output_dir,
)



## Perform extraction with PyBaSiC Cellpose method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [8]:
# PyBaSiC Cellpose method: run the shared extraction helper on its SQLite database.
# norm_feature_select=True presumably turns on the normalize + feature-selection
# steps listed for this section — confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method3,
    single_cell_file=single_cell_file3,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    norm_feature_select=True,
    output_folder=output_dir,
)



## Perform extraction with CellProfiler Cellpose method

- Merge single cells
- Normalize
- Feature selection on normalized features

In [9]:
# CellProfiler Cellpose method: run the shared extraction helper on its SQLite database.
# norm_feature_select=True presumably turns on the normalize + feature-selection
# steps listed for this section — confirm in extraction_utils.
sc_util.extract_single_cells(
    method_name=method4,
    single_cell_file=single_cell_file4,
    platemap_df=platemap_df,
    linking_cols=linking_cols,
    norm_feature_select=True,
    output_folder=output_dir,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


---
## Visualize count statistics

### All CellProfiler

In [10]:
# Load the gzip-compressed single-cell CSV for method1, then tally rows per genotype.
data_path = f"{output_dir}/nf1_sc_{method1}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

data_df["Metadata_genotype"].value_counts()

Null    173
WT       69
Name: Metadata_genotype, dtype: int64

In [11]:
# Cross-tabulate single-cell counts: one row per genotype, one column per well.
pd.crosstab(index=data_df["Metadata_genotype"], columns=data_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,21,0,29,0,56,0,67
WT,25,0,7,0,14,0,23,0


### PyBaSiC CellProfiler

In [12]:
# Load the gzip-compressed single-cell CSV for method2, then tally rows per genotype.
data_path = f"{output_dir}/nf1_sc_{method2}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

data_df["Metadata_genotype"].value_counts()

Null    172
WT       69
Name: Metadata_genotype, dtype: int64

In [13]:
# Cross-tabulate row counts: one row per genotype, one column per well.
pd.crosstab(index=data_df["Metadata_genotype"], columns=data_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,22,0,27,0,56,0,67
WT,25,0,7,0,15,0,22,0


### PyBaSiC Cellpose

In [14]:
# Load the gzip-compressed single-cell CSV for method3, then tally rows per genotype.
data_path = f"{output_dir}/nf1_sc_{method3}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

data_df["Metadata_genotype"].value_counts()

Null    182
WT       75
Name: Metadata_genotype, dtype: int64

In [15]:
# Cross-tabulate row counts: one row per genotype, one column per well.
pd.crosstab(index=data_df["Metadata_genotype"], columns=data_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,23,0,32,0,59,0,68
WT,26,0,7,0,17,0,25,0


### CellProfiler Cellpose

In [16]:
# Load the gzip-compressed single-cell CSV for method4, then tally rows per genotype.
data_path = f"{output_dir}/nf1_sc_{method4}.csv.gz"
data_df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

data_df["Metadata_genotype"].value_counts()

Null    181
WT       76
Name: Metadata_genotype, dtype: int64

In [17]:
# Cross-tabulate row counts: one row per genotype, one column per well.
pd.crosstab(index=data_df["Metadata_genotype"], columns=data_df["Metadata_Well"])

Metadata_Well,C6,C7,D6,D7,E6,E7,F6,F7
Metadata_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Null,0,23,0,32,0,59,0,67
WT,26,0,7,0,17,0,26,0
