WayScience · jenna-tomkinson · Nov 28, 2022 · Nov 14, 2022 · Nov 14, 2022 · Nov 14, 2022
diff --git a/3_extracting_features/metadata/NF1_annotations.csv b/3_extracting_features/metadata/NF1_annotations.csv
@@ -1,9 +1,9 @@
-Plate,Well,Gene Identifier,Gene Symbol,Genotype,Channels
-1,C6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,C7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,D6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,D7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,E6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,E7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,F6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,F7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+Plate,Well,Gene Identifier,Gene Symbol,Genotype,Channels
+1,C6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,C7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,D6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,D7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,E6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,E7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,F6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,F7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
diff --git a/4_processing_features/README.md b/4_processing_features/README.md
@@ -0,0 +1,27 @@
+# 4. Processing Extracted Single Cell Features 
+
+In this module, we present our pipeline for processing the `.sqlite` file of single-cell features output by CellProfiler.
+The processed features are saved as compressed `.csv.gz` files for use during statistical analysis.
+
+## Pycytominer
+
+We use [Pycytominer](https://github.com/cytomining/pycytominer) to perform the aggregation, merging, and normalization of the NF1 single cell features.
+
+For more information regarding the functions that we used, please see [the documentation](https://pycytominer.readthedocs.io/en/latest/pycytominer.cyto_utils.html#pycytominer.cyto_utils.cells.SingleCells.merge_single_cells) from the Pycytominer team.
+
+### Normalization
+
+Normalization of the data is important because the feature distributions will vary in shape. To make statistical analysis easier, we normalize the features using standardization.
+
+---
+
+## Step 1: Setup Processing Feature Environment
+
+### Step 1a: Create Environment
+
+Make sure you are in the `4_processing_features` directory before running the command below.
+
+```sh
+# Run this command in the terminal to create the conda environment for processing features
+conda env create -f 4.processing_features.yml
+```
diff --git a/4_processing_features/data/nf1_sc_cellprofiler.csv.gz b/4_processing_features/data/nf1_sc_cellprofiler.csv.gz
diff --git a/4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz b/4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz
diff --git a/4_processing_features/extract_single_cell_features.ipynb b/4_processing_features/extract_single_cell_features.ipynb
@@ -5,7 +5,15 @@
    "id": "70a2251c-2a69-43f1-92bf-793095abc2cd",
    "metadata": {},
    "source": [
-    "## Process single cell morphology features for CellProfiler readouts"
+    "# Process single cell morphology features for CellProfiler readouts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7a50486",
+   "metadata": {},
+   "source": [
+    "## Import Libraries"
    ]
   },
   {
@@ -22,6 +30,14 @@
     "from pycytominer.cyto_utils import cells, output"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5f3929d2",
+   "metadata": {},
+   "source": [
+    "## Set up paths to CellProfiler directory and output"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -33,14 +49,24 @@
     "cp_dir = \"../CellProfiler_pipelines\"\n",
     "output_dir = \"data\"\n",
     "\n",
+    "# Set name and path of .sqlite file and path to metadata\n",
     "sql_file = \"NF1_data.sqlite\"\n",
     "single_cell_file = f\"sqlite:///{cp_dir}/Analysis_Output/{sql_file}\"\n",
     "platemap_file = f\"{cp_dir}/Metadata/platemap_NF1_CP.csv\"\n",
     "\n",
+    "# Set path with name for outputted data\n",
     "sc_output_file = pathlib.Path(f\"{output_dir}/nf1_sc_cellprofiler.csv.gz\")\n",
     "sc_norm_output_file = pathlib.Path(f\"{output_dir}/nf1_sc_norm_cellprofiler.csv.gz\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "728f5105",
+   "metadata": {},
+   "source": [
+    "## Set up names for linking columns between tables in the database file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -59,6 +85,14 @@
     "}"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3884560c",
+   "metadata": {},
+   "source": [
+    "## Load and view platemap file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -108,7 +142,7 @@
        "      <td>7</td>\n",
        "      <td>C7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -124,7 +158,7 @@
        "      <td>7</td>\n",
        "      <td>D7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -140,7 +174,7 @@
        "      <td>7</td>\n",
        "      <td>E7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
@@ -156,7 +190,7 @@
        "      <td>7</td>\n",
        "      <td>F7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -165,13 +199,13 @@
       "text/plain": [
        "  WellRow  WellCol well_position gene_name genotype\n",
        "0       C        6            C6       NF1       WT\n",
-       "1       C        7            C7       NF1      Het\n",
+       "1       C        7            C7       NF1     Null\n",
        "2       D        6            D6       NF1       WT\n",
-       "3       D        7            D7       NF1      Het\n",
+       "3       D        7            D7       NF1     Null\n",
        "4       E        6            E6       NF1       WT\n",
-       "5       E        7            E7       NF1      Het\n",
+       "5       E        7            E7       NF1     Null\n",
        "6       F        6            F6       NF1       WT\n",
-       "7       F        7            F7       NF1      Het"
+       "7       F        7            F7       NF1     Null"
       ]
      },
      "execution_count": 4,
@@ -185,6 +219,14 @@
     "platemap_df"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "72daff37",
+   "metadata": {},
+   "source": [
+    "## Set up `SingleCells` class from Pycytominer"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -195,7 +237,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/waygr/miniforge3/envs/4.process-nf1-features/lib/python3.8/site-packages/pycytominer/cyto_utils/util.py:61: UserWarning: Non-canonical compartment detected: per_cells, per_cytoplasm, per_nuclei\n",
+      "/home/jenna/anaconda3/envs/4.process-nf1-features/lib/python3.8/site-packages/pycytominer/cyto_utils/util.py:61: UserWarning: Non-canonical compartment detected: per_cells, per_cytoplasm, per_nuclei\n",
       "  warnings.warn(warn_str)\n"
      ]
     }
@@ -214,6 +256,14 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f6a9d05d",
+   "metadata": {},
+   "source": [
+    "## Merge single cells "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
@@ -492,6 +542,14 @@
     "sc_df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6debbc47",
+   "metadata": {},
+   "source": [
+    "## Normalize Data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
@@ -772,6 +830,8 @@
    "id": "480448ba-a0fc-4c4f-94e2-311543dce6df",
    "metadata": {},
    "source": [
+    "---\n",
+    "\n",
     "### Visualize basic count statistics"
    ]
   },
@@ -784,8 +844,8 @@
     {
      "data": {
       "text/plain": [
-       "Het    116\n",
-       "WT      33\n",
+       "Null    116\n",
+       "WT       33\n",
        "Name: Metadata_genotype, dtype: int64"
       ]
      },
@@ -848,7 +908,7 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>Het</th>\n",
+       "      <th>Null</th>\n",
        "      <td>0</td>\n",
        "      <td>12</td>\n",
        "      <td>0</td>\n",
@@ -876,7 +936,7 @@
       "text/plain": [
        "Metadata_Well      C6  C7  D6  D7  E6  E7  F6  F7\n",
        "Metadata_genotype                                \n",
-       "Het                 0  12   0  14   0  44   0  46\n",
+       "Null                0  12   0  14   0  44   0  46\n",
        "WT                 12   0   5   0   9   0   7   0"
       ]
      },
@@ -892,9 +952,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:4.process-nf1-features] *",
+   "display_name": "Python 3.8.13 ('4.process-nf1-features')",
    "language": "python",
-   "name": "conda-env-4.process-nf1-features-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -907,6 +967,11 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.13"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6e6aca846613de2bb537f4a3be07c319e65610ef8366b19567099a39e74b14d7"
+   }
   }
  },
  "nbformat": 4,

diff --git a/...converted/extract_single_cell_features.py → ..._features/extract_single_cell_features.py b/...converted/extract_single_cell_features.py → ..._features/extract_single_cell_features.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-# ## Process single cell morphology features for CellProfiler readouts
+# # Process single cell morphology features for CellProfiler readouts
+
+# ## Import Libraries
 
 # In[1]:
 
@@ -13,21 +15,27 @@
 from pycytominer.cyto_utils import cells, output
 
 
+# ## Set up paths to CellProfiler directory and output
+
 # In[2]:
 
 
 # Set file and directory constants
 cp_dir = "../CellProfiler_pipelines"
 output_dir = "data"
 
+# Set name and path of .sqlite file and path to metadata
 sql_file = "NF1_data.sqlite"
 single_cell_file = f"sqlite:///{cp_dir}/Analysis_Output/{sql_file}"
 platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP.csv"
 
+# Set path with name for outputted data
 sc_output_file = pathlib.Path(f"{output_dir}/nf1_sc_cellprofiler.csv.gz")
 sc_norm_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_cellprofiler.csv.gz")
 
 
+# ## Set up names for linking columns between tables in the database file
+
 # In[3]:
 
 
@@ -42,6 +50,8 @@
 }
 
 
+# ## Load and view platemap file
+
 # In[4]:
 
 
@@ -50,6 +60,8 @@
 platemap_df
 
 
+# ## Set up `SingleCells` class from Pycytominer
+
 # In[5]:
 
 
@@ -66,6 +78,8 @@
 )
 
 
+# ## Merge single cells 
+
 # In[6]:
 
 
@@ -84,6 +98,8 @@
 sc_df.head()
 
 
+# ## Normalize Data
+
 # In[7]:
 
 
@@ -99,6 +115,8 @@
 normalize_sc_df.head()
 
 
+# ---
+# 
 # ### Visualize basic count statistics
 
 # In[8]:

diff --git a/CellProfiler_pipelines/Metadata/platemap_NF1_CP.csv b/CellProfiler_pipelines/Metadata/platemap_NF1_CP.csv
@@ -1,9 +1,9 @@
-WellRow,WellCol,well_position,gene_name,genotype
-C,6,C6,NF1,WT
-C,7,C7,NF1,Het
-D,6,D6,NF1,WT
-D,7,D7,NF1,Het
-E,6,E6,NF1,WT
-E,7,E7,NF1,Het
-F,6,F6,NF1,WT
-F,7,F7,NF1,Het
+WellRow,WellCol,well_position,gene_name,genotype
+C,6,C6,NF1,WT
+C,7,C7,NF1,Null
+D,6,D6,NF1,WT
+D,7,D7,NF1,Null
+E,6,E6,NF1,WT
+E,7,E7,NF1,Null
+F,6,F6,NF1,WT
+F,7,F7,NF1,Null