# Process single cell morphology features for DeepProfiler readouts - Plate 1

## Import libraries

In [1]:
import pathlib

from pycytominer import feature_select
from pycytominer.cyto_utils import DeepProfiler_processing, output, infer_cp_features

## Set paths to DP project

In [2]:
# Root folders of the two DeepProfiler projects (nuclei and cytoplasm crops).
dp_output_nuc = pathlib.Path("../3_extracting_features/NF1_nuc_project-DP")
dp_output_cyto = pathlib.Path("../3_extracting_features/NF1_cyto_project-DP")

# Index CSV of each project, located under inputs/metadata.
index_file_nuc = dp_output_nuc.joinpath("inputs", "metadata", "index.csv")
index_file_cyto = dp_output_cyto.joinpath("inputs", "metadata", "index.csv")

# Feature folders written under outputs/efn_pretrained for each project.
profile_dir_nuc = dp_output_nuc.joinpath("outputs", "efn_pretrained", "features")
profile_dir_cyto = dp_output_cyto.joinpath("outputs", "efn_pretrained", "features")

## Set paths to outputs

In [3]:
# Directory that receives every CSV written by this notebook.
output_dir = pathlib.Path("data")
# Create it up front so the later output(...) calls do not fail on a fresh
# checkout where the folder does not exist yet (no-op if it already exists).
output_dir.mkdir(parents=True, exist_ok=True)

# Raw (un-normalized) single cell profiles
output_file_raw_nuc = output_dir / "nf1_sc_deepprofiler_nuc.csv.gz"
output_file_raw_cyto = output_dir / "nf1_sc_deepprofiler_cyto.csv.gz"

# Normalized single cell profiles
output_file_norm_nuc = output_dir / "nf1_sc_norm_deepprofiler_nuc.csv.gz"
output_file_norm_cyto = output_dir / "nf1_sc_norm_deepprofiler_cyto.csv.gz"

# Normalized + feature selected single cell profiles
output_file_norm_fs_nuc = output_dir / "nf1_sc_norm_fs_deepprofiler_nuc.csv.gz"
output_file_norm_fs_cyto = output_dir / "nf1_sc_norm_fs_deepprofiler_cyto.csv.gz"

## Perform normalization and feature selection on DP Nuclei project

### Create DeepProfilerData object

In [4]:
# Wrap the nuclei project's index file and feature directory in a
# DeepProfilerData object; feature files are expected to end in ".npz".
deep_data_nuc = DeepProfiler_processing.DeepProfilerData(
    index_file_nuc,
    profile_dir_nuc,
    filename_delimiter="/",
    file_extension=".npz",
)

### Initialize SingleCellDeepProfiler class

In [5]:
# Single-cell processor built on top of the nuclei DeepProfilerData object.
deep_single_cell_nuc = DeepProfiler_processing.SingleCellDeepProfiler(
    deep_data_nuc
)

### Compile raw single cell data

In [6]:
# Ask for the compiled single-cell table as a return value (output=True),
# passing 1 and 2 as the x/y location column indices.
nuc_sc = deep_single_cell_nuc.get_single_cells(
    output=True,
    location_x_col_index=1,
    location_y_col_index=2,
)

# Persist the raw nuclei profiles before any normalization.
output(nuc_sc, output_file_raw_nuc)

print(nuc_sc.shape)
nuc_sc.head()

(257, 3852)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,652.868421,760.552632,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.189552,-0.034666,0.636435,-0.144796,-0.169194,-0.166081,-0.033387,-0.09738,-0.10492,0.216034
1,1015.898477,209.162437,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.130013,-0.169359,0.309454,-0.167421,-0.211234,0.452966,-0.092908,-0.161609,-0.135684,0.470573
2,387.20283,238.853774,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.154498,-0.1078,1.008715,-0.173751,-0.113082,-0.18656,-0.044206,-0.105374,-0.082019,0.58532
3,259.502304,250.400922,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.048324,-0.167672,1.143355,-0.179259,-0.180391,-0.162171,-0.055307,-0.079885,-0.127507,0.332103
4,351.255708,592.429224,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.165685,-0.157138,0.196759,-0.202028,-0.010776,-0.224092,-0.041582,-0.147533,-0.12486,0.300342


### Normalize raw single cell data

In [7]:
# Normalize the nuclei single cells; the result is written to
# output_file_norm_nuc and also kept in memory for feature selection.
normalized_nuc = deep_single_cell_nuc.normalize_deep_single_cells(
    output_file=output_file_norm_nuc,
    location_x_col_index=1,
    location_y_col_index=2,
)

print(normalized_nuc.shape)
normalized_nuc.head()

getting single cells
(257, 3852)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,652.868421,760.552632,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.514201,0.478099,0.484105,-0.212163,-0.572582,-0.23907,0.907958,0.219261,0.311132,-0.288884
1,1015.898477,209.162437,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.250138,-0.740111,-0.190505,-0.690312,-0.862426,4.039268,-1.532861,-1.382833,-0.336643,0.894669
2,387.20283,238.853774,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.35873,-0.183352,1.252175,-0.824103,-0.18572,-0.380602,0.464281,0.019876,0.793334,1.428218
3,259.502304,250.400922,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.540821,-0.724848,1.529958,-0.9405,-0.649775,-0.212045,0.009084,0.655653,-0.164469,0.250813
4,351.255708,592.429224,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.408348,-0.629579,-0.423012,-1.421697,0.519623,-0.639995,0.571886,-1.03172,-0.108736,0.103133


### Separate metadata and features prior to feature selection

In [8]:
# extract metadata prior to feature selection
metadata_cols = infer_cp_features(normalized_nuc, metadata=True)

# The cell location columns carry no "Metadata_" prefix, so they are not part of
# metadata_cols. They are coordinates, not morphology features, and must be kept
# out of the feature list handed to feature selection.
non_feature_cols = set(metadata_cols) | {"Location_Center_X", "Location_Center_Y"}

derived_features = [
    col for col in normalized_nuc.columns.tolist() if col not in non_feature_cols
]

### Feature selection from normalized data

In [9]:
# Feature selection operations handed to pycytominer, in this order.
feature_select_ops = ["variance_threshold", "correlation_threshold"]

# Run feature selection on the normalized nuclei profiles, restricted to the
# columns listed in derived_features.
feature_select_norm_nuc = feature_select(
    normalized_nuc,
    features=derived_features,
    operation=feature_select_ops,
)

# Save the normalized + feature selected nuclei profiles.
output(feature_select_norm_nuc, output_file_norm_fs_nuc)

print(feature_select_norm_nuc.shape)
feature_select_norm_nuc.head()

(257, 3837)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,652.868421,760.552632,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.514201,0.478099,0.484105,-0.212163,-0.572582,-0.23907,0.907958,0.219261,0.311132,-0.288884
1,1015.898477,209.162437,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.250138,-0.740111,-0.190505,-0.690312,-0.862426,4.039268,-1.532861,-1.382833,-0.336643,0.894669
2,387.20283,238.853774,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.35873,-0.183352,1.252175,-0.824103,-0.18572,-0.380602,0.464281,0.019876,0.793334,1.428218
3,259.502304,250.400922,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.540821,-0.724848,1.529958,-0.9405,-0.649775,-0.212045,0.009084,0.655653,-0.164469,0.250813
4,351.255708,592.429224,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.408348,-0.629579,-0.423012,-1.421697,0.519623,-0.639995,0.571886,-1.03172,-0.108736,0.103133


## Perform normalization and feature selection on DP Cytoplasm project

### Create DeepProfilerData object

In [10]:
# Wrap the cytoplasm project's index file and feature directory in a
# DeepProfilerData object; feature files are expected to end in ".npz".
deep_data_cyto = DeepProfiler_processing.DeepProfilerData(
    index_file_cyto,
    profile_dir_cyto,
    filename_delimiter="/",
    file_extension=".npz",
)

### Initialize SingleCellDeepProfiler class

In [11]:
# Single-cell processor built on top of the cytoplasm DeepProfilerData object.
deep_single_cell_cyto = DeepProfiler_processing.SingleCellDeepProfiler(
    deep_data_cyto
)

### Compile raw single cell data

In [12]:
# Ask for the compiled single-cell table as a return value (output=True),
# passing 1 and 2 as the x/y location column indices.
cyto_sc = deep_single_cell_cyto.get_single_cells(
    output=True,
    location_x_col_index=1,
    location_y_col_index=2,
)

# Persist the raw cytoplasm profiles before any normalization.
output(cyto_sc, output_file_raw_cyto)

print(cyto_sc.shape)
cyto_sc.head()

(256, 3852)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,650.422472,736.706742,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.153386,-0.034785,-0.113517,-0.173233,0.013302,-0.17557,-0.112678,-0.030428,-0.098558,0.458548
1,949.182667,256.734667,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.158114,-0.099159,-0.174047,0.415832,-0.184661,-0.034013,-0.076901,-0.028081,-0.116231,0.030506
2,454.460081,265.797023,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.107825,-0.194261,-0.18526,-0.035056,0.167735,0.560817,-0.104378,-0.064447,-0.13314,0.303427
3,211.165254,274.745763,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.138649,-0.175582,-0.128276,-0.110945,-0.067334,0.173986,-0.101229,-0.025802,-0.090082,0.448318
4,375.12513,566.646507,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.12226,-0.110667,-0.155417,-0.143916,-0.120184,0.051254,-0.054232,-0.037242,-0.086592,0.667034


### Normalize raw single cell data

In [13]:
# Normalize the cytoplasm single cells; the result is written to
# output_file_norm_cyto and also kept in memory for feature selection.
normalized_cyto = deep_single_cell_cyto.normalize_deep_single_cells(
    output_file=output_file_norm_cyto,
    location_x_col_index=1,
    location_y_col_index=2,
)

print(normalized_cyto.shape)
normalized_cyto.head()

getting single cells
(256, 3852)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,650.422472,736.706742,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.034014,0.859518,-0.025713,-0.729571,0.267231,-0.847366,-1.090082,0.566786,-0.024363,0.230463
1,949.182667,256.734667,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.021109,0.185511,-0.612227,2.925806,-0.927761,-0.30435,-0.031065,0.626545,-0.40806,-1.211616
2,454.460081,265.797023,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.158382,-0.810211,-0.720877,0.127869,1.199453,1.977439,-0.844405,-0.299137,-0.775166,-0.292143
3,211.165254,274.745763,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.831178,-0.614643,-0.168719,-0.343051,-0.219525,0.493543,-0.751189,0.684555,0.159646,0.196
4,375.12513,566.646507,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.118977,0.065025,-0.43171,-0.547652,-0.538548,0.022739,0.639954,0.393346,0.235406,0.932857


### Separate metadata and features prior to feature selection

In [14]:
# extract metadata prior to feature selection
metadata_cols = infer_cp_features(normalized_cyto, metadata=True)

# The cell location columns carry no "Metadata_" prefix, so they are not part of
# metadata_cols. They are coordinates, not morphology features, and must be kept
# out of the feature list handed to feature selection.
non_feature_cols = set(metadata_cols) | {"Location_Center_X", "Location_Center_Y"}

derived_features = [
    col for col in normalized_cyto.columns.tolist() if col not in non_feature_cols
]

### Feature selection from normalized data

In [15]:
# Feature selection operations handed to pycytominer, in this order.
feature_select_ops = ["variance_threshold", "correlation_threshold"]

# Run feature selection on the normalized cytoplasm profiles, restricted to the
# columns listed in derived_features.
feature_select_norm_cyto = feature_select(
    normalized_cyto,
    features=derived_features,
    operation=feature_select_ops,
)

# Save the normalized + feature selected cytoplasm profiles.
output(feature_select_norm_cyto, output_file_norm_fs_cyto)

print(feature_select_norm_cyto.shape)
feature_select_norm_cyto.head()

(256, 3844)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,650.422472,736.706742,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.034014,0.859518,-0.025713,-0.729571,0.267231,-0.847366,-1.090082,0.566786,-0.024363,0.230463
1,949.182667,256.734667,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.021109,0.185511,-0.612227,2.925806,-0.927761,-0.30435,-0.031065,0.626545,-0.40806,-1.211616
2,454.460081,265.797023,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.158382,-0.810211,-0.720877,0.127869,1.199453,1.977439,-0.844405,-0.299137,-0.775166,-0.292143
3,211.165254,274.745763,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.831178,-0.614643,-0.168719,-0.343051,-0.219525,0.493543,-0.751189,0.684555,0.159646,0.196
4,375.12513,566.646507,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.118977,0.065025,-0.43171,-0.547652,-0.538548,0.022739,0.639954,0.393346,0.235406,0.932857
