# Classify Cell Health Nuclei Features

### Import libraries


In [13]:
import pathlib
import urllib.request
import joblib
import importlib

import pandas as pd
import numpy as np

classification_utils = importlib.import_module("classification-utils")

### Define hard drive path and classifications output path


In [14]:
normalized_plates_path = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-nuc-merged-normalized"
)

classifications_save_path = pathlib.Path("plate_classifications/")
classifications_save_path.mkdir(exist_ok=True, parents=True)

### Derive and save phenotypic class probabilities


In [15]:
multi_class_models_dir = pathlib.Path(
    "phenotypic_profiling_model/2.train_model/models/multi_class_models"
)
single_class_models_dir = pathlib.Path(
    "phenotypic_profiling_model/2.train_model/models/single_class_models"
)

# iterate through plates so each plate data only needs to be loaded once
for normalized_plate_path in normalized_plates_path.iterdir():
    
    if "14615" in normalized_plate_path.name:
        continue
    
    # get plate name from normalized data path
    plate = normalized_plate_path.name.split("-")[0]
    print(f"Getting phenotypic_class_probabilities for plate {plate}...")
    
    # determine what type columns are
    all_cols = pd.read_csv(normalized_plate_path, nrows=1).columns.to_list()
    feature_cols = [
            col for col in all_cols if "P__" in col
        ]
    metadata_cols = [
            col for col in all_cols if "P__" not in col
        ]
    
    print("Loading plate feature data...")
    # load features
    col_types = {col: np.float32 for col in feature_cols}
    plate_features = pd.read_csv(normalized_plate_path, low_memory=True, usecols=feature_cols)
    # load metadata
    print("Loading plate metadata...")
    col_types = {col: str for col in metadata_cols}
    plate_metadata = pd.read_csv(normalized_plate_path, low_memory=True, usecols=metadata_cols)
    
    print("Getting multi-class model classifications...")
    for model_path in sorted(multi_class_models_dir.iterdir()):
        print(model_path)

        # load current model
        model = joblib.load(model_path)
        
        # get information about the current model
        model_type = model_path.name.split("__")[0]
        feature_type = model_path.name.split("__")[1].replace(".joblib", "")
        
        # get phenotypic class probabilities for the given plate features
        plate_probas = classification_utils.get_probas_dataframe(plate_features, model, feature_type)
        
        # save plate probas with metadata
        model_plate_probas_save_path = pathlib.Path(f"{classifications_save_path}/multi_class_models/{model_type}__{feature_type}/{plate}__cell_classifications.csv.gz")
        model_plate_probas_save_path.parent.mkdir(exist_ok=True, parents=True)
        pd.concat([plate_metadata, plate_probas], axis=1).to_csv(model_plate_probas_save_path, compression="gzip")

        
    
    print("Getting single-class model classifications...")
    for phenotypic_class_models_path in sorted(single_class_models_dir.iterdir()):
        for model_path in phenotypic_class_models_path.iterdir():
            print(model_path)

            # load current model
            model = joblib.load(model_path)
            
            # get information about the current model
            phenotypic_class = phenotypic_class_models_path.name.split("_")[0]
            model_type = model_path.name.split("__")[0]
            feature_type = model_path.name.split("__")[1].replace(".joblib", "")
            
            # get phenotypic class probabilities for the given plate features
            plate_probas = classification_utils.get_probas_dataframe(plate_features, model, feature_type)
            
            # save plate probas with metadata
            model_plate_probas_save_path = pathlib.Path(f"{classifications_save_path}/single_class_models/{phenotypic_class}_models/{model_type}__{feature_type}/{plate}__cell_classifications.csv.gz")
            model_plate_probas_save_path.parent.mkdir(exist_ok=True, parents=True)
            pd.concat([plate_metadata, plate_probas], axis=1).to_csv(model_plate_probas_save_path, compression="gzip", index=False)


    break


Getting phenotypic_class_probabilities for plate SQ00014617...
Loading plate feature data...
Loading plate metadata...
Getting multi-class model classifications...
phenotypic_profiling_model/2.train_model/models/multi_class_models/final__CP.joblib
phenotypic_profiling_model/2.train_model/models/multi_class_models/final__CP_and_DP.joblib
phenotypic_profiling_model/2.train_model/models/multi_class_models/final__DP.joblib
phenotypic_profiling_model/2.train_model/models/multi_class_models/shuffled_baseline__CP.joblib
phenotypic_profiling_model/2.train_model/models/multi_class_models/shuffled_baseline__CP_and_DP.joblib
phenotypic_profiling_model/2.train_model/models/multi_class_models/shuffled_baseline__DP.joblib
Getting single-class model classifications...
phenotypic_profiling_model/2.train_model/models/single_class_models/ADCCM_models/shuffled_baseline__CP.joblib
phenotypic_profiling_model/2.train_model/models/single_class_models/ADCCM_models/final__CP.joblib
phenotypic_profiling_model/2

In [16]:
pd.concat([plate_metadata, plate_probas], axis=1)

Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Cell_UUID,Metadata_Site,Metadata_Well,Metadata_Plate,Metadata_Plate_Map_Name,Metadata_Reagent,SmallIrregular,SmallIrregular Negative
0,646.773973,4.075342,7e4c9500-6a62-44c0-94a8-627101437637,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.461236,0.538764
1,141.984490,42.527520,983e4aed-5bde-44e6-99f4-88a7cfbbbc44,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.497140,0.502860
2,1661.244888,59.342876,3e14db6b-6bf8-4c75-a339-d5d28cbfdee2,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.505205,0.494795
3,1067.377054,73.602479,d525a18f-8a14-4925-b383-2adefabb0f93,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.470404,0.529596
4,1305.941113,72.588061,c9c4e64b-08cf-4b44-98a3-f174cc00c3e7,4,G18,SQ00014617,SQ00014617_G18_04,ARID1B-2,0.501715,0.498285
...,...,...,...,...,...,...,...,...,...,...
1395,343.307203,2017.323319,24df339f-2923-40b0-85f7-68d5159b7bbf,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.517165,0.482835
1396,690.239186,2030.182619,5ce5e5a5-a4cd-4dd8-aa4c-0d9f01e54ba0,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.524856,0.475144
1397,1191.724266,2072.115130,a4484121-0065-4c19-92b4-01acf9152b6d,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.513976,0.486024
1398,615.209564,2103.762131,8aacb2a3-24e3-455b-af9a-9ddc48adc219,5,G18,SQ00014617,SQ00014617_G18_05,ARID1B-2,0.518537,0.481463
