# Patch-Level Classification 
Use the CellProfiler per-patch (Image) features rather than the per-nucleus features.  
Note: CellProfiler treats each patch as an image, so the patch identifier column is named ImageNumber.  

Train on 6-class cancer classification.
Input is the CellProfiler Image features only.  
Run 5-fold cross validation.
Print the mean accuracy. 

In [1]:
# Record when the notebook was run and which interpreter / library
# versions were in use, so the saved outputs can be traced to an environment.
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# The TensorFlow import and GPU check below are disabled.
#import tensorflow as tf
#tf.config.list_physical_devices('GPU')

2022-06-21 14:35:18.700920
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
# Root directory holding one CellProfiler output sub-directory per class.
BASE_DIR = '/home/jrm/Adjeroh/Naved/CP_80K/'  # Alien
#BASE_DIR='D:\\Adjeroh\\Naved\\CP_80K\\'  # Windows

# Sub-directory of each of the six classes: 'Output0/' ... 'Output5/'.
CLASSES = [f'Output{class_id}/' for class_id in range(6)]

# Path the trained model is written to.
MODELS_DIR = '/home/jrm/Adjeroh/Naved/CP_80K/models/Patch.03'  # Alien
#MODELS_DIR='D:\\Adjeroh\\Naved\\CP_80K\\models\\Patch.03'  # Windows

# These patches were used to tune the CellProfiler pipeline.
# One list of ImageNumber values per class, in the same order as CLASSES.
EXCLUDE = [
    [311],
    [12],
    [30, 41],
    [87],
    [67],
    [26, 46],
]

# Name of the per-patch feature table inside each class directory.
IMAGE_FILENAME = 'Process100_Image.csv'

In [4]:
print(datetime.datetime.now())
# Load the per-patch (Image) feature table of every class, filter its columns,
# and stack all classes into one feature frame X with a parallel label array y.

# CLASSES and EXCLUDE are indexed together, one entry per class.
assert len(CLASSES) == len(EXCLUDE), 'CLASSES and EXCLUDE must have one entry per class'

# Column-name substrings that mark per-object aggregate columns to discard.
# NOTE(review): names that merely END in an object name (e.g. Count_Nucleus,
# AreaOccupied_TotalArea_Tissue) do not contain these substrings and are
# therefore kept as features -- confirm that this is intended.
OBJECT_TAGS = ('_Nucleus_', '_Tissue_',
               '_MergeRBC_', '_RBC_', '_ShrinkRBC_',
               '_ExpandCells_', '_Cells_')

frames = []           # one filtered feature frame per class
labels = []           # one label array per class, aligned with frames
columns_seen = set()  # union of feature names so far (for the progress line)
rows_seen = 0         # running number of patches loaded
for i, (class_dir, excluded) in enumerate(zip(CLASSES, EXCLUDE)):
    image_file = BASE_DIR + class_dir + IMAGE_FILENAME
    patch_info = pd.read_csv(image_file)
    # Remove the patches that were used to tune the CellProfiler pipeline.
    # drop() raises KeyError if an excluded ImageNumber is absent from the file.
    patch_info = (patch_info
                  .set_index(['ImageNumber'])
                  .drop(excluded)
                  .reset_index())
    # Remove identifying information.
    Xi = patch_info.drop(columns=['ImageNumber', 'Group_Index'])
    # Remove timing columns and the per-object aggregate columns.
    bad_cols = [c for c in Xi.columns
                if c.startswith('ExecutionTime_')
                or any(tag in c for tag in OBJECT_TAGS)]
    Xi = Xi.drop(columns=bad_cols)
    Xi = Xi.select_dtypes(include='number')   # drop strings
    frames.append(Xi)
    # Labels stay floating point (0.0 .. 5.0), e.g. class 3 -> 3.0.
    labels.append(np.ones(len(Xi)) * i)
    rows_seen += len(Xi)
    columns_seen.update(Xi.columns)
    # Progress: class index, cumulative (rows, columns), cumulative label count.
    print(i, (rows_seen, len(columns_seen)), rows_seen)

# Concatenate once instead of re-copying the growing frame on every iteration.
X = pd.concat(frames)
# Missing values (including columns absent from some class) become 0.
X = X.fillna(0)
y = np.concatenate(labels)
del frames, labels    # the per-class pieces are no longer needed
print(datetime.datetime.now())

2022-06-21 14:35:19.242386
0 (44965, 518) 44965
1 (57866, 518) 57866
2 (71814, 518) 71814
3 (78177, 518) 78177
4 (81373, 518) 81373
5 (82962, 518) 82962
2022-06-21 14:36:43.496904


In [5]:
# Show the name of every feature column that survived the filtering above.
for column_name in X.columns:
    print(column_name)

AreaOccupied_AreaOccupied_ExpandCells
AreaOccupied_AreaOccupied_MergeRBC
AreaOccupied_AreaOccupied_Nucleus
AreaOccupied_AreaOccupied_Tissue
AreaOccupied_Perimeter_ExpandCells
AreaOccupied_Perimeter_MergeRBC
AreaOccupied_Perimeter_Nucleus
AreaOccupied_Perimeter_Tissue
AreaOccupied_TotalArea_ExpandCells
AreaOccupied_TotalArea_MergeRBC
AreaOccupied_TotalArea_Nucleus
AreaOccupied_TotalArea_Tissue
Channel_Tumor
Count_Cells
Count_ExpandCells
Count_MergeRBC
Count_Nucleus
Count_RBC
Count_ShrinkRBC
Count_Tissue
Frame_Tumor
Granularity_10_Eosin
Granularity_10_Hematoxylin
Granularity_11_Eosin
Granularity_11_Hematoxylin
Granularity_12_Eosin
Granularity_12_Hematoxylin
Granularity_13_Eosin
Granularity_13_Hematoxylin
Granularity_14_Eosin
Granularity_14_Hematoxylin
Granularity_15_Eosin
Granularity_15_Hematoxylin
Granularity_16_Eosin
Granularity_16_Hematoxylin
Granularity_1_Eosin
Granularity_1_Hematoxylin
Granularity_2_Eosin
Granularity_2_Hematoxylin
Granularity_3_Eosin
Granularity_3_Hematoxylin
Granul

In [6]:
print(datetime.datetime.now())
# Shuffle features and labels together; the loader stacked them class by class.
# A fixed seed makes the row order repeatable from one run to the next.
SHUFFLE_SEED = 42
Xtrain,ytrain = shuffle(X, y, random_state=SHUFFLE_SEED)
# Drop the references to the unshuffled data. After this, the loading cell
# must be re-run before this cell can be run again.
X = None
y = None

2022-06-21 14:36:43.529528


In [7]:
print(datetime.datetime.now())
print('Do a one-time fit on all the data...')
# RF_Util is the project wrapper imported from RandomForestUtil
# (presumably around a random forest classifier -- confirm in that module).
rf1 = RF_Util()
# All shuffled patches are handed over as training data; nothing is held out here.
rf1.set_train(Xtrain,ytrain)
rf1.fit()

2022-06-21 14:36:43.729621
Do a one-time fit on all the data...


In [8]:
print(datetime.datetime.now())
# Fetch the model object from the wrapper and persist it with joblib.
# Despite its name, MODELS_DIR is used here as the output file path.
model = rf1.get_model()
# The cell output is the list of file names returned by joblib.dump.
joblib.dump(model,MODELS_DIR)

2022-06-21 14:38:55.526853


['/home/jrm/Adjeroh/Naved/CP_80K/models/Patch.03']

In [9]:
print(datetime.datetime.now())
print('Rank the features by importance.')
top = rf1.important_features()
# Lift the pandas row-display limit; this is a global option and stays in
# effect for every later cell in the session.
pd.set_option('display.max_rows', None)
# NOTE(review): .loc slices by label and includes the end label, so with the
# 0-based integer index seen in the output this displays 201 rows --
# confirm whether 200 were intended.
top.loc[:200]

2022-06-21 14:38:55.706614
Rank the features by importance.


Unnamed: 0,0,1
0,0.044461,Granularity_2_Hematoxylin
1,0.014251,Texture_Correlation_Hematoxylin_7_01_256
2,0.012096,Texture_Correlation_Hematoxylin_7_03_256
3,0.010269,Threshold_OrigThreshold_Nucleus
4,0.010199,Threshold_FinalThreshold_Nucleus
5,0.008559,Count_Nucleus
6,0.007836,Count_ExpandCells
7,0.007673,Granularity_1_Hematoxylin
8,0.006587,Count_Cells
9,0.00638,Texture_Correlation_Hematoxylin_7_02_256
