# Patch-Level Classification 
Use the CellProfiler per-patch features as opposed to the per-nucleus features.  
Confusingly, CellProfiler treats each patch as an "image", so every patch is identified by its own ImageNumber.  

Train on 6-class cancer classification.
Input is the CellProfiler Image features only.  
Run 5-fold cross-validation.
Print the mean accuracy and its standard deviation across the folds. 

In [1]:
# Environment report: print the start time and the interpreter / scikit-learn
# versions that produced the outputs recorded in this notebook.
import datetime
# Printed before the remaining imports so the timestamp marks the start of the run.
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# TensorFlow is not needed by the cells visible here; the GPU check is left disabled.
#import tensorflow as tf
#tf.config.list_physical_devices('GPU')

2022-06-21 13:45:10.563920
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
# --- Locations on disk ------------------------------------------------------
# Root of the CellProfiler output, and where trained models are kept.
BASE_DIR = '/home/jrm/Adjeroh/Naved/CP_80K/'  # Alien
#BASE_DIR='D:\\Adjeroh\\Naved\\CP_80K\\'  # Windows
MODELS_DIR = '/home/jrm/Adjeroh/Naved/CP_80K/models/Patch.02'  # Alien
#MODELS_DIR='D:\\Adjeroh\\Naved\\CP_80K\\models\\Patch.02'  # Windows

# --- Per-class inputs -------------------------------------------------------
# One output sub-directory per class; the position in this list is the class label.
CLASSES = [f'Output{label}/' for label in range(6)]
# Name of the per-patch ("Image") CSV found inside every class directory.
IMAGE_FILENAME = 'Process100_Image.csv'

# These patches were used to tune the CellProfiler pipeline
# EXCLUDE[k] holds the ImageNumbers to leave out of CLASSES[k].
EXCLUDE = [
    [311],
    [12],
    [30, 41],
    [87],
    [67],
    [26, 46],
]

In [4]:
# Column names containing any of these markers are discarded. Names that merely
# END in an object name (e.g. Count_Nucleus, AreaOccupied_TotalArea_Tissue) do
# not match and are kept.
# NOTE(review): presumably these markers identify per-object summary columns
# rather than whole-patch measurements -- confirm against the CellProfiler pipeline.
DROPPED_NAME_MARKERS = (
    '_Nucleus_', '_Tissue_',
    '_MergeRBC_', '_RBC_', '_ShrinkRBC_',
    '_ExpandCells_', '_Cells_',
)


def select_patch_features(patch_info, excluded_images):
    """Reduce one class's per-patch table to its numeric feature columns.

    Args:
        patch_info: DataFrame read from the class's Image CSV; must contain
            the columns 'ImageNumber' and 'Group_Index'.
        excluded_images: ImageNumber values whose rows are removed.

    Returns:
        A new DataFrame with the excluded rows removed and without
        'ImageNumber', 'Group_Index', any 'ExecutionTime_*' column, any column
        whose name contains one of DROPPED_NAME_MARKERS, or any non-numeric
        column.

    Raises:
        KeyError: if an excluded ImageNumber or one of the required columns
            is not present (pandas' default for `drop`).
    """
    # Index by ImageNumber so the tuning patches can be dropped by label.
    kept_rows = (
        patch_info
        .set_index('ImageNumber')
        .drop(index=excluded_images)
        .reset_index()
    )
    # remove identifying information
    features = kept_rows.drop(columns=['ImageNumber', 'Group_Index'])
    unwanted = [
        name for name in features.columns
        if name.startswith('ExecutionTime_')
        or any(marker in name for marker in DROPPED_NAME_MARKERS)
    ]
    features = features.drop(columns=unwanted)
    return features.select_dtypes(include='number')   # drop strings esp filename


print(datetime.datetime.now())
# Each class directory needs its own exclusion list; zip() would otherwise
# silently ignore the surplus entries.
assert len(CLASSES) == len(EXCLUDE), 'CLASSES and EXCLUDE must have one entry per class'

class_frames = []    # one feature frame per class, concatenated once after the loop
class_labels = []    # matching label vectors
seen_columns = {}    # ordered set of column names seen so far (for the progress line)
n_rows = 0           # running number of patches loaded
for i, (class_dir, excluded_images) in enumerate(zip(CLASSES, EXCLUDE)):
    image_file = BASE_DIR + class_dir + IMAGE_FILENAME
    patch_info = pd.read_csv(image_file)
    Xi = select_patch_features(patch_info, excluded_images)
    # Labels stay float64, exactly as np.ones(n) * i produced them.
    yi = np.full(len(Xi), i, dtype=float)   # e.g. class 3
    class_frames.append(Xi)
    class_labels.append(yi)
    seen_columns.update(dict.fromkeys(Xi.columns))
    n_rows += len(Xi)
    # Same progress line as before: class index, cumulative shape, cumulative label count.
    print(i, (n_rows, len(seen_columns)), n_rows)

# A single concatenation avoids re-copying the growing frame for every class.
# ignore_index=True gives every patch a unique row label; without it each
# class would restart at 0 and the labels would repeat.
# Missing values (including columns absent from some class) become 0.
X = pd.concat(class_frames, ignore_index=True).fillna(0)
y = np.concatenate(class_labels)
print(datetime.datetime.now())

2022-06-21 13:45:11.143012
0 (44965, 519) 44965
1 (57866, 519) 57866
2 (71814, 519) 71814
3 (78177, 519) 78177
4 (81373, 519) 81373
5 (82962, 519) 82962
2022-06-21 13:46:26.720407


In [5]:
# List every retained feature column, one name per line.
for column_name in X.columns:
    print(column_name)

AreaOccupied_AreaOccupied_ExpandCells
AreaOccupied_AreaOccupied_MergeRBC
AreaOccupied_AreaOccupied_Nucleus
AreaOccupied_AreaOccupied_Tissue
AreaOccupied_Perimeter_ExpandCells
AreaOccupied_Perimeter_MergeRBC
AreaOccupied_Perimeter_Nucleus
AreaOccupied_Perimeter_Tissue
AreaOccupied_TotalArea_ExpandCells
AreaOccupied_TotalArea_MergeRBC
AreaOccupied_TotalArea_Nucleus
AreaOccupied_TotalArea_Tissue
Channel_Tumor
Count_Cells
Count_ExpandCells
Count_MergeRBC
Count_Nucleus
Count_RBC
Count_ShrinkRBC
Count_Tissue
Frame_Tumor
Granularity_10_Eosin
Granularity_10_Hematoxylin
Granularity_11_Eosin
Granularity_11_Hematoxylin
Granularity_12_Eosin
Granularity_12_Hematoxylin
Granularity_13_Eosin
Granularity_13_Hematoxylin
Granularity_14_Eosin
Granularity_14_Hematoxylin
Granularity_15_Eosin
Granularity_15_Hematoxylin
Granularity_16_Eosin
Granularity_16_Hematoxylin
Granularity_1_Eosin
Granularity_1_Hematoxylin
Granularity_2_Eosin
Granularity_2_Hematoxylin
Granularity_3_Eosin
Granularity_3_Hematoxylin
Granul

In [6]:
print(datetime.datetime.now())
# Shuffle rows and labels together. The seed is fixed so the resulting row
# order is identical on every Restart & Run All; change SHUFFLE_SEED to draw a
# different permutation.
SHUFFLE_SEED = 42
Xtrain,ytrain=shuffle(X,y,random_state=SHUFFLE_SEED)
# Drop the references to the unshuffled data; the cells visible below only use
# Xtrain / ytrain. Note that this cell cannot be re-run without first
# re-running the loading cell.
X = None
y = None

2022-06-21 13:46:26.746687


In [7]:
print(datetime.datetime.now())
print('Cross-validation...')
# Hand the shuffled training data to the project's RF_Util helper and let it
# run its own cross-validation routine (fold count and model settings are
# defined inside RF_Util, not in this notebook).
rf1 = RF_Util()
rf1.set_train(Xtrain, ytrain)
cv_scores = rf1.cross_validation()
print(cv_scores)
# Summarise the per-fold scores as mean and standard deviation.
score_mean, score_std = cv_scores.mean(), cv_scores.std()
print('mean %.4f +/- %.4f' % (score_mean, score_std))
print(datetime.datetime.now())

2022-06-21 13:46:26.882054
Cross-validation...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 1.5min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] END .................................................... total time= 1.5min
[CV] END .................................................... total time= 1.5min
[CV] END .................................................... total time= 1.5min
[CV] END .................................................... total time= 1.5min
[0.89182185 0.89254505 0.8918756  0.893081   0.89199614]
mean 0.8923 +/- 0.0005
2022-06-21 13:53:52.430866


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.4min finished
