# Patch-Level Classification 
Use the CellProfiler per-patch features as opposed to the per-nucleus features.  
Confusingly, CellProfiler treats each patch as an image: it assigns an ImageNumber to each patch, and the per-patch features are its Image features.  

Train on 6-class cancer classification.
Input is the CellProfiler Image features only.  
Run 5-fold cross validation.
Print the mean accuracy. 

In [1]:
# Environment report: time-stamp the run, then show the interpreter and
# scikit-learn versions that produced the outputs below.
import datetime
print(datetime.datetime.now())   # stamped before the heavier imports are loaded

from platform import python_version
print('Python',python_version())

import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

# TensorFlow is not used by this notebook; the GPU check is kept disabled.
#import tensorflow as tf
#tf.config.list_physical_devices('GPU')

2022-06-21 15:07:42.161607
Python 3.9.6
sklearn 1.1.1


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
import os  # standard library; used only to detect the host operating system

# Machine-specific locations of the CellProfiler output (BASE_DIR) and of the
# saved models (MODELS_DIR).  Previously both pairs were assigned one after the
# other, so the Windows values always won and the notebook had to be edited by
# hand to run on the other machine.  The pair is now chosen from the host OS.
if os.name == 'nt':
    BASE_DIR='D:\\Adjeroh\\Naved\\CP_80K\\'  # Windows
    MODELS_DIR='D:\\Adjeroh\\Naved\\CP_80K\\models\\Patch.01'  # Windows
else:
    BASE_DIR='/home/jrm/Adjeroh/Naved/CP_80K/'  # Alien
    MODELS_DIR='/home/jrm/Adjeroh/Naved/CP_80K/models/Patch.01'  # Alien

# One CellProfiler output sub-directory per class (appended to BASE_DIR).
CLASSES=['Output0/','Output1/','Output2/','Output3/','Output4/','Output5/',]

# These patches were used to tune the CellProfiler pipeline.
# EXCLUDE[k] lists the ImageNumber values to leave out of CLASSES[k].
EXCLUDE = [ [311], [12], [30,41], [87], [67], [26,46] ]

# Name of the per-patch feature table inside every class directory.
IMAGE_FILENAME = 'Process100_Image.csv'

In [4]:
print(datetime.datetime.now())
# Build the feature matrix X and the label vector y from the per-class
# CellProfiler Image tables.
#
# The per-class frames are collected in a list and concatenated ONCE after the
# loop.  Concatenating and NaN-filling the growing frame on every iteration
# re-copies and re-scans all rows loaded so far, which is expensive for a table
# of this size.
assert len(CLASSES) == len(EXCLUDE), 'need one exclusion list per class'
feature_frames = []   # one numeric feature frame per class
label_arrays = []     # one label vector per class, aligned with feature_frames
rows_so_far = 0       # running row count, for the progress printout
columns_so_far = set()  # union of feature columns seen, for the progress printout
for i, (class_dir, excluded) in enumerate(zip(CLASSES, EXCLUDE)):
    image_file = BASE_DIR+class_dir+IMAGE_FILENAME
    patch_info = pd.read_csv(image_file)
    # Leave out the patches listed in EXCLUDE for this class.
    # drop() raises KeyError if a listed ImageNumber is not in the file.
    patch_info = patch_info.set_index(['ImageNumber'])
    patch_info = patch_info.drop(excluded)
    patch_info = patch_info.reset_index()
    # remove identifying information
    Xi = patch_info.drop(columns=['ImageNumber','Group_Index'])
    Xi = Xi.select_dtypes(include='number')   # drop strings esp filename
    # Labels are the class position stored as floats, e.g. 3.0 for class 3.
    yi = np.full(len(Xi), i, dtype=float)
    feature_frames.append(Xi)
    label_arrays.append(yi)
    rows_so_far += len(Xi)
    columns_so_far.update(Xi.columns)
    # Same cumulative "(rows, columns)" report the incremental concat produced.
    print(i,(rows_so_far,len(columns_so_far)),rows_so_far)
# Row index labels repeat across classes (each frame keeps its own 0..n-1 index).
X = pd.concat(feature_frames)
y = np.concatenate(label_arrays)
# Missing values, including columns absent from some class files, become 0.
X = X.fillna(0)

2022-06-21 15:07:46.842528
0 (44965, 5338) 44965
1 (57866, 5338) 57866
2 (71814, 5338) 71814
3 (78177, 5338) 78177
4 (81373, 5338) 81373
5 (82962, 5338) 82962


In [5]:
print(datetime.datetime.now())
# Shuffle rows and labels together.  The seed is fixed so that the row order is
# the same on every run; an unseeded shuffle gives a different order each time.
# NOTE(review): full reproducibility of the scores also depends on how RF_Util
# seeds its own estimator and folds - confirm there.
SHUFFLE_SEED = 42
Xtrain,ytrain=shuffle(X,y,random_state=SHUFFLE_SEED)
# Release the names of the unshuffled data; only Xtrain/ytrain are used below.
X = None
y = None

2022-06-21 15:09:49.580789


In [6]:
# Cross-validate on the shuffled training data, with a time-stamp before and
# after so the elapsed wall-clock time can be read from the output.
print(datetime.datetime.now())
print('Cross-validation...')

rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
cv_scores = rf1.cross_validation()

# Report the individual scores first, then their mean and standard deviation.
print(cv_scores)
cv_summary = (cv_scores.mean(), cv_scores.std())
print('mean %.4f +/- %.4f' % cv_summary)

print(datetime.datetime.now())

2022-06-21 15:09:51.426637
Cross-validation...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 8.6min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.6min remaining:    0.0s


[CV] END .................................................... total time= 9.2min
[CV] END .................................................... total time= 9.0min
[CV] END .................................................... total time= 8.4min
[CV] END .................................................... total time= 8.3min
[0.88266136 0.87982884 0.88036403 0.87584378 0.87180569]
mean 0.8781 +/- 0.0038
2022-06-21 15:53:20.488900


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 43.5min finished
