# Patch-Level Classification 
This is a re-do of the Patch 05 notebook.  
The goal is to make sure all possibly identifying columns have been removed from the dataframe.  

In [1]:
# Record when the notebook was run and which interpreter / library versions
# were in use, so the captured outputs can be tied to an environment.
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# TensorFlow GPU check, currently disabled.
#import tensorflow as tf
#tf.config.list_physical_devices('GPU')

2022-06-30 17:01:46.311840
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
# Root directory of the CellProfiler output; the loader builds each CSV path as
# BASE_DIR + CLASSES[i] + IMAGE_FILENAME.
BASE_DIR='/home/jrm/Adjeroh/Naved/CP_80K/'  # Alien
#BASE_DIR='D:\\Adjeroh\\Naved\\CP_80K\\'  # Windows
# One sub-directory per class; the position in this list is used as the numeric class label.
CLASSES=['Output0/','Output1/','Output2/','Output3/','Output4/','Output5/',]
# NOTE(review): MODELS_DIR is not referenced by any cell shown in this notebook -- confirm it is still needed.
MODELS_DIR='/home/jrm/Adjeroh/Naved/CP_80K/models/Patch.06'  # Alien
#MODELS_DIR='D:\\Adjeroh\\Naved\\CP_80K\\models\\Patch.06'  # Windows

# These patches were used to tune the CellProfiler pipeline
# EXCLUDE[i] lists the ImageNumber values that are dropped from class i's table on load.
EXCLUDE = [ [311], [12], [30,41], [87], [67], [26,46] ]
# Name of the per-class CSV file that is read from each class directory.
IMAGE_FILENAME = 'Process100_Image.csv'

In [4]:
print(datetime.datetime.now())

# Row-filter thresholds (values unchanged from the original inline literals).
# NOTE(review): presumably pixel areas reported by CellProfiler -- confirm units.
MAX_MERGE_RBC_AREA = 3333   # keep rows whose AreaOccupied_AreaOccupied_MergeRBC is below this
MIN_TISSUE_AREA = 80000     # keep rows whose AreaOccupied_AreaOccupied_Tissue is above this


def load_patch_features(image_file, excluded_image_numbers):
    """Read one class's per-image CSV and return its filtered numeric features.

    Parameters
    ----------
    image_file : str
        Path of the CSV file to read.
    excluded_image_numbers : list of int
        ImageNumber values whose rows are removed before anything else.

    Returns
    -------
    pandas.DataFrame
        Numeric columns only, without 'ImageNumber' and 'Group_Index', restricted
        to rows that pass both area filters.

    Raises
    ------
    KeyError
        If an excluded ImageNumber is not present in the file, or if an expected
        column is missing.
    """
    patch_info = pd.read_csv(image_file)
    # Drop the excluded patches by ImageNumber, then restore a plain column layout.
    patch_info = (
        patch_info
        .set_index(['ImageNumber'])
        .drop(excluded_image_numbers)
        .reset_index()
    )
    # remove identifying information
    features = patch_info.drop(columns=['ImageNumber','Group_Index'])
    features = features.select_dtypes(include='number')   # drop strings esp filename
    features = features[features['AreaOccupied_AreaOccupied_MergeRBC']<MAX_MERGE_RBC_AREA]
    features = features[features['AreaOccupied_AreaOccupied_Tissue']>MIN_TISSUE_AREA]
    return features


# zip() would silently truncate on a length mismatch, so check it up front.
assert len(CLASSES) == len(EXCLUDE), 'need exactly one EXCLUDE list per class directory'

# Collect the per-class pieces and concatenate once at the end, instead of
# re-copying the growing frame on every iteration.
feature_parts = []
label_parts = []
rows_so_far = 0
columns_so_far = set()   # union of column names == column count after an outer concat
for i, (class_dir, excluded) in enumerate(zip(CLASSES, EXCLUDE)):
    Xi = load_patch_features(BASE_DIR+class_dir+IMAGE_FILENAME, excluded)
    feature_parts.append(Xi)
    label_parts.append(np.full(len(Xi), float(i)))   # e.g. class 3 -> 3.0 (float labels, as before)
    rows_so_far += len(Xi)
    columns_so_far.update(Xi.columns)
    # Same progress line as before: class index, cumulative shape, cumulative label count.
    print(i,(rows_so_far,len(columns_so_far)),rows_so_far)

# A single fillna on the final frame covers missing values in the source files
# as well as any NaNs introduced by columns missing from some classes.
X = pd.concat(feature_parts).fillna(0)
y = np.concatenate(label_parts)
del feature_parts, label_parts   # release the per-class copies

2022-06-30 17:01:47.161566
0 (22940, 5338) 22940
1 (28960, 5338) 28960
2 (36561, 5338) 36561
3 (39498, 5338) 39498
4 (41063, 5338) 41063
5 (42209, 5338) 42209


Drop irrelevant or potentially revealing columns (in this run, only the columns whose names start with `ExecutionTime_`).

In [5]:
# Candidate columns considered for removal (only ExecutionTime_* is dropped here):
#   Group_Number    -- we didn't make groups, so CP put everything in group 1
#   *_Tumor         -- columns like Width_Tumor (pixels) are the same for every row
#   ExecutionTime_* -- run time for each step of the pipeline; seems irrelevant
#   ModuleError_*   -- zero in every case examined
bad_cols = [name for name in X.columns if name.startswith('ExecutionTime_')]
print(len(bad_cols),'columns to be dropped')
X = X.drop(columns=bad_cols)
print(0, X.shape,len(y))

26 columns to be dropped
0 (42209, 5312) 42209


In [6]:
print(datetime.datetime.now())
# Fixed seed so the shuffled row order is the same on every run.
# NOTE(review): the cross-validation may still vary between runs if RF_Util
# does not fix its own random state -- confirm.
SHUFFLE_SEED = 42
Xtrain,ytrain=shuffle(X,y,random_state=SHUFFLE_SEED)
# Drop the unshuffled references so their memory can be reclaimed.
X = None
y = None

2022-06-30 17:04:44.404027


In [7]:
# Time-stamp the start, cross-validate on the shuffled training set, then
# report the per-fold scores and their mean / standard deviation.
print(datetime.datetime.now())
print('Cross-validation...')
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
cv_scores = rf1.cross_validation()
print(cv_scores)
score_mean = cv_scores.mean()
score_std = cv_scores.std()
print(f'mean {score_mean:.4f} +/- {score_std:.4f}')
print(datetime.datetime.now())

2022-06-30 17:04:45.020252
Cross-validation...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 5.4min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.4min remaining:    0.0s


[CV] END .................................................... total time= 4.7min
[CV] END .................................................... total time= 4.7min
[CV] END .................................................... total time= 4.4min
[CV] END .................................................... total time= 4.4min
[0.88474295 0.88225539 0.8901919  0.88486141 0.88603246]
mean 0.8856 +/- 0.0026
2022-06-30 17:28:22.029762


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 23.6min finished


## Conclusion
Accuracy dropped from 91% in Patch 05 to 88.6% here (5-fold mean 0.8856 +/- 0.0026).  
It could be random variation, but the standard deviation across folds is tight.  
I'll bet the accuracy of older runs, like Patch 01, would drop too.  
If so, this notebook suggests that the 'irrelevant' columns 
either contain relevant information or were giving away the labels.  
This needs further investigation.