# Random Forest
Like notebook 04, do 4-fold cross-validation instead of just one round.
But now aggregate patch-level predictions up to the WSI (whole-slide image) level.
For now, each patch casts an equal vote and the majority decides.
Later, weight each vote by its confidence, i.e. the classifier's score.

In [1]:
# Environment report: print a timestamp plus the interpreter and sklearn versions
# so the captured output records when and with what this notebook was run.
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
# NOTE(review): this binds `plt` to the top-level matplotlib package, not to
# matplotlib.pyplot, so calls such as plt.plot(...) would fail -- confirm intent.
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-07-15 11:43:33.926484
Python 3.8.10
sklearn 1.0.2


In [2]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

# This generates one split after shuffling. By default, not stratified.
# from sklearn.model_selection import train_test_split
# This does stratified K-fold cross-validation with no shuffling.
# from sklearn.model_selection import cross_val_score

#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.metrics import confusion_matrix
#from sklearn.feature_selection import RFE
#import joblib # used to dump/load sklearn models
#from CellProfiler_Util import CP_Util
#from RandomForestUtil import RF_Util

## Prepare train and valid sets

In [3]:
def list_shuffle(X):   # could use sklearn shuffle instead
    """Shuffle the mutable sequence X in place, uniformly at random, and return it.

    The previous implementation swapped every position with an index drawn
    from the whole range, which does not make all permutations equally
    likely. random.shuffle performs a Fisher-Yates shuffle, which does.

    Parameters:
        X: a mutable sequence (e.g. a list); it is reordered in place.

    Returns:
        The same object X, now shuffled.
    """
    random.shuffle(X)
    return X

In [4]:
# Prepare to divvy up patches stratified by patient.
# Patient IDs are grouped by class: DF_Ypos patients get label 1, DF_Yneg get label 0.
DF_Ypos = ['B7_','B15','D1_','D5_','E7_','E9_','F9_','G3_','H13','I1_','I5_','I13']
DF_Yneg = ['A3_','A5_','B13','C1_','C11','D3_','E5_','F3_','F7_','F11','F15','G15','H1_','H3_','H7_','H15']
ALL_PATIENTS = [*DF_Ypos, *DF_Yneg]
# Patient ID -> int8 class label, positives first, in the same order as ALL_PATIENTS.
PATIENT_TO_LABEL = {
    **{pid: np.int8(1) for pid in DF_Ypos},
    **{pid: np.int8(0) for pid in DF_Yneg},
}
NUM_FOLDS = 4  # because both sets have size = multiple of 4
print('All patients',ALL_PATIENTS)
print('Count Ypos:Yneg',len(DF_Ypos),len(DF_Yneg))
print('Hold out these patients for validation in each cross-validation fold.')
def make_folds():
    """Randomly split the positive and negative patients into NUM_FOLDS folds.

    Each class is shuffled separately (on a copy, so DF_Ypos and DF_Yneg keep
    their original order) and cut into NUM_FOLDS contiguous chunks. When a
    class size is not a multiple of NUM_FOLDS, the leading folds each get one
    extra patient. The chosen folds are printed, one line per fold.

    Returns:
        (folds_pos, folds_neg): two lists of length NUM_FOLDS; element i is
        the list of positive / negative patient IDs assigned to fold i.

    Raises:
        ValueError: if NUM_FOLDS is less than 1.
    """
    if NUM_FOLDS < 1:
        raise ValueError('NUM_FOLDS must be at least 1')

    def split(patients):
        # Shuffle a copy so the module-level patient lists are not reordered.
        shuffled = list_shuffle(list(patients))  # fix random state?
        base,extra = divmod(len(shuffled),NUM_FOLDS)
        chunks = []
        start = 0
        for i in range(NUM_FOLDS):
            stop = start + base + (1 if i < extra else 0)
            chunks.append(shuffled[start:stop])
            start = stop
        return chunks

    folds_pos = split(DF_Ypos)
    folds_neg = split(DF_Yneg)
    for i in range(NUM_FOLDS):
        print('Fold %d pos, neg:'%i,end=' ')
        print(folds_pos[i],folds_neg[i])
    return folds_pos,folds_neg
# Draw the folds once; index i of each list holds the patients held out for validation in fold i.
Ypos_valid_patients,Yneg_valid_patients = make_folds()

All patients ['B7_', 'B15', 'D1_', 'D5_', 'E7_', 'E9_', 'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13', 'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'E5_', 'F3_', 'F7_', 'F11', 'F15', 'G15', 'H1_', 'H3_', 'H7_', 'H15']
Count Ypos:Yneg 12 16
Hold out these patients for validation in each cross-validation fold.
Fold 0 pos, neg: ['D1_', 'B7_', 'D5_'] ['C1_', 'H3_', 'F3_', 'F7_']
Fold 1 pos, neg: ['E9_', 'F9_', 'I13'] ['F15', 'G15', 'F11', 'H7_']
Fold 2 pos, neg: ['H13', 'I1_', 'B15'] ['A5_', 'A3_', 'B13', 'H1_']
Fold 3 pos, neg: ['E7_', 'I5_', 'G3_'] ['C11', 'D3_', 'H15', 'E5_']


In [5]:
# Directory containing one sub-directory per patient, and the CSV file name inside each.
BASE_PATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'
FILENAME = 'Filtered_Image.csv'
def load_patient(p):
    """Read one patient's CSV into a DataFrame.

    Parameters:
        p: patient ID string; it names the sub-directory of BASE_PATH to read.

    Returns:
        The DataFrame parsed by pandas from BASE_PATH + p + '/' + FILENAME.
    """
    return pd.read_csv(''.join((BASE_PATH, p, '/', FILENAME)))

In [6]:
def load_patients(fold):
    """Split the patient IDs into training and validation lists for one fold.

    Despite the name, nothing is read from disk here; only IDs are selected.

    Parameters:
        fold: index into Ypos_valid_patients and Yneg_valid_patients.

    Returns:
        (train_patients, valid_patients): valid_patients lists the fold's
        held-out positives followed by its held-out negatives;
        train_patients is ALL_PATIENTS with those removed, order preserved.
    """
    held_out = list(Ypos_valid_patients[fold]) + list(Yneg_valid_patients[fold])
    remaining = list(ALL_PATIENTS)  # copy, so the global list is untouched
    for pid in held_out:
        remaining.remove(pid)
    return remaining,held_out

In [7]:
# Load csv rows for one fold
def _load_patient_set(patients):
    """Load every patient in `patients` and stack the rows.

    Returns (X, y, z): X is the concatenation of the patients' DataFrames,
    y is an int8 array holding each row's patient label, and z is a list
    holding each row's patient ID. X and y are None when `patients` is empty.
    """
    frames = []
    labels = []
    ids = []
    for patient in patients:
        df = load_patient(patient)
        label = PATIENT_TO_LABEL[patient]
        frames.append(df)
        labels.append(np.full(shape=len(df),fill_value=label,dtype=np.int8))
        ids.extend([patient]*len(df))
    if not frames:
        return None,None,ids
    # Concatenate once at the end instead of re-copying the growing result per patient.
    return pd.concat(frames),np.concatenate(labels),ids

def load_fold(fold,train_patients,valid_patients):
    """Load the patch rows and labels for one cross-validation fold.

    Parameters:
        fold: fold index; accepted for call compatibility but not used here.
        train_patients: patient IDs whose rows form the training set.
        valid_patients: patient IDs whose rows form the validation set.

    Returns:
        (X_train, y_train, X_valid, y_valid, z_valid): X_* are DataFrames of
        the loaded rows, y_* are int8 label arrays with one entry per row,
        and z_valid lists the patient ID of each validation row, retained
        for patient-level evaluation. X_* and y_* are None for an empty
        patient list.
    """
    X_train,y_train,_ = _load_patient_set(train_patients)
    X_valid,y_valid,z_valid = _load_patient_set(valid_patients)
    return X_train,y_train,X_valid,y_valid,z_valid

## Random Forest

In [8]:
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,z_valid):
    """Patient-level accuracy from patch-level predictions by equal-weight vote.

    A patient counts as correctly classified only when strictly more of its
    patches match PATIENT_TO_LABEL than do not; a tie counts as incorrect.

    Parameters:
        y_pred: predicted label for each patch.
        z_valid: patient ID for each patch, parallel to y_pred.

    Returns:
        Fraction (0.0 to 1.0) of distinct patients classified correctly,
        or 0.0 when there are no patches.

    Raises:
        Exception: if y_pred and z_valid differ in length.
    """
    if len(y_pred) != len(z_valid):
        raise Exception('Lengths of y and z do not match')
    # margin[patient] = (patches matching the true label) - (patches not matching)
    margin = {}
    for pid,vote in zip(z_valid,y_pred):
        truth = PATIENT_TO_LABEL[pid]
        step = 1 if vote == truth else -1
        margin[pid] = margin.get(pid,0) + step
    if not margin:
        return float(0)
    winners = sum(1 for m in margin.values() if m > 0)
    return float(winners/len(margin))

In [12]:
# Cross-validation driver: for each fold, train a fresh random forest on the
# training patients' patches and score the held-out patients at patch level and
# at patient level. One score per fold is appended to each list below.
patch_accuracies = []
patient_accuracies = []
for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now())
    train_patients,valid_patients = load_patients(fold)  
    X_train,y_train,X_valid,y_valid,z_valid = load_fold(fold,train_patients,valid_patients)
    
    print('Fold',fold,'patients train',train_patients,'patients valid',valid_patients)
    print('Fold:',fold,'patches train',X_train.shape,'patches valid',X_valid.shape)
    # CNN models are sensitive to train set order but RF models are not. 
    # NOTE(review): neither shuffle nor RandomForestClassifier is given a
    # random_state here, so results are expected to vary between runs.
    X_train,y_train = shuffle(X_train,y_train)
    rfc = RandomForestClassifier()
    # The frames returned by load_fold are passed as-is, so every CSV column is used as a feature.
    rfc.fit(X_train,y_train)  # slow
    
    # Patch-level accuracy is reported as a percentage (0-100).
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    patch_accuracies.append(accuracy)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy)
    
    # Patient-level accuracy is the fraction (0-1) returned by aggregate_accuracy,
    # so it is on a different scale from the patch-level figure above.
    accuracy = aggregate_accuracy(y_pred,z_valid)
    patient_accuracies.append(accuracy)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy)


2022-07-15 11:47:23.064585
Fold 0 patients train ['B15', 'E7_', 'E9_', 'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13', 'A3_', 'A5_', 'B13', 'C11', 'D3_', 'E5_', 'F11', 'F15', 'G15', 'H1_', 'H7_', 'H15'] patients valid ['D1_', 'B7_', 'D5_', 'C1_', 'H3_', 'F3_', 'F7_']
Fold: 0 patches train (5157, 5305) patches valid (1796, 5305)
Fold 0 Patch-level Validation Accuracy: 54.28730512249443
Fold 0 Patient-level Validation Accuracy: 0.5714285714285714

2022-07-15 11:47:46.309395
Fold 1 patients train ['B7_', 'B15', 'D1_', 'D5_', 'E7_', 'G3_', 'H13', 'I1_', 'I5_', 'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'E5_', 'F3_', 'F7_', 'H1_', 'H3_', 'H15'] patients valid ['E9_', 'F9_', 'I13', 'F15', 'G15', 'F11', 'H7_']
Fold: 1 patches train (5105, 5305) patches valid (1848, 5305)
Fold 1 Patch-level Validation Accuracy: 69.75108225108225
Fold 1 Patient-level Validation Accuracy: 0.7142857142857143

2022-07-15 11:48:09.175322
Fold 2 patients train ['B7_', 'D1_', 'D5_', 'E7_', 'E9_', 'F9_', 'G3_', 'I5_', 'I13'

In [13]:
# Summarise cross-validation: the per-fold scores, then their mean and standard deviation.
print(datetime.datetime.now())
for heading,fold_scores in (
        ('Cross validation patch-level accuracy:',patch_accuracies),
        ('Cross validation patient-level accuracy:',patient_accuracies)):
    print(heading,fold_scores)
    print('mean:',np.mean(fold_scores),'std:',np.std(fold_scores))


2022-07-15 11:48:55.319686
Cross validation patch-level accuracy: [54.28730512249443, 69.75108225108225, 64.66030989272944, 65.48129981606377]
mean: 63.54499927059247 std: 5.683606788094546
Cross validation patient-level accuracy: [0.5714285714285714, 0.7142857142857143, 0.5714285714285714, 0.7142857142857143]
mean: 0.6428571428571428 std: 0.07142857142857145
