# Random Forest
Try leave-one-out cross validation.

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-07-19 18:07:30.583915
Python 3.8.10
sklearn 1.0.2


In [2]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

# This generates one split after shuffling. By default, not stratified.
# from sklearn.model_selection import train_test_split
# This does stratified K-fold cross-validation with no shuffling.
# from sklearn.model_selection import cross_val_score

#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.metrics import confusion_matrix
#from sklearn.feature_selection import RFE
#import joblib # used to dump/load sklearn models
#from CellProfiler_Util import CP_Util
#from RandomForestUtil import RF_Util

## Prepare train and valid sets

In [3]:
def list_shuffle(X):   # could use skilearn shuffle instead
    L = len(X)
    for i in range(L):
        r = random.randrange(0,L)
        if i!=r:
            temp = X[i]
            X[i] = X[r]
            X[r] = temp
    return X

In [4]:
# Prepare to divy up patches stratified by patient.
DF_Ypos = ['B7_','B15','D1_','D5_','E7_','E9_','F9_','G3_','H13','I1_','I5_','I13']
DF_Yneg = ['A3_','A5_','B13','C1_','C11','D3_','E5_','F3_','F7_','F11','F15','G15','H1_','H3_','H7_','H15']
ALL_PATIENTS = DF_Ypos + DF_Yneg
PATIENT_TO_LABEL = {}
for patient in DF_Ypos:
    PATIENT_TO_LABEL[patient]=np.int8(1)
for patient in DF_Yneg:
    PATIENT_TO_LABEL[patient]=np.int8(0)
print('All patients',ALL_PATIENTS)
print('Count Ypos:Yneg',len(DF_Ypos),len(DF_Yneg))

All patients ['B7_', 'B15', 'D1_', 'D5_', 'E7_', 'E9_', 'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13', 'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'E5_', 'F3_', 'F7_', 'F11', 'F15', 'G15', 'H1_', 'H3_', 'H7_', 'H15']
Count Ypos:Yneg 12 16


In [5]:
BASE_PATH='/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'
FILENAME='Filtered_Image.csv'
def load_patient(p):
    filepath=BASE_PATH+p+'/'+FILENAME
    df = pd.read_csv(filepath)
    return df

In [6]:
def load_patients(fold):
    train_patients = list(ALL_PATIENTS)  # this will shrink
    valid_patients = []   # this will grow
    for patient in Ypos_valid_patients[fold]:
        train_patients.remove(patient)
        valid_patients.append(patient)
    for patient in Yneg_valid_patients[fold]:
        train_patients.remove(patient)
        valid_patients.append(patient)
    return train_patients,valid_patients

In [7]:
# Load csv rows for one fold given list of train names (strings) and one valid name.
def load_fold(train_patients,valid_patient):
    X_train=None
    y_train=None
    X_valid=None
    y_valid=None
    z_valid=None
    for patient in train_patients:
        df = load_patient(patient)
        label = PATIENT_TO_LABEL[patient]
        if X_train is None:
            X_train = df
            y_train = np.full(shape=len(df),fill_value=label,dtype=np.int8)
        else:
            X_train = pd.concat( (X_train,df) )
            more = np.full(shape=len(df),fill_value=label,dtype=np.int8)
            y_train = np.concatenate( (y_train, more) )
    patient = valid_patient
    df = load_patient(patient)
    label = PATIENT_TO_LABEL[patient]
    X_valid = df
    y_valid = np.full(shape=len(df),fill_value=label,dtype=np.int8)
    z_valid = [patient]*len(df)  # retain patient ID for evaluation
    return X_train,y_train,X_valid,y_valid,z_valid

## Random Forest

In [8]:
# Later, weight each patch label by the confidence i.e. score
# This is specialized for leave-one-out: num patients = 1 
def aggregate_accuracy(y_pred,z_valid):
    L = len(y_pred)
    if L != len(z_valid):
        raise Exception('Lengths of y and z do not match')
    correct = 0
    incorrect = 0
    for i in range(L):
        patient = z_valid[i]
        label = PATIENT_TO_LABEL[patient]
        pred = y_pred[i]
        if pred == label:
            correct += 1
        else:
            incorrect += 1
    numerator = correct
    denominator = correct + incorrect
    accuracy = 100.0 * numerator/denominator 
    return accuracy

In [9]:
patch_accuracies = []
patient_accuracies = []
NUM_FOLDS = len(ALL_PATIENTS)
for fold in range(NUM_FOLDS):
    train_patients = list(ALL_PATIENTS)
    valid_patient = train_patients[fold]
    train_patients.remove(valid_patient)
    if len(train_patients)!=len(ALL_PATIENTS)-1:
        raise Exception('Train test split not as expected!')

    print()
    print(datetime.datetime.now(),'Fold',fold,'Validate patient',valid_patient)
    X_train,y_train,X_valid,y_valid,z_valid = load_fold(train_patients,valid_patient)
    
    #print('Fold',fold,'patients train',train_patients,'patients valid',valid_patient)
    print('Fold:',fold,'patches train',X_train.shape,'patches valid',X_valid.shape)
    # CNN models are sensitive to train set order but RF models are not. 
    X_train,y_train = shuffle(X_train,y_train)
    rfc = RandomForestClassifier(min_samples_leaf=4)
    rfc.fit(X_train,y_train)  # slow
    
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Training Accuracy:',accuracy)
    
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy)
    patch_accuracies.append(accuracy)
    
    accuracy = aggregate_accuracy(y_pred,z_valid)
    patient_accuracies.append(accuracy)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy)


2022-07-19 18:07:31.035807 Fold 0 Validate patient B7_
Fold: 0 patches train (6675, 5305) patches valid (278, 5305)
Fold 0 Patch-level Training Accuracy: 98.96629213483146
Fold 0 Patch-level Validation Accuracy: 17.26618705035971
Fold 0 Patient-level Validation Accuracy: 17.26618705035971

2022-07-19 18:07:58.907809 Fold 1 Validate patient B15
Fold: 1 patches train (6688, 5305) patches valid (265, 5305)
Fold 1 Patch-level Training Accuracy: 99.11782296650718
Fold 1 Patch-level Validation Accuracy: 36.9811320754717
Fold 1 Patient-level Validation Accuracy: 36.9811320754717

2022-07-19 18:08:28.633857 Fold 2 Validate patient D1_
Fold: 2 patches train (6667, 5305) patches valid (286, 5305)
Fold 2 Patch-level Training Accuracy: 99.35503224838759
Fold 2 Patch-level Validation Accuracy: 48.6013986013986
Fold 2 Patient-level Validation Accuracy: 48.6013986013986

2022-07-19 18:08:59.190746 Fold 3 Validate patient D5_
Fold: 3 patches train (6701, 5305) patches valid (252, 5305)
Fold 3 Patch-l

In [10]:
print(datetime.datetime.now())
print('Cross validation patch-level accuracy:',patch_accuracies)
print('mean:',np.mean(patch_accuracies),'std:',np.std(patch_accuracies))
print('Cross validation patient-level accuracy:',patient_accuracies)
print('mean:',np.mean(patient_accuracies),'std:',np.std(patient_accuracies))


2022-07-19 18:20:31.366558
Cross validation patch-level accuracy: [17.26618705035971, 36.9811320754717, 48.6013986013986, 41.26984126984127, 66.00985221674877, 97.61092150170649, 22.006472491909385, 49.3421052631579, 45.45454545454545, 41.89723320158103, 64.31226765799256, 27.6, 50.925925925925924, 87.6847290640394, 95.29411764705883, 56.33187772925764, 92.4914675767918, 35.23809523809524, 37.407407407407405, 77.2, 32.916666666666664, 86.77685950413223, 91.90140845070422, 63.70967741935484, 98.91696750902527, 92.33716475095785, 91.89189189189189, 66.66666666666667]
mean: 61.28724572259602 std: 25.41975235700361
Cross validation patient-level accuracy: [17.26618705035971, 36.9811320754717, 48.6013986013986, 41.26984126984127, 66.00985221674877, 97.61092150170649, 22.006472491909385, 49.3421052631579, 45.45454545454545, 41.89723320158103, 64.31226765799256, 27.6, 50.925925925925924, 87.6847290640394, 95.29411764705883, 56.33187772925764, 92.4914675767918, 35.23809523809524, 37.4074074074