# Random Forest
As in notebook 03, this notebook uses the outputs of CSV_prep and CSV_filter.
Here we run 4-fold cross-validation, holding out a different set of patients in each fold, instead of just one train/validation round.

In [1]:
# Standard library
import datetime
import random
from platform import python_version

# Third-party
import numpy as np
import pandas as pd
# pyplot is the module conventionally aliased as `plt`; the bare `matplotlib`
# package does not expose plotting functions such as plot().
import matplotlib.pyplot as plt
import sklearn   # pip install --upgrade scikit-learn

# Record when, and with which versions, this notebook was run.
print(datetime.datetime.now())
print('Python',python_version())
print('sklearn',sklearn.__version__)

2022-07-15 10:41:48.975679
Python 3.8.10
sklearn 1.0.2


In [2]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

# This generates one split after shuffling. By default, not stratified.
# from sklearn.model_selection import train_test_split
# This does stratified K-fold cross-validation with no shuffling.
# from sklearn.model_selection import cross_val_score

#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.metrics import confusion_matrix
#from sklearn.feature_selection import RFE
#import joblib # used to dump/load sklearn models
#from CellProfiler_Util import CP_Util
#from RandomForestUtil import RF_Util

## Prepare train and valid sets

In [3]:
def list_shuffle(X):   # could use sklearn shuffle instead
    """Shuffle the mutable sequence X in place and return it.

    Delegates to random.shuffle (Fisher-Yates), so every permutation is
    equally likely. A hand-rolled loop that swaps each position with an
    index drawn from the whole range does not give a uniform distribution
    over permutations.

    Parameters
    ----------
    X : list
        Sequence to shuffle. It is modified in place.

    Returns
    -------
    list
        The very same object that was passed in, now shuffled.
    """
    random.shuffle(X)   # draws from the global `random` module state
    return X

In [4]:
# Patient IDs grouped by class, used to divvy up patches stratified by patient.
# Positive patients (their patches get label 1 when a fold is loaded):
DF_Ypos = [
    'B7_','B15','D1_','D5_','E7_','E9_',
    'F9_','G3_','H13','I1_','I5_','I13',
]
# Negative patients (their patches get label 0 when a fold is loaded):
DF_Yneg = [
    'A3_','A5_','B13','C1_','C11','D3_','E5_','F3_',
    'F7_','F11','F15','G15','H1_','H3_','H7_','H15',
]
# Positives first, then negatives, as one new list.
ALL_PATIENTS = [*DF_Ypos, *DF_Yneg]
# Four folds, because the length of each class list is a multiple of 4.
NUM_FOLDS = 4
print('All patients',ALL_PATIENTS)
print('Count Ypos:Yneg',len(DF_Ypos),len(DF_Yneg))
print('Hold out these patients for validation in each cross-validation fold.')
def _split_evenly(items, k):
    """Cut items into k contiguous chunks whose sizes differ by at most one."""
    size, extra = divmod(len(items), k)
    chunks = []
    start = 0
    for i in range(k):
        # The first `extra` chunks absorb the remainder, one item each.
        stop = start + size + (1 if i < extra else 0)
        chunks.append(items[start:stop])
        start = stop
    return chunks

def make_folds(seed=None):
    """Randomly assign positive and negative patients to NUM_FOLDS folds.

    Each class is shuffled separately and cut into NUM_FOLDS contiguous
    chunks, so every fold holds out patients from both classes. With 12
    positives, 16 negatives and 4 folds this gives 3 + 4 patients per fold.

    Parameters
    ----------
    seed : int or None, optional
        When given, the global `random` module is seeded with it first so
        that the fold assignment is reproducible. The default (None) leaves
        the random state untouched.

    Returns
    -------
    (list of list of str, list of list of str)
        folds_pos, folds_neg: for each fold index, the positive and the
        negative patient IDs held out for validation.
    """
    if seed is not None:
        random.seed(seed)
    # Shuffle copies so the module-level patient lists keep their order.
    rpos = list_shuffle(list(DF_Ypos))
    rneg = list_shuffle(list(DF_Yneg))
    # Fold boundaries follow NUM_FOLDS and the list lengths.
    folds_pos = _split_evenly(rpos, NUM_FOLDS)
    folds_neg = _split_evenly(rneg, NUM_FOLDS)
    for i in range(NUM_FOLDS):
        print('Fold %d pos, neg:'%i,end=' ')
        print(folds_pos[i],folds_neg[i])
    return folds_pos,folds_neg
# Pass a seed (e.g. make_folds(seed=0)) to get the same folds on every run.
Ypos_valid_patients,Yneg_valid_patients = make_folds()

All patients ['B7_', 'B15', 'D1_', 'D5_', 'E7_', 'E9_', 'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13', 'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'E5_', 'F3_', 'F7_', 'F11', 'F15', 'G15', 'H1_', 'H3_', 'H7_', 'H15']
Count Ypos:Yneg 12 16
Hold out these patients for validation in each cross-validation fold.
Fold 0 pos, neg: ['D5_', 'E7_', 'B15'] ['D3_', 'A5_', 'C11', 'H7_']
Fold 1 pos, neg: ['I13', 'B7_', 'G3_'] ['F11', 'A3_', 'B13', 'H3_']
Fold 2 pos, neg: ['I1_', 'H13', 'E9_'] ['H1_', 'G15', 'F15', 'E5_']
Fold 3 pos, neg: ['F9_', 'D1_', 'I5_'] ['H15', 'F3_', 'F7_', 'C1_']


In [5]:
# Default location of the per-patient CSV files: BASE_PATH/<patient>/FILENAME
BASE_PATH='/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'
FILENAME='Filtered_Image.csv'
def load_patient(p, base_path=None, filename=None):
    """Read one patient's CSV file into a DataFrame.

    Parameters
    ----------
    p : str
        Patient ID; also the name of the patient's sub-directory.
    base_path : str or None, optional
        Directory that contains the patient sub-directories.
        Defaults to the module-level BASE_PATH.
    filename : str or None, optional
        Name of the CSV file inside the patient directory.
        Defaults to the module-level FILENAME.

    Returns
    -------
    pandas.DataFrame
        Whatever pd.read_csv parses from <base_path>/<p>/<filename>.
    """
    import os  # local import: the notebook's import cell does not import os
    root = BASE_PATH if base_path is None else base_path
    name = FILENAME if filename is None else filename
    # os.path.join works whether or not `root` ends with a path separator.
    filepath = os.path.join(root, p, name)
    return pd.read_csv(filepath)

In [6]:
def load_patients(fold):
    """Split the patient IDs into train and validation lists for one fold.

    No files are read here; only the ID lists are built.

    Parameters
    ----------
    fold : int
        Index into Ypos_valid_patients / Yneg_valid_patients.

    Returns
    -------
    (list, list)
        train_patients: ALL_PATIENTS minus the held-out IDs, original order.
        valid_patients: the fold's positive IDs followed by its negative IDs.
    """
    held_out = list(Ypos_valid_patients[fold]) + list(Yneg_valid_patients[fold])
    remaining = list(ALL_PATIENTS)   # copy, so the master list is untouched
    for patient_id in held_out:
        remaining.remove(patient_id)   # ValueError if the ID is not present
    return remaining, held_out

In [7]:
# Load csv rows for one fold
def _load_labeled_patients(patients):
    """Read every listed patient and return (features, labels).

    Labels are 1.0 for patients listed in DF_Ypos and 0.0 for all others,
    one label per CSV row. Returns (None, None) when `patients` is empty.
    """
    frames = []
    labels = []
    for patient in patients:
        df = load_patient(patient)
        make_labels = np.ones if patient in DF_Ypos else np.zeros
        frames.append(df)
        labels.append(make_labels(len(df)))
    if not frames:
        return None, None
    # Concatenate once at the end instead of re-copying on every iteration.
    return pd.concat(frames), np.concatenate(labels)

# Load csv rows for one fold
def load_fold(fold,train_patients,valid_patients):
    """Load the feature rows and labels for one cross-validation fold.

    Parameters
    ----------
    fold : int
        Fold index. Not used by this function; kept so existing calls
        keep working.
    train_patients : list of str
        Patient IDs whose rows form the training set.
    valid_patients : list of str
        Patient IDs whose rows form the validation set.

    Returns
    -------
    X_train, y_train, X_valid, y_valid
        X_* are DataFrames of the patients' CSV rows, stacked in list order
        with each file's original row index kept. y_* are float arrays of
        1s (patient in DF_Ypos) and 0s. A side whose patient list is empty
        comes back as (None, None).
    """
    X_train,y_train = _load_labeled_patients(train_patients)
    X_valid,y_valid = _load_labeled_patients(valid_patients)
    return X_train,y_train,X_valid,y_valid



## Random Forest

In [8]:
# Train and score one random forest per cross-validation fold, collecting
# each fold's patch-level validation accuracy (in percent) in `accuracies`.
accuracies = []
for fold in range(NUM_FOLDS):
    # Patient ID lists for this fold, then the CSV rows and 0/1 labels for each side.
    train_patients,valid_patients = load_patients(fold)  
    X_train,y_train,X_valid,y_valid = load_fold(fold,train_patients,valid_patients)
    print()
    print(datetime.datetime.now())
    print('Fold',fold,'patients train',train_patients,'patients valid',valid_patients)
    print('Fold:',fold,'patches train',X_train.shape,'patches valid',X_valid.shape)
    # CNN models are sensitive to train set order but RF models are not.
    # NOTE(review): neither shuffle() nor RandomForestClassifier() is given a
    # random_state, so the accuracies differ from run to run.
    X_train,y_train = shuffle(X_train,y_train)
    rfc = RandomForestClassifier()
    rfc.fit(X_train,y_train)  # slow
    
    # Accuracy = percentage of validation rows whose prediction equals the label.
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    accuracies.append(accuracy)
    print('Fold',fold,'Validation Accuracy:',accuracy)


2022-07-15 10:41:55.874495
Fold 0 patients train ['B7_', 'D1_', 'E9_', 'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13', 'A3_', 'B13', 'C1_', 'E5_', 'F3_', 'F7_', 'F11', 'F15', 'G15', 'H1_', 'H3_', 'H15'] patients valid ['D5_', 'E7_', 'B15', 'D3_', 'A5_', 'C11', 'H7_']
Fold: 0 patches train (5305, 5305) patches valid (1648, 5305)
Fold 0 Validation Accuracy: 65.77669902912622

2022-07-15 10:42:19.280829
Fold 1 patients train ['B15', 'D1_', 'D5_', 'E7_', 'E9_', 'F9_', 'H13', 'I1_', 'I5_', 'A5_', 'C1_', 'C11', 'D3_', 'E5_', 'F3_', 'F7_', 'F15', 'G15', 'H1_', 'H7_', 'H15'] patients valid ['I13', 'B7_', 'G3_', 'F11', 'A3_', 'B13', 'H3_']
Fold: 1 patches train (5299, 5305) patches valid (1654, 5305)
Fold 1 Validation Accuracy: 64.32889963724304

2022-07-15 10:42:42.044554
Fold 2 patients train ['B7_', 'B15', 'D1_', 'D5_', 'E7_', 'F9_', 'G3_', 'I5_', 'I13', 'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'F3_', 'F7_', 'F11', 'H3_', 'H7_', 'H15'] patients valid ['I1_', 'H13', 'E9_', 'H1_', 'G15', 'F15', 'E

In [9]:
# Report the per-fold validation accuracies and their mean and spread.
print(datetime.datetime.now())
print('Cross validation patch-level accuracy:',accuracies)
# np.std is called with its default ddof=0, i.e. the population standard deviation.
print('mean:',np.mean(accuracies),'std:',np.std(accuracies))


2022-07-15 10:43:21.362792
Cross validation patch-level accuracy: [65.77669902912622, 64.32889963724304, 69.3565976008724, 59.65877820583379]
mean: 64.78024361826887 std: 3.4773933671607815
