# Random Forest 15
Grid search on RF parameters.   
RF08 73.?% acc used every 10th, max_depth=5,  n_estimators=500. BUG   
RF10 55.5% acc used every 10th, max_depth=5,  n_estimators=500.    
RF11 54.6% acc used every  2nd, max_depth=5,  n_estimators=500.    
RF12 55.4% acc used every 10th, max_depth=10, n_estimators=500.    
RF13 55.4% acc used every 10th, max_depth=6,  n_estimators=500.    



In [1]:
# Run parameters for this notebook.
# MAX_DEPTH is passed as max_depth to RandomForestClassifier in the main loop.
MAX_DEPTH = 5
# N_ESTIMATORS is passed as n_estimators to RandomForestClassifier in the main loop.
N_ESTIMATORS = 2000
# CHOOSE_EVERY is the row stride used when subsampling: the loader keeps every
# CHOOSE_EVERY-th row of each patient file when called with save_mem=True
# (which the notebook does for the training set only).
CHOOSE_EVERY = 10

In [2]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
# The alias plt conventionally names the pyplot interface, not the top-level
# matplotlib package, so bind it to pyplot.
import matplotlib.pyplot as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-07-23 12:27:12.261442
Python 3.9.6
sklearn 1.1.1


In [3]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

In [4]:
import sys   # cell-local: only needed to pick this machine's paths below

# Our input is the output from the Data notebooks.
# These csv files were filtered to remove give-away columns and bad rows.
#
# Each known machine has its own path separator and data roots.
# Select the set that matches the current platform instead of relying on
# whichever of several repeated assignments happens to come last.
#   BASE_PATH   : root of the per-patient directories holding FILENAME.
#   SPLITS_BASE : directory containing lists of patch filenames,
#                 divided into 5 folds of train/valid splits.
# Both roots end with the separator so callers can append names directly.
if sys.platform.startswith('win'):      # windows
    SLASH='\\'
    BASE_PATH='D:\\Adjeroh\\Naved\\July_Run\\CellProfilerFiltered\\'
    SPLITS_BASE='D:\\Adjeroh\\Naved\\July_Run\\TrainTestSplit\\'
elif sys.platform == 'darwin':          # air (presumably the macOS machine -- confirm)
    SLASH='/'
    BASE_PATH='/Users/jasonmiller/Downloads/CellProfilerFiltered/'
    SPLITS_BASE='/Users/jasonmiller/Downloads/TrainTestSplit/'
else:                                   # alien (presumably the Linux machine -- confirm)
    SLASH='/'
    BASE_PATH='/home/jrm/Adjeroh/Naved/July_Run/CellProfilerFiltered/'
    SPLITS_BASE='/home/jrm/Adjeroh/Naved/July_Run/TrainTestSplit/'

# This is the patch-level csv file: one row per patch, with nucleus totals.
# Later, incorporate the nucleus-specific csv files.
FILENAME='Process100_Image.csv'

# Notice patch filename [0:19] is case ID, [0:23] is WSI ID,
# and last column is the 6-way cancer class (zero to five) of the case.
# For example, fold0_train.txt starts:
# TCGA-06-0129-01Z-00-DX1_5400_5100.png, 0
# TCGA-06-0129-01Z-00-DX1_5700_6000.png, 0
LEN_PATIENT_ID = 19
SPLITS_FILES = [
    ('fold0_train.txt','fold0_test.txt'),
    ('fold1_train.txt','fold1_test.txt'),
    ('fold2_train.txt','fold2_test.txt'),
    ('fold3_train.txt','fold3_test.txt'),
    ('fold4_train.txt','fold4_test.txt'),
]
# Derived from the list above so the two can never disagree.
NUM_FOLDS = len(SPLITS_FILES)
PATIENT_TO_CANCER = {}  # hash case ID to class number

## Data loading functions

In [5]:
# Given a case ID = patient name = directory, load the csv file.
# Expect a value like p='TCGA-S9-A6UB-01Z-00'
def load_patient_data(p):
    """Read one patient's patch-level csv file into a DataFrame.

    Parameters:
        p: case ID, which is also the name of the patient's directory
           under BASE_PATH.

    Returns:
        The DataFrame produced by pandas.read_csv for that patient's FILENAME.
    """
    # Use the configured separator, as the numpy-based loader in this
    # notebook does, rather than a hard-coded '/'.
    filepath=BASE_PATH+p+SLASH+FILENAME
    return pd.read_csv(filepath)

In [6]:
# Given fold, load the case/patient IDs separated into training and validation.
# Fold should be an int between 0 and 4 for 5-fold cross validation.
# The returned list has unique strings like  TCGA-06-0129-01Z-00
def _load_patient_names(filename):
    """Return the unique patient IDs listed in one split file.

    Every non-blank row must hold a patch filename followed by an integer
    cancer class. The first LEN_PATIENT_ID characters of the patch filename
    are taken as the patient ID.

    Side effect: each patient's class is recorded in PATIENT_TO_CANCER.

    Parameters:
        filename: path of the split file to read.

    Returns:
        A list of the distinct patient IDs, in the sorted order produced
        by np.unique.

    Raises:
        Exception: if a patient is listed with a class that differs from
            the one already recorded in PATIENT_TO_CANCER.
    """
    patients=[]
    # newline='' is what the csv module asks for when given a file object.
    with open(filename, newline='') as infile:
        for row in csv.reader(infile):
            if not row:
                # csv.reader yields [] for a blank line; skip it instead of
                # failing on row[0].
                continue
            patient = row[0][:LEN_PATIENT_ID]
            cancer_class = int(row[1])
            # setdefault records the class on first sight and otherwise
            # returns the class recorded earlier.
            known_class = PATIENT_TO_CANCER.setdefault(patient, cancer_class)
            if known_class != cancer_class:
                raise Exception('One patient in two classes:',patient)
            patients.append(patient)
    return list(np.unique(patients))
def load_patient_names(fold):
    """Load the patient IDs of one cross-validation fold.

    Parameters:
        fold: index into SPLITS_FILES selecting the (train, valid) file pair.

    Returns:
        A tuple (train_patients, valid_patients), each the list returned by
        _load_patient_names for the corresponding split file.
    """
    train_file, valid_file = SPLITS_FILES[fold]
    train_patients = _load_patient_names(SPLITS_BASE+train_file)
    valid_patients = _load_patient_names(SPLITS_BASE+valid_file)
    return train_patients,valid_patients

In [7]:
# Slurp csv into numpy array.
def _load_train_valid_set3(patients,save_mem=False):
    """Load the patch rows of the given patients into X, y, z.

    Parameters:
        patients: sequence of patient IDs; each must be a key of
            PATIENT_TO_CANCER and name a directory under BASE_PATH.
        save_mem: if True, keep only every CHOOSE_EVERY-th row of each
            patient's file; if False, keep every row.

    Returns:
        X: DataFrame of the numeric csv rows, with column names taken from
           the header row of the first patient's file.
        y: int8 numpy array holding the cancer class of each row.
        z: numpy array of strings holding the patient ID of each row,
           for aggregation.

    Raises:
        ValueError: if patients is empty.
    """
    if len(patients) == 0:
        raise ValueError('No patients given; nothing to load')
    X_parts = []  # one 2-D array per patient
    y_parts = []  # one label array per patient
    z = []        # patient ID per row
    for patient in patients:
        filepath=BASE_PATH+patient+SLASH+FILENAME
        label = PATIENT_TO_CANCER[patient]
        # ndmin=2 keeps a one-row file 2-D, so len() below counts rows
        # rather than columns.
        Xall = np.loadtxt(filepath,skiprows=1,delimiter=',',ndmin=2)
        if save_mem:
            # The copy lets the full array be freed; a slice alone is a view.
            Xi = Xall[0::CHOOSE_EVERY].copy()  # e.g. choose every tenth row
        else:
            Xi = Xall
        X_parts.append(Xi)
        y_parts.append(np.full(shape=len(Xi), fill_value=label, dtype=np.int8))
        z.extend([patient]*len(Xi))  # same ID for all patches from one patient
    # Concatenate once at the end; concatenating inside the loop re-copies
    # all rows gathered so far on every iteration.
    X = pd.DataFrame(np.concatenate(X_parts))
    y = np.concatenate(y_parts)
    z = np.array(z, dtype=str)
    # DataFrame column headers are not required for the machine learning.
    # We'll add them just for debugging. Arbitrarily use first row of first file.
    header_path=BASE_PATH+patients[0]+SLASH+FILENAME
    with open(header_path, newline='') as infile:
        header = next(csv.reader(infile))
    X.columns = header
    return X,y,z

In [8]:
# Load csv rows for one set of train+valid patient names.
# This is slow. Consider a rewrite that reads each csv only once
# and avoids repeated concatenation.
def load_train_valid_set(train_patient_names,valid_patient_names):
    """Load the training and validation sets for one fold.

    The training set is subsampled (the loader is called with
    save_mem=True); the validation set is loaded in full.

    Parameters:
        train_patient_names: patient IDs for training.
        valid_patient_names: patient IDs for validation.

    Returns:
        X_train, y_train, z_train, X_valid, y_valid, z_valid

    Raises:
        ValueError: if any patient appears in both lists, since validating
            on a training patient would leak information.
    """
    # Check before loading so a bad split fails fast.
    overlap = sorted(set(train_patient_names) & set(valid_patient_names))
    if overlap:
        raise ValueError(
            'Patients in both train and valid sets: '+', '.join(map(str,overlap)))
    X_train,y_train,z_train = _load_train_valid_set3(train_patient_names,True)
    print('Train lengths X,y,z:',X_train.shape,len(y_train),len(z_train))
    X_valid,y_valid,z_valid = _load_train_valid_set3(valid_patient_names,False)
    print('Valid lengths X,y,z:',X_valid.shape,len(y_valid),len(z_valid))
    return X_train,y_train,z_train,X_valid,y_valid,z_valid

## Aggregation functions

In [9]:
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,z_valid):
    """Compute patient-level accuracy, in percent, from patch predictions.

    A patient counts as correctly classified when strictly more of its
    patches were predicted as the patient's class (looked up in
    PATIENT_TO_CANCER) than were predicted as anything else.

    Parameters:
        y_pred: predicted class per patch.
        z_valid: patient ID per patch, aligned with y_pred.

    Returns:
        100 * (correct patients) / (all patients) as a float,
        or 0.0 when there are no patients.

    Raises:
        Exception: if y_pred and z_valid differ in length.
    """
    num_patches = len(y_pred)
    if len(z_valid) != num_patches:
        raise Exception('Lengths of y and z do not match')
    # Per patient: [patches predicted right, patches predicted wrong]
    tallies = {name: [0, 0] for name in np.unique(z_valid)}
    for predicted, name in zip(y_pred, z_valid):
        truth = PATIENT_TO_CANCER[name]
        slot = 0 if predicted == truth else 1
        tallies[name][slot] += 1
    num_patients = len(tallies)
    if num_patients == 0:
        return float(0)
    num_won = sum(1 for hits, misses in tallies.values() if hits > misses)
    return 100.0*num_won/num_patients

## Main loop: Load, Classify, Report

In [10]:
def important_features(model):
    """Rank a fitted model's features by importance, highest first.

    Prereqs: fit() must have been called, so that the model exposes
    feature_names_in_ and feature_importances_.

    Parameters:
        model: object with feature_names_in_ and feature_importances_.

    Returns:
        DataFrame with one row per feature, column 0 = importance and
        column 1 = feature name, sorted by importance in descending order
        and re-indexed from zero.
    """
    names = model.feature_names_in_
    importances = model.feature_importances_
    # Pair the values up in Python rather than stacking them into one numpy
    # array: stacking a string array with a float array turns the floats
    # into strings, and the ranking would then be lexicographic.
    ranked = pd.DataFrame(list(zip(importances, names)), columns=[0, 1])
    # A stable sort keeps tied features in their original order.
    ranked = ranked.sort_values(by=0, ascending=False, kind='stable')
    return ranked.reset_index(drop=True)

In [11]:
# Fixed seed so the forest, and therefore the reported accuracies,
# are reproducible from run to run.
RANDOM_STATE = 42

patch_accuracies = []   # per-fold patch-level validation accuracy (percent)
patient_accuracies = [] # per-fold patient-level validation accuracy (percent)

for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now(),'Fold',fold,'Loading...')
    train_patients,valid_patients = load_patient_names(fold)
    X_train,y_train,z_train,X_valid,y_valid,z_valid = \
        load_train_valid_set(train_patients,valid_patients)
    # print('Fold',fold,'patients train',train_patients,'patients valid',valid_patients)
    # print('Fold:',fold,'patches train',X_train.shape,'patches valid',X_valid.shape)

    # This shuffle is pro forma, not strictly necessary.
    # CNN models are sensitive to train set order but RF models are not.
    # print(datetime.datetime.now(),'Shuffle...')
    # X_train,y_train = shuffle(X_train,y_train)

    print(datetime.datetime.now(),'Train...')
    # min_samples_leaf=1 (default) led to overfitting
    # n_jobs=-1 builds the trees on all available cores; without it the
    # forest is trained on a single core.
    rfc = RandomForestClassifier(
        max_depth=MAX_DEPTH,
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,
        random_state=RANDOM_STATE)
    rfc.fit(X_train,y_train)  # slow

    print(datetime.datetime.now(),'Ranked feature imporances...')
    top = important_features(rfc)
    #pd.set_option('display.max_rows', None)
    # .loc slicing is label-based and inclusive, so this shows rows 0..10.
    print(top.loc[:10])

    print(datetime.datetime.now(),'Evaluate...')
    # Accuracy on the training patches, reported for comparison with validation.
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy_t = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Training Accuracy:',accuracy_t)

    print(datetime.datetime.now(),'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy_v = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy_v)
    patch_accuracies.append(accuracy_v)

    # Aggregate the patch predictions into one verdict per patient.
    accuracy_p = aggregate_accuracy(y_pred,z_valid)
    patient_accuracies.append(accuracy_p)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy_p)

    # This shouldn't be necessary but it seems to reduce memory footprint.
    X_train=None
    X_valid=None
    y_train=None
    y_valid=None
    z_train=None
    z_valid=None
    rfc = None
    y_pred = None
    matches = None


2022-07-23 12:27:13.797417 Fold 0 Loading...
Train lengths X,y,z: (6035, 5302) 6035 6035
Valid lengths X,y,z: (15773, 5302) 15773 15773
2022-07-23 12:30:17.193006 Train...


KeyboardInterrupt: 

In [None]:
# Summarize the cross-validation results gathered by the main loop:
# the per-fold accuracies, then their mean and standard deviation.
print(datetime.datetime.now())
for title, scores in (
        ('Cross validation patch-level accuracy:', patch_accuracies),
        ('Cross validation patient-level accuracy:', patient_accuracies)):
    print(title,scores)
    print('mean:',np.mean(scores),'std:',np.std(scores))
