# Random Forest 03
Notebook originally written for lung cancer images, modified here for glioma images. After bug fixes, this ran to completion.

Train a default random forest on 6-way cancer classification using just Image.csv data.

Do 5-fold cross-validation, and aggregate patch-level predictions up to the WSI level. We are getting low accuracy here.

For now, aggregate by majority voting, with each patch casting an equal vote. We have a bug that gives aggregate accuracy 0.

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-07-19 14:55:15.302502
Python 3.8.10
sklearn 1.0.2


In [2]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Inputs come from the Data notebooks: csv files that were already
# filtered to drop give-away columns and bad rows.
BASE_PATH = '/home/jrm/Adjeroh/Naved/July_Run/CellProfilerFiltered/'
# Patch-level csv file: one row per patch, with nucleus totals.
# The nucleus-specific csv files are to be incorporated later.
FILENAME = 'Process100_Image.csv'
# The splits directory contains lists of patch filenames,
# one train/valid pair of files per cross-validation fold.
# In a patch filename, [0:19] is the case ID and [0:23] is the WSI ID;
# the last column is the 6-way cancer class (zero to five) of the case.
# For example, fold0_train.txt starts:
# TCGA-06-0129-01Z-00-DX1_5400_5100.png, 0
# TCGA-06-0129-01Z-00-DX1_5700_6000.png, 0
LEN_PATIENT_ID = 19
SPLITS_BASE = '/home/jrm/Adjeroh/Naved/July_Run/TrainTestSplit/'
NUM_FOLDS = 5
# One (train file, valid file) tuple per fold, indexed by fold number.
SPLITS_FILES = [
    ('fold%d_train.txt' % k, 'fold%d_test.txt' % k)
    for k in range(NUM_FOLDS)
]
PATIENT_TO_CANCER = {}  # hash case ID to class number

## Data loading functions

In [4]:
# sklearn can do this but we were stumbling on shape & type problems
def list_shuffle(X):
    """Shuffle the mutable sequence X in place and return it.

    The previous hand-rolled loop swapped every position with an index
    drawn from the whole range, which does not make all permutations
    equally likely. random.shuffle produces an unbiased shuffle.

    X is expected to be a plain Python list (or another mutable sequence
    whose items can be swapped by index). The same object is returned so
    existing callers that use the return value keep working.
    """
    random.shuffle(X)
    return X

In [5]:
# Given a case ID = patient name = directory, load the csv file.
# Expect a value like p='TCGA-S9-A6UB-01Z-00'
def load_patient_data(p):
    """Read one patient's patch-level csv file into a DataFrame.

    p is a case ID such as 'TCGA-S9-A6UB-01Z-00'; it names the directory
    under BASE_PATH that holds the file called FILENAME.
    """
    csv_path = ''.join((BASE_PATH, p, '/', FILENAME))
    return pd.read_csv(csv_path)

In [6]:
# Given fold, load the case/patient IDs separated into training and validation.
# Fold should be an int between 0 and 4 for 5-fold cross validation.
# The returned list has unique strings like  TCGA-06-0129-01Z-00
def _load_patient_names(filename):
    """Return the sorted, unique case IDs listed in one split file.

    Each non-blank row is expected to hold a patch filename followed by
    the integer cancer class. The case ID is the first LEN_PATIENT_ID
    characters of the patch filename.

    Side effect: every case ID seen is recorded in the module-level
    PATIENT_TO_CANCER dict (case ID -> class number).

    Raises ValueError (a subclass of Exception) if a case ID appears
    with a class different from the one already recorded.
    """
    seen = set()
    with open(filename) as infile:
        for row in csv.reader(infile):
            if not row:
                # csv.reader yields [] for a blank line; nothing to parse.
                continue
            patient = row[0][:LEN_PATIENT_ID]
            cancer_class = int(row[1])
            # setdefault records a first sighting and returns the class
            # already on file otherwise, so one lookup covers both cases.
            recorded = PATIENT_TO_CANCER.setdefault(patient, cancer_class)
            if recorded != cancer_class:
                raise ValueError('One patient in two classes:',patient)
            seen.add(patient)
    # np.unique sorts the IDs, as the original implementation did.
    return list(np.unique(list(seen)))
def load_patient_names(fold):
    """Return (train case IDs, valid case IDs) for one fold.

    fold indexes SPLITS_FILES; each entry is a (train file, valid file)
    pair located under SPLITS_BASE.
    """
    train_file, valid_file = SPLITS_FILES[fold]
    return (
        _load_patient_names(SPLITS_BASE + train_file),
        _load_patient_names(SPLITS_BASE + valid_file),
    )

In [7]:
def _load_train_valid_set1(patients):
    """Load patch features for the given patients via pandas.

    Returns (X, y, z):
      X - DataFrame of all patients' rows, in patient order
      y - int8 numpy array holding the cancer class of each row
      z - list holding the case ID of each row, for aggregation
    X and y are None when patients is empty.

    Per-patient pieces are collected first and concatenated once at the
    end; concatenating inside the loop recopied all earlier rows on
    every iteration.
    """
    frames = []
    label_chunks = []
    z = []  # patient_id for aggregation
    for patient in patients:
        df = load_patient_data(patient)
        label = PATIENT_TO_CANCER[patient]
        frames.append(df)
        label_chunks.append(
            np.full(shape=len(df), fill_value=label, dtype=np.int8))
        z.extend([patient]*len(df))  # same ID for all patches from one patient
    if not frames:
        return None, None, z
    X = pd.concat(frames)
    y = np.concatenate(label_chunks)
    return X,y,z

In [8]:
# Read csv into list and hold off building the dataframe till the end.
# This runs out of memory!
def _load_train_valid_set2(patients):
    """Load patch features by reading each csv row by row.

    Returns (X, y, z):
      X - DataFrame whose columns are taken from the first file's header
      y - list holding the cancer class of each row
      z - list holding the case ID of each row, for aggregation

    Each data row is parsed into a float64 numpy array as it is read.
    Keeping every cell as a Python string (as before) costs far more
    memory per value and is the likely cause of the out-of-memory
    failure; it also produced an all-string frame. Like the
    np.loadtxt-based loader, this requires every cell to be numeric.

    With no patients an empty DataFrame is returned instead of failing
    on a missing header.
    """
    numeric_rows = []  # one float64 array per patch
    y = []  # labels = cancer class
    z = []  # patient_id for aggregation
    header = None
    patient_count = 0
    for patient in patients:
        row_count = 0
        patient_count += 1
        filepath=BASE_PATH+patient+'/'+FILENAME
        label = PATIENT_TO_CANCER[patient]
        with open(filepath) as infile:
            for row in csv.reader(infile):
                if row_count == 0:
                    # First line of every file is its header; keep only
                    # the one from the first file.
                    if header is None:
                        header = row
                else:
                    numeric_rows.append(np.array(row, dtype=np.float64))
                    y.append(label)
                    z.append(patient)
                row_count += 1
        # NOTE: row_count includes the header line of the file.
        print('Patients:',patient_count,'Rows:',row_count)
    if header is None:
        return pd.DataFrame(), y, z
    if numeric_rows:
        data = np.vstack(numeric_rows)
    else:
        data = np.empty((0, len(header)))
    X = pd.DataFrame(data, columns=header)
    return X,y,z

In [9]:
# Slurp csv into numpy array.
# This works!
def _load_train_valid_set3(patients):
    """Load patch features by slurping each csv into a numpy array.

    Returns (X, y, z):
      X - DataFrame of all rows; column names come from the header of
          the first patient's file
      y - int8 numpy array holding the cancer class of each row
      z - numpy array holding the case ID of each row, for aggregation

    Raises IndexError if patients is empty, and ValueError if the first
    patient's file has no header line.
    """
    first_path = BASE_PATH+patients[0]+'/'+FILENAME
    with open(first_path) as infile:
        header = next(csv.reader(infile), None)
    if header is None:
        raise ValueError('Empty csv file: '+first_path)

    feature_chunks = []
    label_chunks = []
    z = []  # patient_id for aggregation
    for patient in patients:
        filepath=BASE_PATH+patient+'/'+FILENAME
        label = PATIENT_TO_CANCER[patient]
        # ndmin=2 keeps a file with a single data row two-dimensional;
        # otherwise len(Xi) would be the number of columns, not rows.
        Xi = np.loadtxt(filepath,skiprows=1,delimiter=',',ndmin=2)
        if Xi.size == 0:
            continue  # no patches for this patient
        feature_chunks.append(Xi)
        label_chunks.append(
            np.full(shape=len(Xi), fill_value=label, dtype=np.int8))
        z.extend([patient]*len(Xi))  # same ID for all patches from one patient
    # Concatenate once; doing it inside the loop recopies earlier rows.
    X = pd.DataFrame(np.concatenate(feature_chunks), columns=header)
    y = np.concatenate(label_chunks)
    z = np.array(z)
    return X,y,z

In [10]:
# Load csv rows for one set of train+valid patient names.
# Loading goes through the numpy-based _load_train_valid_set3.
def load_train_valid_set(train_patient_names,valid_patient_names):
    """Load features, labels and case IDs for a train and a valid set.

    Returns X_train, y_train, z_train, X_valid, y_valid, z_valid.

    Raises ValueError before any file is read if a case ID appears in
    both lists, since that would leak validation patients into training.
    """
    overlap = set(train_patient_names) & set(valid_patient_names)
    if overlap:
        raise ValueError(
            'Patients in both train and valid sets: '
            + ', '.join(sorted(str(p) for p in overlap)))

    print('Load train set...')
    X_train,y_train,z_train = _load_train_valid_set3(train_patient_names)
    print('Loaded lengths X,y,z:',X_train.shape,len(y_train),len(z_train))
    print('Load valid set...')
    X_valid,y_valid,z_valid = _load_train_valid_set3(valid_patient_names)
    print('Loaded lengths X,y,z:',X_valid.shape,len(y_valid),len(z_valid))

    return X_train,y_train,z_train,X_valid,y_valid,z_valid

## Aggregation functions

In [11]:
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,z_valid,labels=None):
    """Return patient-level accuracy (percent) by majority vote of patches.

    y_pred  - predicted class of each patch
    z_valid - case ID of each patch, same length as y_pred
    labels  - optional mapping of case ID -> true class; defaults to the
              module-level PATIENT_TO_CANCER

    Every patch casts one equal vote for its predicted class. A patient
    counts as correct when the true class receives strictly more votes
    than any other class; a tie for first place counts as incorrect.
    The earlier test (correct patches > incorrect patches) is only
    equivalent for two classes: with more classes it demanded an
    absolute majority for the true class.

    Returns 0.0 when there are no patches.
    Raises ValueError (a subclass of Exception) on a length mismatch.
    """
    if len(y_pred) != len(z_valid):
        raise ValueError('Lengths of y and z do not match')
    if labels is None:
        labels = PATIENT_TO_CANCER

    # votes[patient][predicted class] = number of patches voting for it
    votes = {}
    for patient, pred in zip(z_valid, y_pred):
        tally = votes.setdefault(patient, {})
        tally[pred] = tally.get(pred, 0) + 1

    if not votes:
        return float(0)

    numerator = 0
    for patient, tally in votes.items():
        truth = labels[patient]
        truth_votes = tally.get(truth, 0)
        rival_votes = [n for cls, n in tally.items() if cls != truth]
        if truth_votes > max(rival_votes, default=0):
            numerator += 1
    return 100.0*numerator/len(votes)

## Main loop: Load, Classify, Report

In [12]:
# Per-fold validation accuracies, summarized once the loop finishes.
patch_accuracies = []
patient_accuracies = []
for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now(),'Fold',fold,'Loading...')
    train_patients,valid_patients = load_patient_names(fold)
    X_train,y_train,z_train,X_valid,y_valid,z_valid = \
        load_train_valid_set(train_patients,valid_patients)
    # print('Fold',fold,'patients train',train_patients,'patients valid',valid_patients)
    # print('Fold:',fold,'patches train',X_train.shape,'patches valid',X_valid.shape)

    # This shuffle is pro forma, not strictly necessary.
    # CNN models are sensitive to train set order but RF models are not.
    # z_train is shuffled in tandem so that it stays aligned with the
    # rows of X_train and y_train.
    print(datetime.datetime.now(),'Shuffle...')
    X_train,y_train,z_train = shuffle(X_train,y_train,z_train)

    print(datetime.datetime.now(),'Train...')
    # n_jobs=-1 builds the trees on all available cores; this is the
    # slow step of each fold.
    rfc = RandomForestClassifier(n_jobs=-1)
    rfc.fit(X_train,y_train)  # slow

    print(datetime.datetime.now(),'Evaluate...')
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Training Accuracy:',accuracy)

    print(datetime.datetime.now(),'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy)
    patch_accuracies.append(accuracy)

    accuracy = aggregate_accuracy(y_pred,z_valid)
    patient_accuracies.append(accuracy)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy)

    # Drop references to the large per-fold objects so their memory can
    # be reclaimed before the next fold is loaded.
    X_train = X_valid = None
    y_train = y_valid = None
    z_train = z_valid = None
    rfc = None
    y_pred = None
    matches = None


2022-07-19 14:55:15.750348 Fold 0 Loading...
Load train set...
Patients: 1 Rows: 393
Patients: 2 Rows: 379
Patients: 3 Rows: 391
Patients: 4 Rows: 638
Patients: 5 Rows: 1100
Patients: 6 Rows: 1346
Patients: 7 Rows: 368
Patients: 8 Rows: 744
Patients: 9 Rows: 2387
Patients: 10 Rows: 699
Patients: 11 Rows: 1108
Patients: 12 Rows: 1596
Patients: 13 Rows: 327
Patients: 14 Rows: 689
Patients: 15 Rows: 547
Patients: 16 Rows: 788
Patients: 17 Rows: 305
Patients: 18 Rows: 368
Patients: 19 Rows: 2889
Patients: 20 Rows: 365
Patients: 21 Rows: 1068
Patients: 22 Rows: 572
Patients: 23 Rows: 2402
Patients: 24 Rows: 759
Patients: 25 Rows: 1591
Patients: 26 Rows: 1344
Patients: 27 Rows: 2604
Patients: 28 Rows: 3110
Patients: 29 Rows: 780
Patients: 30 Rows: 724
Patients: 31 Rows: 357
Patients: 32 Rows: 389
Patients: 33 Rows: 391
Patients: 34 Rows: 1537
Patients: 35 Rows: 338
Patients: 36 Rows: 366
Patients: 37 Rows: 289
Patients: 38 Rows: 353
Patients: 39 Rows: 768
Patients: 40 Rows: 725
Patients: 41

Patients: 1 Rows: 385
Patients: 2 Rows: 1563
Patients: 3 Rows: 698
Patients: 4 Rows: 292
Patients: 5 Rows: 395
Patients: 6 Rows: 788
Patients: 7 Rows: 392
Patients: 8 Rows: 383
Patients: 9 Rows: 457
Patients: 10 Rows: 177
Patients: 11 Rows: 802
Patients: 12 Rows: 389
Patients: 13 Rows: 401
Patients: 14 Rows: 401
Patients: 15 Rows: 380
Patients: 16 Rows: 393
Patients: 17 Rows: 312
Patients: 18 Rows: 3214
Patients: 19 Rows: 747
Patients: 20 Rows: 1650
Patients: 21 Rows: 287
Patients: 22 Rows: 350
Patients: 23 Rows: 382
Patients: 24 Rows: 392
Patients: 25 Rows: 143
ndarray (15773, 5302)
dataframe (15773, 5302)
<class 'list'>
['AreaOccupied_AreaOccupied_ExpandCells', 'AreaOccupied_AreaOccupied_MergeRBC', 'AreaOccupied_AreaOccupied_Nucleus', 'AreaOccupied_AreaOccupied_Tissue', 'AreaOccupied_Perimeter_ExpandCells', 'AreaOccupied_Perimeter_MergeRBC', 'AreaOccupied_Perimeter_Nucleus', 'AreaOccupied_Perimeter_Tissue', 'Count_Cells', 'Count_ExpandCells', 'Count_MergeRBC', 'Count_Nucleus', 'Count

2022-07-19 14:56:50.324304 Train...
2022-07-19 15:04:19.890312 Evaluate...
Fold 0 Patch-level Training Accuracy: 100.0
2022-07-19 15:04:22.786322 Validate...
Fold 0 Patch-level Validation Accuracy: 48.43720281493692
Fold 0 Patient-level Validation Accuracy: 40.0

2022-07-19 15:04:23.522129 Fold 1 Loading...
Load train set...
Patients: 1 Rows: 385
Patients: 2 Rows: 698
Patients: 3 Rows: 393
Patients: 4 Rows: 379
Patients: 5 Rows: 292
Patients: 6 Rows: 395
Patients: 7 Rows: 788
Patients: 8 Rows: 638
Patients: 9 Rows: 392
Patients: 10 Rows: 1346
Patients: 11 Rows: 368
Patients: 12 Rows: 744
Patients: 13 Rows: 2387
Patients: 14 Rows: 699
Patients: 15 Rows: 1108
Patients: 16 Rows: 1596
Patients: 17 Rows: 383
Patients: 18 Rows: 689
Patients: 19 Rows: 457
Patients: 20 Rows: 547
Patients: 21 Rows: 788
Patients: 22 Rows: 305
Patients: 23 Rows: 368
Patients: 24 Rows: 2889
Patients: 25 Rows: 365
Patients: 26 Rows: 2402
Patients: 27 Rows: 759
Patients: 28 Rows: 1591
Patients: 29 Rows: 1344
Patient

KeyboardInterrupt: 

In [None]:
# Summarize the per-fold accuracies collected by the main loop.
print(datetime.datetime.now())
for title, scores in (
    ('Cross validation patch-level accuracy:', patch_accuracies),
    ('Cross validation patient-level accuracy:', patient_accuracies),
):
    print(title, scores)
    print('mean:', np.mean(scores), 'std:', np.std(scores))
