# RBC Filter then Random Forest
This notebook is the opposite of notebook 03, which used only the columns whose names include "Nucleus". Here we use only the columns whose names do not include "Nucleus". The goal is to discover other important features.

In [1]:
# Random forest hyperparameters, passed to RandomForestClassifier in the main loop.
MAX_DEPTH = 8
N_ESTIMATORS = 1000
# Training-set subsampling: when the loader is called with save_mem=True it keeps
# only every CHOOSE_EVERY-th patch of each patient. SAVE_MEM is the value passed
# for the training set; the validation set is always loaded in full.
CHOOSE_EVERY = 10   # stride; only takes effect when save_mem is True
SAVE_MEM = True

In [2]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# The model
from sklearn.ensemble import RandomForestClassifier

2022-08-02 13:21:23.604834
Python 3.8.10
sklearn 1.0.2


In [3]:
# Patch data.
# This is the patch-level csv file: one row per patch, with nucleus totals from CellProfiler.
FILENAME='Process100_Image.csv'
# This directory contains one sub-directory per patient, each holding one Image.csv file
# (the loader reads BASE_PATH + patient + '/' + FILENAME).
# These csv files were slimmed to remove give-away columns.
# These csv files do contain high-RBC patches; consider filtering them.
BASE_PATH='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'  # alien

# Patient data.
# Each patch filename indicates patient/case [0:19] and WSI [0:23].
# For example: TCGA-06-0129-01Z-00-DX1_5400_5100.png
LEN_PATIENT_ID = 19   # NOTE(review): not referenced by the code visible in this notebook
# csv with columns Fold, Split, Case, Class: the train/valid assignment per fold.
FOLDS_META_FILE = '/home/jrm/Adjeroh/Glioma/August_Run/TrainValidSplits/aug_train_valid_splits.csv'
NUM_FOLDS = 5         # number of cross-validation folds iterated by the main loop
# Patches whose AreaOccupied_AreaOccupied_MergeRBC is >= this value are dropped
# by rbc_filter(); set to None to disable the filter.
RBC_CUTOFF = 5000

## Data loading functions

In [4]:
# Load the fold-assignment table once; get_metadata_one_fold() filters it by
# the 'Fold' and 'Split' columns and reads the 'Case' and 'Class' columns.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index
# that was saved into the csv -- confirm; it is not used by the code below.
METADATA = pd.read_csv(FOLDS_META_FILE)
METADATA  # bare expression: shows the table as the cell output

Unnamed: 0,Fold,Split,Case,Class
0,0,Train,TCGA-02-0004-01Z-00,0
1,0,Train,TCGA-02-0010-01Z-00,0
2,0,Train,TCGA-14-0789-01Z-00,0
3,0,Train,TCGA-02-0025-01Z-00,0
4,0,Train,TCGA-02-0033-01Z-00,0
...,...,...,...,...
630,4,Valid,TCGA-S9-A7J2-01Z-00,5
631,4,Valid,TCGA-02-0430-01Z-00,0
632,4,Valid,TCGA-28-1746-01Z-00,0
633,4,Valid,TCGA-HT-7676-01Z-00,1


In [5]:
def get_metadata_one_fold(fold,is_train):
    """Return the cases and class labels of one split of one fold.

    Parameters
    ----------
    fold : value compared against the 'Fold' column of METADATA.
    is_train : bool
        True selects rows whose 'Split' is 'Train', False selects 'Valid'.

    Returns
    -------
    (patients, labels) : two parallel lists taken from the 'Case' and
        'Class' columns of the selected rows, in table order.
    """
    wanted_split = 'Train' if is_train else 'Valid'
    # One combined mask selects the same rows as filtering by fold, then split.
    in_fold = METADATA['Fold']==fold
    in_split = METADATA['Split']==wanted_split
    subset = METADATA.loc[in_fold & in_split]
    return subset['Case'].tolist(), subset['Class'].tolist()

In [6]:
# Load one patient's Image.csv file.
# Expects the full path to the csv file, i.e.
# BASE_PATH + <patient directory, like 'TCGA-S9-A6UB-01Z-00'> + '/' + FILENAME.
# Returns a 2-D numpy array (rows = patches, columns = csv columns).
def load_one_patient_data(filepath):
    """Read a comma-delimited numeric csv file into a 2-D numpy array.

    Parameters
    ----------
    filepath : str
        Path of the csv file. Its first row is treated as column headers
        and skipped; every remaining field must parse as a number.

    Returns
    -------
    numpy.ndarray
        Always 2-D, one row per data line of the file.
    """
    # skiprows=1 skips the header row.
    # ndmin=2 keeps the result 2-D even when the file has a single data row;
    # without it loadtxt squeezes that row to 1-D, and the caller's row
    # slicing and len() would then operate on columns instead of patches.
    ary = np.loadtxt(filepath,skiprows=1,delimiter=',',ndmin=2)
    return ary

In [7]:
def load_all_patients_data(patients,labels,save_mem=False):
    """Load and stack the patch features of every requested patient.

    Parameters
    ----------
    patients : sequence of str
        Patient directory names under BASE_PATH; each must contain FILENAME.
    labels : sequence of int
        Class label of each patient, parallel to `patients`.
    save_mem : bool, default False
        If True, keep only every CHOOSE_EVERY-th patch of each patient.

    Returns
    -------
    X : pandas.DataFrame
        One row per patch; columns are named from the csv header of the
        first patient, with 'ImageNumber' dropped when present.
    y : numpy.ndarray of int8
        Label of each row of X.
    z : numpy.ndarray of str
        Patient name of each row of X.

    Raises
    ------
    ValueError
        If `patients` is empty or `patients` and `labels` differ in length.
    """
    if len(patients) == 0:
        raise ValueError('No patients requested')
    if len(patients) != len(labels):
        raise ValueError('patients and labels must have the same length')
    # Collect the per-patient pieces and concatenate once at the end.
    # Concatenating inside the loop re-copies everything loaded so far
    # on every iteration.
    X_parts = []
    y_parts = []
    z_parts = []
    for patient,label in zip(patients,labels):
        filepath = BASE_PATH+patient+'/'+FILENAME
        Xall = load_one_patient_data(filepath)
        if save_mem:
            # e.g. choose every tenth patch; copy() so the full array can be freed
            Xi = Xall[0::CHOOSE_EVERY].copy()
        else:
            Xi = Xall
        X_parts.append(Xi)
        y_parts.append(np.full(shape=len(Xi), fill_value=label, dtype=np.int8))
        # same ID for all patches from one patient
        z_parts.append(np.full(shape=len(Xi), fill_value=patient))
    # X combines all patches of all WSI for all patients requested.
    X = pd.DataFrame(np.concatenate(X_parts))
    y = np.concatenate(y_parts)
    z = np.concatenate(z_parts)
    # For debugging, add column headers to the dataframe.
    # The first patient's file is as good as any: read only its header row.
    header_path = BASE_PATH+patients[0]+'/'+FILENAME
    with open(header_path, newline='') as infile:
        header = next(csv.reader(infile))
    X.columns = header
    if 'ImageNumber' in X.columns:
        X = X.drop(columns=['ImageNumber'])
    return X,y,z   # patch data, patch labels, patch patient names

In [8]:
# Builds the training and validation sets of one cross-validation fold.
# NOTE: this step is slow; the original author suspected the repeated
# concatenation done while loading and suggested concatenating the csv
# data before constructing a dataframe.
def load_train_valid_set(fold):
    """Load patch data, labels and patient names for both splits of a fold.

    The training split is loaded with save_mem=SAVE_MEM; the validation
    split is always loaded in full (save_mem=False).

    Returns
    -------
    X_train, y_train, z_train, X_valid, y_valid, z_valid
    """
    train_patients,train_labels = get_metadata_one_fold(fold,True)   # True=train
    valid_patients,valid_labels = get_metadata_one_fold(fold,False)  # False=valid
    train_set = load_all_patients_data(train_patients,train_labels,SAVE_MEM)
    # Validation must never be subsampled, so save_mem stays False here.
    valid_set = load_all_patients_data(valid_patients,valid_labels,False)
    X_train,y_train,z_train = train_set
    X_valid,y_valid,z_valid = valid_set
    return X_train,y_train,z_train,X_valid,y_valid,z_valid

## Aggregation functions

In [9]:
# Patient-level accuracy from patch-level predictions.
# Every patch casts one vote: "correct" if its prediction equals its label,
# otherwise "incorrect". A patient counts as correctly classified only when
# its correct votes strictly outnumber its incorrect votes.
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,y_valid,z_valid):
    """Return the percentage of patients whose patches are mostly correct.

    Parameters
    ----------
    y_pred : sequence of predicted labels, one per patch.
    y_valid : sequence of true labels, one per patch.
    z_valid : sequence of patient names, one per patch.

    Returns
    -------
    float
        100 * (patients with more correct than incorrect patches) / (patients),
        or 0.0 when there are no patients.

    Raises
    ------
    Exception
        If the three inputs do not have the same length.
    """
    n_patches = len(y_pred)
    if not (n_patches == len(y_valid) and n_patches == len(z_valid)):
        raise Exception('Lengths do not match')
    # tally[patient] = [correct votes, incorrect votes]
    tally = {name: [0, 0] for name in np.unique(z_valid)}
    for idx in range(n_patches):
        slot = 0 if y_pred[idx] == y_valid[idx] else 1
        tally[z_valid[idx]][slot] += 1
    n_patients = len(tally)
    n_won = sum(1 for right, wrong in tally.values() if right > wrong)
    if n_patients > 0:
        return 100.0*n_won/n_patients
    return float(0)

## Main loop: Load, Classify, Report

In [10]:
def important_features(model):
    """Rank a fitted model's features by importance, highest first.

    Prereqs: fit() must have been called, and the model must expose both
    `feature_names_in_` and `feature_importances_`.

    Parameters
    ----------
    model : fitted estimator with the two attributes above.

    Returns
    -------
    pandas.DataFrame
        Column 0 holds the importance (float) and column 1 the feature
        name, sorted by descending importance with a default 0..n-1 index.
    """
    names = np.asarray(model.feature_names_in_, dtype=object)
    importances = np.asarray(model.feature_importances_, dtype=float)
    # Sort the numeric importances directly. Stacking names and importances
    # into one array can coerce the numbers to strings (when names have a
    # str dtype), which would make the ranking lexicographic.
    # kind='stable' keeps tied features in their original order.
    order = np.argsort(-importances, kind='stable')
    top_df = pd.DataFrame({0: importances[order], 1: names[order]})
    return top_df

In [11]:
def rbc_filter(X,y,z,cutoff=None):
    """Drop patches whose red-blood-cell area reaches the cutoff.

    Parameters
    ----------
    X : pandas.DataFrame
        Patch features; must contain 'AreaOccupied_AreaOccupied_MergeRBC'.
    y : array-like
        Label per row of X.
    z : array-like
        Patient name per row of X.
    cutoff : number or None, default None
        Rows with an RBC area >= cutoff are removed. None means "use the
        notebook-level RBC_CUTOFF"; if that is also None nothing is filtered.

    Returns
    -------
    X, y, z with the same rows removed from each. X keeps its original
    index labels; y and z are returned as numpy arrays when filtering ran.
    """
    if cutoff is None:
        cutoff = RBC_CUTOFF
    if cutoff is None:
        return X,y,z
    # One positional boolean mask is applied to all three containers.
    # Passing the dataframe's index labels to np.delete is only correct
    # while X still has a default 0..n-1 index.
    too_much_rbc = (X['AreaOccupied_AreaOccupied_MergeRBC']>=cutoff).to_numpy()
    keep = ~too_much_rbc
    X = X.loc[keep]             # pandas dataframe
    y = np.asarray(y)[keep]     # numpy array
    z = np.asarray(z)[keep]
    return X,y,z

In [12]:
# Seed for the random forest so that Restart & Run All reproduces the same
# accuracies; without it every run draws different bootstrap samples.
RANDOM_SEED = 42

patch_accuracies = []   # summary statistics: patch-level validation accuracy per fold
patient_accuracies = [] # summary statistics: patient-level validation accuracy per fold


def _print_shapes(X_train,y_train,z_train,X_valid,y_valid,z_valid):
    """Print the row/column counts of the current train and valid sets."""
    print('Train shapes X,y,z:',X_train.shape,len(y_train),len(z_train))
    print('Valid shapes X,y,z:',X_valid.shape,len(y_valid),len(z_valid))


for fold in range(NUM_FOLDS):
    print()
    print(datetime.now(),'Fold',fold,'Loading...')
    X_train,y_train,z_train,X_valid,y_valid,z_valid = load_train_valid_set(fold)
    _print_shapes(X_train,y_train,z_train,X_valid,y_valid,z_valid)

    print(datetime.now(),'Fold',fold,'RBC Filter...')
    X_train,y_train,z_train = rbc_filter(X_train,y_train,z_train)
    X_valid,y_valid,z_valid = rbc_filter(X_valid,y_valid,z_valid)
    _print_shapes(X_train,y_train,z_train,X_valid,y_valid,z_valid)

    print(datetime.now(),'Fold',fold,'Reduce columns...')
    # Keep only the columns whose name does not mention Nucleus.
    good_cols = [c for c in X_train.columns if 'Nucleus' not in c]
    X_train = X_train[good_cols]
    X_valid = X_valid[good_cols]
    _print_shapes(X_train,y_train,z_train,X_valid,y_valid,z_valid)

    # No shuffle: CNN models are sensitive to train set order but RF models are not.

    print(datetime.now(),'Fold',fold,'Train...')
    # Original note: min_samples_leaf=1 (default) led to overfitting.
    # min_samples_leaf is still left at its default here; tree depth is capped by MAX_DEPTH.
    rfc = RandomForestClassifier(max_depth=MAX_DEPTH,n_estimators=N_ESTIMATORS,
                                 random_state=RANDOM_SEED)
    rfc.fit(X_train,y_train)  # slow

    print(datetime.now(),'Fold',fold,'Ranked feature imporances...')
    top = important_features(rfc)
    #pd.set_option('display.max_rows', None)
    # .loc slicing is label-inclusive, so this prints rows 0 through 10 (11 rows).
    print(top.loc[:10])

    print(datetime.now(),'Fold',fold,'Evaluate...')
    # Accuracy on the training patches themselves (optimistic; shows degree of fit).
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy_t = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Training Accuracy:',accuracy_t)

    print(datetime.now(),'Fold',fold,'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy_v = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy_v)
    patch_accuracies.append(accuracy_v)

    # Patient-level accuracy: patches of one patient vote together.
    accuracy_p = aggregate_accuracy(y_pred,y_valid,z_valid)
    patient_accuracies.append(accuracy_p)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy_p)

    # This shouldn't be necessary but it seems to reduce memory footprint.
    X_train=None
    X_valid=None
    y_train=None
    y_valid=None
    z_train=None
    z_valid=None
    rfc = None
    y_pred = None
    matches = None


2022-08-02 13:21:24.499416 Fold 0 Loading...
Train shapes X,y,z: (7592, 5301) 7592 7592
Valid shapes X,y,z: (15500, 5301) 15500 15500
2022-08-02 13:22:41.707001 Fold 0 RBC Filter...
Train shapes X,y,z: (6893, 5301) 6893 6893
Valid shapes X,y,z: (14084, 5301) 14084 14084
2022-08-02 13:22:42.052061 Fold 0 Reduce columns...
Train shapes X,y,z: (6893, 3350) 6893 6893
Valid shapes X,y,z: (14084, 3350) 14084 14084
2022-08-02 13:22:42.218327 Fold 0 Train...
2022-08-02 13:24:39.469088 Fold 0 Ranked feature imporances...
           0                                                  1
0   0.025485                          Granularity_2_Hematoxylin
1   0.013244  Mean_ExpandCells_Texture_Correlation_Hematoxyl...
2   0.012485  Median_ExpandCells_Texture_Correlation_Hematox...
3   0.012142  Mean_ExpandCells_Texture_Correlation_Hematoxyl...
4   0.011515  Median_ExpandCells_Texture_Correlation_Hematox...
5   0.007446           Texture_Correlation_Hematoxylin_7_03_256
6   0.006150           Texture_Co

In [13]:
# Cross-validation summary: per-fold accuracies with their mean and
# standard deviation, first at patch level and then at patient level.
print(datetime.now())
for heading, fold_scores in (
        ('Cross validation patch-level accuracy:', patch_accuracies),
        ('Cross validation patient-level accuracy:', patient_accuracies)):
    print(heading,fold_scores)
    print('mean:',np.mean(fold_scores),'std:',np.std(fold_scores))


2022-08-02 13:36:38.732056
Cross validation patch-level accuracy: [54.032945186026694, 58.685674830429306, 52.579689106635215, 52.80658326085502, 41.28108151605375]
mean: 51.87719477999999 std: 5.738249522979222
Cross validation patient-level accuracy: [38.46153846153846, 34.61538461538461, 32.0, 40.0, 32.0]
mean: 35.41538461538461 std: 3.294463708141335
