# Random Forest 07
Get feature importances. 
At the very least, this demonstrates that we didn't expose a give-away field like patient ID.
The most important features vary between folds.
They often involve mean-or-median nucleus shape-or-texture.

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2022-07-20 08:08:02.409359
Python 3.8.10
sklearn 1.0.2


In [2]:
# Shuffle X,y in tandem -- can cause shape problems
from sklearn.utils import shuffle
# The model
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Our input is the output from the Data notebooks.
# These csv files were filtered to remove give-away columns and bad rows.
BASE_PATH='/home/jrm/Adjeroh/Naved/July_Run/CellProfilerFiltered/'
# This is the patch-level csv file: one row per patch, with nucleus totals.
# Later, incorporate the nucleus-specific csv files.
FILENAME='Process100_Image.csv'
# This directory contains lists of patch filenames,
# divided into 5 folds of train/valid splits.
# Notice patch filename [0:19] is case ID, [0:23] is WSI ID,
# and last column is the 6-way cancer class (zero to five) of the case.
# For example, fold0_train.txt starts:
# TCGA-06-0129-01Z-00-DX1_5400_5100.png, 0
# TCGA-06-0129-01Z-00-DX1_5700_6000.png, 0
LEN_PATIENT_ID = 19
SPLITS_BASE='/home/jrm/Adjeroh/Naved/July_Run/TrainTestSplit/'
# One (train, valid) pair of split files per fold, indexed by fold number.
SPLITS_FILES = [
    ('fold0_train.txt','fold0_test.txt'),
    ('fold1_train.txt','fold1_test.txt'),
    ('fold2_train.txt','fold2_test.txt'),
    ('fold3_train.txt','fold3_test.txt'),
    ('fold4_train.txt','fold4_test.txt'),
]
# Derived from the list above so the fold count cannot drift from the files.
NUM_FOLDS = len(SPLITS_FILES)
# Hash case ID to class number. Filled in as the split files are read.
PATIENT_TO_CANCER = {}

## Data loading functions

In [4]:
# sklearn can do this but we were stumbling on shape & type problems
def list_shuffle(X):
    """Shuffle the list X in place, uniformly at random, and return it.

    The previous hand-rolled loop swapped every position with an index drawn
    from the whole range. That produces len(X)**len(X) equally likely swap
    sequences, which cannot map evenly onto the len(X)! orderings, so some
    orderings came up more often than others. random.shuffle draws every
    ordering with equal probability.

    Parameters: X -- a mutable sequence such as a list; it is modified.
    Returns: the same object X, now shuffled.
    """
    random.shuffle(X)
    return X

In [5]:
# Load the patch-level csv file for one case ID (= patient name = directory).
# Expect a value like p='TCGA-S9-A6UB-01Z-00'
def load_patient_data(p):
    """Read BASE_PATH/<p>/FILENAME with pandas and return the DataFrame."""
    csv_path = ''.join((BASE_PATH, p, '/', FILENAME))
    return pd.read_csv(csv_path)

In [6]:
# Given fold, load the case/patient IDs separated into training and validation.
# Fold should be an in between 0 and 4 for 5-fold cross validation.
# The returned list has unique strings like  TCGA-06-0129-01Z-00
def _load_patient_names(filename):
    patients=[]
    with open(filename) as infile:
        rows = csv.reader(infile)
        for row in rows:
            patient = row[0][:LEN_PATIENT_ID]
            cancer_class = int(row[1])
            if patient in PATIENT_TO_CANCER:
                if PATIENT_TO_CANCER[patient] != cancer_class:
                    raise Exception('One patient in two classes:',patient)
            else:
                PATIENT_TO_CANCER[patient] = cancer_class
            patients.append(patient)
    patients = list(np.unique(patients))
    return patients
def load_patient_names(fold):
    """Return (train_patients, valid_patients) for the given fold number.

    The fold indexes SPLITS_FILES; each entry names the train file and the
    valid file, both located under SPLITS_BASE.
    """
    train_file, valid_file = SPLITS_FILES[fold]
    return (
        _load_patient_names(SPLITS_BASE + train_file),
        _load_patient_names(SPLITS_BASE + valid_file),
    )

In [7]:
# Read csv into pandas dataframes and concatenate dataframes.
# This is too slow! Like 10 min per invocation.
def _load_train_valid_set1(patients):
    X = None  # instances with features
    y = None  # labels = cancer class
    z = []  # patient_id for aggregation
    for patient in patients:
        df = load_patient_data(patient)
        label = PATIENT_TO_CANCER[patient]
        if X is None:
            X = df 
            y = np.full(shape=len(df), fill_value=label, dtype=np.int8)
        else:
            X = pd.concat( (X, df) )
            more = np.full(shape=len(df), fill_value=label, dtype=np.int8)
            y = np.concatenate( (y, more) )
        z = z + [patient]*len(df)  # same ID for all patches from one patient
    return X,y,z

In [8]:
# Read csv into list and hold off building the dataframe till the end.
# This runs out of memory!
def _load_train_valid_set2(patients):
    """Load every patient's csv with the csv module into one DataFrame.

    Data rows are kept exactly as csv.reader yields them. The column names
    come from the first line of the first file read; the first line of each
    later file is dropped. One progress line is printed per patient, whose
    'Rows:' figure counts every line of that file including its first line.

    Returns X (DataFrame), y (list of labels), z (list of patient IDs).
    """
    features = []        # instances with features
    labels = []          # labels = cancer class
    owners = []          # patient_id for aggregation
    column_names = None
    for patient_number, patient in enumerate(patients, start=1):
        csv_path = BASE_PATH+patient+'/'+FILENAME
        cancer_class = PATIENT_TO_CANCER[patient]
        lines_seen = 0
        with open(csv_path) as infile:
            for line_index, fields in enumerate(csv.reader(infile)):
                lines_seen = line_index + 1
                if line_index > 0:
                    features.append(fields)
                    labels.append(cancer_class)
                    owners.append(patient)
                elif column_names is None:
                    column_names = fields
        print('Patients:',patient_number,'Rows:',lines_seen)
    X = pd.DataFrame(features)
    X.columns = column_names
    return X,labels,owners

In [9]:
# Slurp csv into numpy array.
# This works!
def _load_train_valid_set3(patients,save_mem=False):
    X = None  # instances with features
    y = None  # labels = cancer class
    z = None  # patient_id for aggregation
    for patient in patients:
        filepath=BASE_PATH+patient+'/'+FILENAME
        label = PATIENT_TO_CANCER[patient]
        Xall = np.loadtxt(filepath,skiprows=1,delimiter=',')
        if (save_mem):
            Xi = Xall[0::2].copy()  # every even row
        else:
            Xi = Xall
        yi = np.full(shape=len(Xi), fill_value=label, dtype=np.int8)
        zi = [patient]*len(Xi)  # same ID for all patches from one patient
        if X is None:
            X = Xi
            y = yi
            z = zi
        else:
            X = np.concatenate( (X, Xi) )
            y = np.concatenate( (y, yi) )
            z = np.concatenate( (z, zi) )
    X = pd.DataFrame(X)
    # DataFrame column headers are not required for the machine learning.
    # We'll add them just for debugging. Arbitrarily use first row of first file.
    patient = patients[0]
    filepath=BASE_PATH+patient+'/'+FILENAME
    with open(filepath) as infile:
        rows = csv.reader(infile)
        for row in rows:
            header = row
            break
    X.columns = header
    return X,y,z

In [10]:
# Load csv rows for one set of train+valid patient names.
def load_train_valid_set(train_patient_names,valid_patient_names):
    """Load the training and validation patches for one fold.

    Both sets are loaded with save_mem=True, i.e. only every other patch row
    of each patient file is kept. The loaded sizes are printed.

    Returns:
        X_train, y_train, z_train, X_valid, y_valid, z_valid

    Raises:
        ValueError: if any patient appears in both name lists. Validating on
            a patient the model was trained on would inflate the accuracy.
    """
    overlap = set(train_patient_names) & set(valid_patient_names)
    if overlap:
        raise ValueError(
            'Patients in both train and valid sets: '
            + ', '.join(sorted(map(str, overlap))))
    X_train,y_train,z_train = _load_train_valid_set3(train_patient_names,True)
    print('Loaded lengths X,y,z:',X_train.shape,len(y_train),len(z_train))
    X_valid,y_valid,z_valid = _load_train_valid_set3(valid_patient_names,True)
    print('Loaded lengths X,y,z:',X_valid.shape,len(y_valid),len(z_valid))
    return X_train,y_train,z_train,X_valid,y_valid,z_valid

## Aggregation functions

In [11]:
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,z_valid):
    """Return the patient-level accuracy, as a percentage.

    y_pred[i] is the class predicted for patch i and z_valid[i] is the
    patient that patch belongs to. A patient counts as correct when strictly
    more of its patches match the patient's class in PATIENT_TO_CANCER than
    do not. The result is 100 * correct patients / all patients, or 0.0
    when there are no patients.

    Raises:
        Exception: if y_pred and z_valid differ in length.
    """
    num_patches = len(y_pred)
    if len(z_valid) != num_patches:
        raise Exception('Lengths of y and z do not match')
    # votes[patient] = [patches predicted right, patches predicted wrong]
    votes = {who: [0, 0] for who in np.unique(z_valid)}
    for idx in range(num_patches):
        who = z_valid[idx]
        truth = PATIENT_TO_CANCER[who]
        slot = 0 if y_pred[idx] == truth else 1
        votes[who][slot] += 1
    if not votes:
        return float(0)
    patients_right = sum(1 for right, wrong in votes.values() if right > wrong)
    return 100.0*patients_right/len(votes)

## Main loop: Load, Classify, Report

In [12]:
def important_features(model):
    """Rank the model's features from most to least important.

    Prereqs: fit(), on data that carried feature names, so that the model
    has feature_names_in_ and feature_importances_.

    The ranking is done on the numeric importances directly. Stacking names
    and importances into one array first can turn the numbers into strings
    (when the names have a string dtype), which sorts them alphabetically.

    Returns a DataFrame with one row per feature: column 0 is the
    importance, column 1 is the feature name. Ties keep their original order.
    """
    names = np.asarray(model.feature_names_in_)
    importances = np.asarray(model.feature_importances_, dtype=float)
    # Negate so the stable ascending sort yields descending importance.
    order = np.argsort(-importances, kind='stable')
    return pd.DataFrame(list(zip(importances[order], names[order])))

In [13]:
# Cross validation: for each fold, load the data, train a random forest,
# print its top-ranked features, and record patch- and patient-level accuracy.
patch_accuracies = []   # summary statistics
patient_accuracies = [] # summary statistics
# Fixed seed so the shuffle order and the forest are the same on every run.
# Without it, accuracies and feature rankings change from run to run.
RANDOM_SEED = 42

for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now(),'Fold',fold,'Loading...')
    train_patients,valid_patients = load_patient_names(fold)  
    X_train,y_train,z_train,X_valid,y_valid,z_valid = \
        load_train_valid_set(train_patients,valid_patients)    

    # This shuffle is pro forma, not strictly necessary.
    # CNN models are sensitive to train set order but RF models are not. 
    # z_train is not shuffled along with X,y; it is not used below.
    print(datetime.datetime.now(),'Shuffle...')
    X_train,y_train = shuffle(X_train,y_train,random_state=RANDOM_SEED)

    print(datetime.datetime.now(),'Train...')
    # min_samples_leaf=1 (default) led to overfitting
    # n_jobs=-1 builds trees on all cores.
    rfc = RandomForestClassifier(min_samples_leaf=4,
                                 n_jobs=-1,
                                 random_state=RANDOM_SEED)
    rfc.fit(X_train,y_train)  # slow
    
    print(datetime.datetime.now(),'Ranked feature imporances...')
    top = important_features(rfc)
    #pd.set_option('display.max_rows', None)
    print(top.loc[:10])  # .loc is label-inclusive: rows 0 through 10
    
    # Accuracy on the training patches, to compare against validation.
    print(datetime.datetime.now(),'Evaluate...')
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Training Accuracy:',accuracy)
    
    print(datetime.datetime.now(),'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Patch-level Validation Accuracy:',accuracy)
    patch_accuracies.append(accuracy)
    
    # Patient-level: aggregate the patch predictions of each patient.
    accuracy = aggregate_accuracy(y_pred,z_valid)
    patient_accuracies.append(accuracy)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy)
    
    # This shouldn't be necessary but it seems to reduce memory footprint.
    X_train=None
    X_valid=None
    y_train=None
    y_valid=None
    z_train=None
    z_valid=None
    rfc = None
    y_pred = None
    matches = None


2022-07-20 08:08:03.095654 Fold 0 Loading...
Loaded lengths X,y,z: (30017, 5302) 30017 30017
Loaded lengths X,y,z: (7893, 5302) 7893 7893
2022-07-20 08:09:12.117396 Shuffle...
2022-07-20 08:09:12.973032 Train...
2022-07-20 08:11:57.841652 Ranked feature imporances...
           0                                                  1
0   0.009161  Median_Nucleus_Texture_InfoMeas1_Hematoxylin_5...
1   0.008848  Mean_Nucleus_Texture_InfoMeas1_Hematoxylin_5_0...
2   0.008185                Median_Nucleus_AreaShape_MeanRadius
3   0.007733  Mean_Nucleus_Texture_InfoMeas1_Hematoxylin_7_0...
4   0.007084               Mean_Nucleus_AreaShape_MaximumRadius
5   0.006625      Median_Nucleus_Texture_Entropy_Eosin_3_02_256
6   0.005874  Median_Nucleus_Texture_InfoMeas1_Hematoxylin_4...
7   0.005800                Mean_Nucleus_AreaShape_MedianRadius
8   0.005796          Median_Nucleus_AreaShape_MinFeretDiameter
9   0.005662  Median_Nucleus_Texture_InfoMeas1_Hematoxylin_3...
10  0.005609             Me

In [14]:
# Report the per-fold accuracies with their mean and standard deviation,
# first at patch level and then at patient level.
print(datetime.datetime.now())
for title, scores in (
    ('Cross validation patch-level accuracy:', patch_accuracies),
    ('Cross validation patient-level accuracy:', patient_accuracies),
):
    print(title,scores)
    print('mean:',np.mean(scores),'std:',np.std(scores))


2022-07-20 08:27:08.596328
Cross validation patch-level accuracy: [50.006334726973265, 67.6103500761035, 41.40958113870078, 56.99075699075699, 59.2527886036823]
mean: 55.05396230724337 std: 8.839591295725352
Cross validation patient-level accuracy: [40.0, 48.0, 32.0, 48.0, 44.0]
mean: 42.4 std: 5.986651818838306
