# Random Forest
Per nucleus.   
Previous notebook used all fields: accuracy 57%.   
Here, try removing additional fields: 

In [1]:
MAX_DEPTH = 8
N_ESTIMATORS = 500
CHOOSE_EVERY = 10
SAVE_MEM = True

In [2]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# The model
from sklearn.ensemble import RandomForestClassifier

2022-08-10 08:20:23.747478
Python 3.8.10
sklearn 1.0.2


In [3]:
# Patch data.
# This is the nucleus-level csv file: one row per nucleus from CellProfiler.
FILENAME='Process100_Nucleus.csv'
# This directory contains one Nucleus.csv file per patient.
BASE_PATH='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'  # alien

# Patient data.
# Each patch filename indicates patient/case [0:19] and WSI [0:23].
# For example: TCGA-06-0129-01Z-00-DX1_5400_5100.png
LEN_PATIENT_ID = 19
FOLDS_META_FILE = '/home/jrm/Adjeroh/Glioma/August_Run/TrainValidSplits/aug_train_valid_splits.csv'

# Cross validation
NUM_FOLDS = 5

## Data loading functions

In [4]:
METADATA = pd.read_csv(FOLDS_META_FILE)
METADATA

Unnamed: 0,Fold,Split,Case,Class
0,0,Train,TCGA-02-0004-01Z-00,0
1,0,Train,TCGA-02-0010-01Z-00,0
2,0,Train,TCGA-14-0789-01Z-00,0
3,0,Train,TCGA-02-0025-01Z-00,0
4,0,Train,TCGA-02-0033-01Z-00,0
...,...,...,...,...
630,4,Valid,TCGA-S9-A7J2-01Z-00,5
631,4,Valid,TCGA-02-0430-01Z-00,0
632,4,Valid,TCGA-28-1746-01Z-00,0
633,4,Valid,TCGA-HT-7676-01Z-00,1


In [5]:
def get_metadata_one_fold(fold,is_train):
    df = METADATA.loc[METADATA['Fold']==fold]
    if is_train:
        df = df.loc[df['Split']=='Train']
    else:
        df = df.loc[df['Split']=='Valid']
    patients = df['Case'].tolist()
    labels = df['Class'].tolist()
    return patients,labels

In [6]:
# Given a patient name, load the Nucleus.csv file.
# Expect a string that matches a directory name, like 'TCGA-S9-A6UB-01Z-00'
# Returns a numpy array.
def load_one_patient_data(filepath):
    # Reading csv file, skip row 1 = column headers
    ary = np.loadtxt(filepath,skiprows=1,delimiter=',')
    return ary

In [7]:
def drop_columns(df):
    bad_cols = ['ImageNumber','ObjectNumber','Number_Object_Number',
                'AreaShape_Orientation','Children_Cells_Count']
    df = df.drop(columns=bad_cols)
    bad_cols = [c for c in df.columns 
                if c.startswith('Location_') 
                or c.startswith('AreaShape_BoundingBoxM')
                or c.startswith('AreaShape_Center')
                or c.startswith('Neighbors_')]
                #or c.startswith('RadialDistribution_')
    df = df.drop(columns=bad_cols)
    return df

In [8]:
def load_all_patients_data(patients,labels,save_mem=False):
    X = None  # dataframe with 5000 feature columns, one row per patch
    y = None  # list of labels = cancer class for each row
    z = None  # list of patient name for each row
    count=0
    for patient in patients:
        label = labels[count]
        count += 1
        directory = patient+'/'
        filepath=BASE_PATH+directory+FILENAME
        Xall = load_one_patient_data(filepath)
        if (save_mem):
            Xi = Xall[0::CHOOSE_EVERY].copy()  # e.g. choose every tenth patch
        else:
            Xi = Xall
        yi = np.full(shape=len(Xi), fill_value=label, dtype=np.int8)
        zi = [patient]*len(Xi)  # same ID for all patches from one patient
        if X is None:
            X = Xi
            y = yi
            z = zi
        else:
            X = np.concatenate( (X, Xi) )
            y = np.concatenate( (y, yi) )
            z = np.concatenate( (z, zi) )
    # X combines all nuclei of all WSI for all patients requested.
    # For debugging, we'll add column headers to the dataframe.
    X = pd.DataFrame(X)
    directory = patients[0] + '/'   # first patient is as good as any
    filepath=BASE_PATH+directory+FILENAME
    with open(filepath) as infile:
        rows = csv.reader(infile)
        for row in rows:
            header = row    # first row has column headers
            break
    X.columns = header
    X = drop_columns(X)
    return X,y,z   # patch data, patch labels, patch patient names

In [9]:
# This is slow, probably due to concatenating dataframes.
# Consider a rewrite that concatenates csv before constructing a dataframe.
def load_train_valid_set(fold):
    patients,labels = get_metadata_one_fold(fold,True) # True=train
    X_train,y_train,z_train = load_all_patients_data(patients,labels,SAVE_MEM) 
    print('Train lengths X,y,z:',X_train.shape,len(y_train),len(z_train))
    patients,labels = get_metadata_one_fold(fold,False) # False=valid
    X_valid,y_valid,z_valid = load_all_patients_data(patients,labels,SAVE_MEM) # reduces validation size
    print('Valid lengths X,y,z:',X_valid.shape,len(y_valid),len(z_valid))
    return X_train,y_train,z_train,X_valid,y_valid,z_valid

## Aggregation functions

In [10]:
# Each patch votes on the class for its WSI.
# Later, weight each patch label by the confidence i.e. score
def aggregate_accuracy(y_pred,y_valid,z_valid):
    L = len(y_pred)
    if L != len(y_valid) or L != len(z_valid):
        raise Exception('Lengths do not match')
    correct = {}
    incorrect = {}
    patients = np.unique(z_valid)
    for patient in patients:
        correct[patient]=0  # accumulate correct votes on patches
        incorrect[patient]=0  # accumulate incorrect votes
    for i in range(L):
        patient = z_valid[i]
        label =   y_valid[i]
        pred =    y_pred[i]
        if pred == label:
            correct[patient] += 1
        else:
            incorrect[patient] += 1
    numerator = 0
    denominator = 0
    for patient in patients:
        denominator += 1
        if correct[patient]>incorrect[patient]:
            numerator += 1
    accuracy = float(0)
    if denominator>0:
        accuracy = 100.0*numerator/denominator 
    return accuracy

## Main loop: Load, Classify, Report

In [11]:
def important_features(model):
    # Prereqs: fit().
    names = model.feature_names_in_
    importances = model.feature_importances_
    pairs = np.column_stack( (names,importances) )
    top_array = sorted(pairs, key = lambda e:e[1], reverse=True)
    # There must be a way to do this witout a loop!
    top_list = []
    for i in top_array:
         top_list.append((i[1],i[0]))  # 0=feature_name, 1=importance
    top_df = pd.DataFrame(top_list)
    return top_df

In [12]:
patch_accuracies = []   # summary statistics
patient_accuracies = [] # summary statistics

for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now(),'Fold',fold,'Loading...')
    X_train,y_train,z_train,X_valid,y_valid,z_valid = load_train_valid_set(fold)    

    # The shuffle is not strictly necessary with RF.
    # CNN models are sensitive to train set order but RF models are not. 
    # print(datetime.datetime.now(),'Shuffle...')
    # X_train,y_train = sklearn.shuffle(X_train,y_train)

    print(datetime.datetime.now(),'Train...')
    # min_samples_leaf=1 (default) led to overfitting
    rfc = RandomForestClassifier(max_depth=MAX_DEPTH,n_estimators=N_ESTIMATORS)
    rfc.fit(X_train,y_train)  # slow
    
    print(datetime.datetime.now(),'Ranked feature imporances...')
    top = important_features(rfc)
    #pd.set_option('display.max_rows', None)
    print(top.loc[:10])
    
    print(datetime.datetime.now(),'Evaluate...')
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy_t = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Nucleus-level Training Accuracy:',accuracy_t)
    
    print(datetime.datetime.now(),'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy_v = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Nucleus-level Validation Accuracy:',accuracy_v)
    patch_accuracies.append(accuracy_v)
    
    accuracy_p = aggregate_accuracy(y_pred,y_valid,z_valid)
    patient_accuracies.append(accuracy_p)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy_p)
    
    # This shouldn't be necessary but it seems to reduce memory footprint.
    X_train=None
    X_valid=None
    y_train=None
    y_valid=None
    z_train=None
    z_valid=None
    y_pred = None
    matches = None


2022-08-10 08:20:24.499139 Fold 0 Loading...
Train lengths X,y,z: (148324, 623) 148324 148324
Valid lengths X,y,z: (36559, 623) 36559 36559
2022-08-10 08:23:51.171053 Train...
2022-08-10 08:35:21.101060 Ranked feature imporances...
           0                                       1
0   0.028924  Texture_InfoMeas1_Hematoxylin_5_02_256
1   0.019438  Texture_InfoMeas1_Hematoxylin_4_00_256
2   0.019299               Granularity_2_Hematoxylin
3   0.019006  Texture_InfoMeas1_Hematoxylin_4_01_256
4   0.018793  Texture_InfoMeas1_Hematoxylin_4_03_256
5   0.017927  Texture_InfoMeas1_Hematoxylin_3_03_256
6   0.016865  Texture_InfoMeas2_Hematoxylin_5_01_256
7   0.015558  Texture_InfoMeas1_Hematoxylin_3_01_256
8   0.014642  Texture_InfoMeas1_Hematoxylin_4_02_256
9   0.014638  Texture_InfoMeas2_Hematoxylin_5_02_256
10  0.014254  Texture_InfoMeas1_Hematoxylin_7_02_256
2022-08-10 08:35:21.130099 Evaluate...
Fold 0 Nucleus-level Training Accuracy: 61.73714301124565
2022-08-10 08:35:30.215783 Validat

In [14]:
print(datetime.datetime.now())
if SAVE_MEM:
    print('Sampling every',CHOOSE_EVERY,'th record.')
print('Cross validation nucleus-level accuracy:',patch_accuracies)
print('mean:',np.mean(patch_accuracies),'std:',np.std(patch_accuracies,ddof=1))
print('Cross validation patient-level accuracy:',patient_accuracies)
print('mean:',np.mean(patient_accuracies),'std:',np.std(patient_accuracies,ddof=1))


2022-08-10 09:36:09.788008
Sampling every 10 th record.
Cross validation nucleus-level accuracy: [60.80308542356191, 56.612346049864826, 56.500748186001864, 58.514794512250525, 53.699947411914955]
mean: 57.22618431671881 std: 2.6359125303473285
Cross validation patient-level accuracy: [38.46153846153846, 34.61538461538461, 40.0, 36.0, 36.0]
mean: 37.01538461538461 std: 2.169176213171024


In [15]:
import numpy as np
print('sample std',np.std(patch_accuracies))
print('population std',np.std(patch_accuracies,ddof=1))

sample std 2.3576318402400416
population std 2.6359125303473285
