# Random Forest
Predict CNN attention per nucleus.   
This version loads the data into pandas dataframes with read_csv() instead of numpy arrays with loadtxt().   

In [1]:
# Random forest hyperparameters, passed to RandomForestClassifier in the main loop.
MAX_DEPTH = 8
N_ESTIMATORS = 500
# Subsampling: when SAVE_MEM is true, the loaders keep only every
# CHOOSE_EVERY-th nucleus row per patient (train and validation alike).
CHOOSE_EVERY = 10
SAVE_MEM = True

In [2]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import csv
import numpy as np
import pandas as pd
import matplotlib as plt
import random
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# The model
from sklearn.ensemble import RandomForestClassifier

2022-08-11 15:57:56.810178
Python 3.9.6
sklearn 1.1.1


In [3]:
# Patch data.
# This is the nucleus-level csv file: one row per nucleus from CellProfiler.
FILENAME='Process100_Nucleus.csv'
# This directory contains one Nucleus.csv file per patient.
# NOTE: BASE_PATH is assigned twice; the later (Windows) assignment wins.
# Comment out the line that does not match the machine you are running on.
BASE_PATH='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'  # Alien
BASE_PATH='D:\\Adjeroh\\Glioma\\August_Run\\CellProfilerPerPatient\\'   # Windows

# Patient data.
# Each patch filename indicates patient/case [0:19] and WSI [0:23].
# For example: TCGA-06-0129-01Z-00-DX1_5400_5100.png
# NOTE(review): LEN_PATIENT_ID is not referenced by the code visible in this notebook.
LEN_PATIENT_ID = 19
# NOTE: as with BASE_PATH, the later (Windows) assignment of FOLDS_META_FILE wins.
FOLDS_META_FILE = '/home/jrm/Adjeroh/Glioma/August_Run/TrainValidSplits/aug_train_valid_splits.csv'  # Alien
FOLDS_META_FILE = 'D:\\Adjeroh\\Glioma\\August_Run\\TrainValidSplits\\aug_train_valid_splits.csv'  # Windows
# Cross validation
# The main loop iterates fold numbers 0 .. NUM_FOLDS-1.
NUM_FOLDS = 5

# These bin thresholds yield nucleus temperature bins of about equal weight.
# See AT_PerPatient 01 notebook.
# BIN_MAX is passed to np.digitize(..., right=True), so bin i covers
# BIN_MAX[i-1] < temp <= BIN_MAX[i]; six edges give the five valid bins 1..5.
BIN_MAX = [-0.99,0.40,0.65,0.75,0.85,1.00]
# NOTE(review): BIN_LABEL is not referenced by the code visible in this notebook.
BIN_LABEL = [1,2,3,4,5]

In [4]:
# Sanity check of the binning on a few hand-picked temperatures.
# Expect all temperatures to fall into bins 1 thru 5.
# Bins 0 and 6 should never happen.
# The last sample (1.10) is above BIN_MAX[-1] and therefore lands in bin 6;
# presumably it is included on purpose to show what an out-of-range value looks like.
every_temp = [0.35,0.50,0.60,0.70,0.90,1.10]
binned = np.digitize(every_temp,BIN_MAX,right=True)
binned

array([1, 2, 2, 3, 5, 6], dtype=int64)

## Data loading functions

In [5]:
# Train/validation split table, loaded once and shared by the loader functions.
# get_metadata_one_fold() reads its 'Fold', 'Split', 'Case' and 'Class' columns.
METADATA = pd.read_csv(FOLDS_META_FILE)

In [6]:
def get_patient_attention(fold,is_train,patient):
    """Read one patient's multiclass attention file for the given fold.

    Args:
        fold: fold number embedded in the attention file name.
        is_train: accepted for signature symmetry with the other loaders;
            it is not used here.
        patient: patient directory name under BASE_PATH.

    Returns:
        The attention csv as a pandas DataFrame, unmodified.
    """
    variant = 'mcls'
    attention_name = f"Attention_{variant}_fold{fold}.csv"
    return pd.read_csv(BASE_PATH + patient + '/' + attention_name)

In [7]:
def get_metadata_one_fold(fold,is_train):
    """List the patients and class labels of one split of one fold.

    Args:
        fold: value to match against the 'Fold' column of METADATA.
        is_train: True selects rows whose 'Split' is 'Train',
            False selects rows whose 'Split' is 'Valid'.

    Returns:
        (patients, labels): two parallel lists taken from the 'Case'
        and 'Class' columns of the selected rows.
    """
    wanted_split = 'Train' if is_train else 'Valid'
    selected = METADATA.loc[
        (METADATA['Fold']==fold) & (METADATA['Split']==wanted_split)
    ]
    return selected['Case'].tolist(), selected['Class'].tolist()

In [8]:
def drop_columns(df):
    """Return a copy of df without the columns excluded from the feature set.

    Two groups are removed: a fixed list of columns that must all be
    present (pandas raises KeyError otherwise), and every column whose
    name starts with one of a few prefixes. The input frame is not modified.
    """
    required_drops = ['ImageNumber','ObjectNumber','Number_Object_Number',
                      'AreaShape_Orientation','Children_Cells_Count']
    # str.startswith accepts a tuple and matches any of the prefixes.
    dropped_prefixes = ('Location_',
                        'AreaShape_BoundingBoxM',
                        'AreaShape_Center',
                        'Neighbors_')
    trimmed = df.drop(columns=required_drops)
    prefixed = [name for name in trimmed.columns
                if name.startswith(dropped_prefixes)]
    return trimmed.drop(columns=prefixed)

In [9]:
def load_one_patient_attention(fold,multiclass,patient):
    """Load one patient's nucleus features and their attention-temperature bins.

    Reads BASE_PATH/<patient>/FILENAME (one row per nucleus) and the matching
    attention file Attention_<bcls|mcls>_fold<fold>.csv. Each nucleus's
    'AvgTemp' is looked up by its (ImageNumber, ObjectNumber) pair and the
    temperatures are binned with np.digitize against BIN_MAX (right=True).

    Args:
        fold: fold number embedded in the attention file name.
        multiclass: truthy selects the 'mcls' attention file, else 'bcls'.
        patient: patient directory name under BASE_PATH.

    Returns:
        (X, Y): X is the feature dataframe exactly as read from disk;
        Y is a list with one bin number per row of X, in row order.

    Raises:
        KeyError: if a nucleus in the feature file has no attention row.
        ValueError: if any temperature falls outside bins 1..len(BIN_MAX)-1
            (the offending row is printed first).
    """
    patient_dir = BASE_PATH+patient+'/'
    X = pd.read_csv(patient_dir+FILENAME)
    cls = 'mcls' if multiclass else 'bcls'
    att_df = pd.read_csv(patient_dir+f"Attention_{cls}_fold{fold}.csv")

    # Build the (image, object) -> temperature lookup by zipping columns.
    # This avoids DataFrame.iterrows(), which creates a Series per row and
    # dominates load time on large nucleus tables. As before, a duplicated
    # key keeps the last temperature seen.
    thermometer = dict(zip(
        zip(att_df['ImageNumber'],att_df['ObjectNumber']),
        att_df['AvgTemp']))
    every_temp = np.array(
        [thermometer[key]
         for key in zip(X['ImageNumber'],X['ObjectNumber'])],
        dtype=float)

    binned = np.digitize(every_temp,BIN_MAX,right=True)
    # Valid bins are 1..len(BIN_MAX)-1; 0 and len(BIN_MAX) mean the value
    # fell below the first edge or above the last one.
    top_bin = len(BIN_MAX)-1
    bad = np.flatnonzero((binned<1) | (binned>top_bin))
    if bad.size:
        i = int(bad[0])
        print(i,binned[i],every_temp[i],patient)
        raise ValueError('Bad bin!')
    Y = list(binned)
    return X,Y

In [10]:
def load_all_patients_data(fold,patients,save_mem=False):
    """Load and stack nucleus features and attention bins for many patients.

    Args:
        fold: fold number, forwarded to load_one_patient_attention.
        patients: iterable of patient directory names.
        save_mem: when true, keep only every CHOOSE_EVERY-th nucleus row
            of each patient.

    Returns:
        (X, y, z):
        X - dataframe of nucleus features after drop_columns(), one row
            per retained nucleus (original per-file row indices are kept);
        y - numpy array of attention bin numbers, one per row of X;
        z - numpy array of patient names, one per row of X.

    Raises:
        ValueError: if `patients` is empty.
    """
    is_multiclass = True
    frames = []   # one dataframe per patient, concatenated once at the end
    labels = []   # attention bin for each retained row
    owners = []   # patient name for each retained row
    for patient in patients:
        print('.',end='')  # progress indicator
        Xall,yall = load_one_patient_attention(fold,is_multiclass,patient)
        if save_mem:
            # Copy the strided selection so the full per-patient frame can
            # be garbage collected instead of being kept alive by a view.
            Xi = Xall.iloc[0::CHOOSE_EVERY].copy()
            yi = yall[0::CHOOSE_EVERY]
        else:
            Xi = Xall
            yi = yall
        frames.append(Xi)
        labels.extend(yi)
        owners.extend([patient]*len(Xi))  # same ID for all rows of one patient
    print()
    if not frames:
        raise ValueError('No patients to load')
    # A single concat is linear in the total row count; concatenating inside
    # the loop re-copied everything accumulated so far on every iteration.
    X = drop_columns(pd.concat(frames))
    y = np.asarray(labels)
    z = np.asarray(owners)
    return X,y,z   # nucleus data, nucleus attention bin, patient name

In [11]:
# This is slow, probably due to concatenating dataframes.
# Consider a rewrite that concatenates csv before constructing a dataframe.
def load_train_valid_set(fold):
    """Load the training and validation sets of one fold.

    Both splits are loaded with the module-level SAVE_MEM setting, so the
    validation set is subsampled too when SAVE_MEM is on.

    Returns:
        (X_train, y_train, z_train, X_valid, y_valid, z_valid)
    """
    loaded = []
    for banner,is_train in (('Train lengths X,y,z:',True),
                            ('Valid lengths X,y,z:',False)):
        patients,_ = get_metadata_one_fold(fold,is_train)
        X_part,y_part,z_part = load_all_patients_data(fold,patients,SAVE_MEM)
        print(banner,X_part.shape,len(y_part),len(z_part))
        loaded.extend((X_part,y_part,z_part))
    return tuple(loaded)

## Aggregation functions

In [12]:
# Patient-level score from per-row predictions: rows vote per patient.
# Later, weight each row's vote by the confidence i.e. score
def aggregate_accuracy(y_pred,y_valid,z_valid):
    """Percentage of patients whose rows are mostly predicted correctly.

    A patient counts as a success only when strictly more of its rows have
    y_pred equal to y_valid than not; a tie counts as a failure.

    Args:
        y_pred: predicted label per row.
        y_valid: true label per row.
        z_valid: patient identifier per row.

    Returns:
        Accuracy in percent as a float; 0.0 when there are no patients.

    Raises:
        Exception: if the three inputs differ in length.
    """
    total = len(y_pred)
    if not (total == len(y_valid) and total == len(z_valid)):
        raise Exception('Lengths do not match')
    # tally[patient] = [correct rows, incorrect rows]
    tally = {name: [0, 0] for name in np.unique(z_valid)}
    for idx in range(total):
        slot = 0 if y_pred[idx] == y_valid[idx] else 1
        tally[z_valid[idx]][slot] += 1
    if not tally:
        return float(0)
    winners = sum(1 for hits, misses in tally.values() if hits > misses)
    return 100.0*winners/len(tally)

## Main loop: Load, Classify, Report

In [13]:
def important_features(model):
    """Rank a fitted model's features by importance, highest first.

    Args:
        model: a fitted estimator exposing `feature_names_in_` and
            `feature_importances_` (prerequisite: fit() on data that
            carried feature names).

    Returns:
        DataFrame with one row per feature, sorted by descending
        importance; column 0 is the importance, column 1 the feature name.
    """
    names = np.asarray(model.feature_names_in_)
    importances = np.asarray(model.feature_importances_,dtype=float)
    # Sort on the numeric importances directly. Stacking names and
    # importances into one array first would turn the importances into
    # strings whenever the names are a str-dtype array, and the ranking
    # would then be lexicographic. The stable sort keeps tied features
    # in their original order.
    order = np.argsort(-importances,kind='stable')
    ranked = list(zip(importances[order].tolist(),names[order].tolist()))
    top_df = pd.DataFrame(ranked)
    return top_df

In [None]:
# Per-fold validation results, summarized after the loop.
# patch_accuracies holds the nucleus-level (per-row) validation accuracy;
# patient_accuracies holds the per-patient aggregate from aggregate_accuracy().
patch_accuracies = []   # summary statistics
patient_accuracies = [] # summary statistics

# One iteration per cross-validation fold: load, fit, report, free memory.
for fold in range(NUM_FOLDS):
    print()
    print(datetime.datetime.now(),'Fold',fold,'Loading...')
    X_train,y_train,z_train,X_valid,y_valid,z_valid = load_train_valid_set(fold)    

    # The shuffle is not strictly necessary with RF.
    # CNN models are sensitive to train set order but RF models are not. 
    # print(datetime.datetime.now(),'Shuffle...')
    # X_train,y_train = sklearn.shuffle(X_train,y_train)

    print(datetime.datetime.now(),'Train...')
    # min_samples_leaf=1 (default) led to overfitting
    # NOTE(review): min_samples_leaf is not passed below, so the default is
    # still in effect; presumably max_depth is what limits overfitting now.
    rfc = RandomForestClassifier(max_depth=MAX_DEPTH,n_estimators=N_ESTIMATORS)
    rfc.fit(X_train,y_train)  # slow
    
    print(datetime.datetime.now(),'Ranked feature imporances...')
    top = important_features(rfc)
    #pd.set_option('display.max_rows', None)
    # .loc slicing is label-based and inclusive, so this shows rows 0..10.
    print(top.loc[:10])
    
    # Accuracy on the training rows themselves (a check on how closely the model fits).
    print(datetime.datetime.now(),'Evaluate...')
    y_pred = rfc.predict(X_train)
    matches = np.count_nonzero(y_train==y_pred)
    accuracy_t = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Nucleus-level Training Accuracy:',accuracy_t)
    
    # Accuracy on the held-out validation rows of this fold.
    print(datetime.datetime.now(),'Validate...')
    y_pred = rfc.predict(X_valid)
    matches = np.count_nonzero(y_valid==y_pred)
    accuracy_v = 100.0 * matches / len(y_pred)
    print('Fold',fold,'Nucleus-level Validation Accuracy:',accuracy_v)
    patch_accuracies.append(accuracy_v)
    
    # Per-patient aggregate of the validation predictions.
    accuracy_p = aggregate_accuracy(y_pred,y_valid,z_valid)
    patient_accuracies.append(accuracy_p)
    print('Fold',fold,'Patient-level Validation Accuracy:',accuracy_p)
    
    # This shouldn't be necessary but it seems to reduce memory footprint.
    # Dropping the references before the next fold loads lets the old
    # arrays be reclaimed instead of coexisting with the new ones.
    X_train=None
    X_valid=None
    y_train=None
    y_valid=None
    z_train=None
    z_valid=None
    y_pred = None
    matches = None


2022-08-11 15:57:58.471513 Fold 0 Loading...
..................................................................................

In [None]:
# Final report: per-fold accuracies plus their mean and spread.
print(datetime.datetime.now())
if SAVE_MEM:
    print('Sampling every',CHOOSE_EVERY,'th record.')
# ddof=1 yields the sample standard deviation (divisor N-1), so the label
# says 'sample std'; the previous 'pop std' label did not match the value.
print('Cross validation nucleus-level accuracy:',patch_accuracies)
print('mean:',np.mean(patch_accuracies),'sample std:',np.std(patch_accuracies,ddof=1))
print('Cross validation patient-level accuracy:',patient_accuracies)
print('mean:',np.mean(patient_accuracies),'sample std:',np.std(patient_accuracies,ddof=1))
