# Random Forest
Train on 6-class cancer classification.  
Input is the nucleus+RBC rollup, training set only.  
Train on the full training set (no validation).  
Rank feature importance.

In [1]:
# Environment report: log when the run started and which interpreter and
# library versions produced the outputs recorded in this notebook.
import datetime
# Timestamp is printed before the heavier imports below, so it marks the
# start of the run rather than the end of import time.
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
# In the visible cells TensorFlow is used only for this device query.
import tensorflow as tf
# Bare last expression: the notebook shows the list of visible GPUs
# (the recorded output is an empty list).
tf.config.list_physical_devices('GPU')

2022-06-09 14:28:21.921367
Python 3.8.10
sklearn 1.0.2


[]

In [2]:
import os

import joblib # used to dump/load sklearn models
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
# Root of the CellProfiler output tree; the per-class data directory is
# BASE_DIR + 'Output<k>/' for class k.
BASE_DIR = '/home/jrm/Adjeroh/Naved/CP_80K/'

# Class labels to load: all six classes, 0 through 5.
# (For a quick smoke test, temporarily narrow this to one class, e.g. [5].)
CLASSES = range(6)

# Path that the fitted model is written to with joblib.dump.
MODELS_DIR = BASE_DIR + 'models/RandomForest.17'

In [None]:
print(datetime.datetime.now())

# Module-level prefix read by get_prefix(); this is only the initial value
# and is changed at runtime through set_prefix().
PREFIX = 'abc_'


def set_prefix(prefix):
    """Replace the module-level PREFIX with `prefix`."""
    global PREFIX
    PREFIX = prefix


def get_prefix(col_name):
    """Return `col_name` with the current module-level PREFIX prepended.

    Takes a single label and returns a single label, so it can be passed
    as the mapper to DataFrame.rename(..., axis='columns').
    """
    prefixed_name = PREFIX + col_name
    return prefixed_name
def load_all_classes(classes=None, base_dir=None):
    """Load the nucleus and RBC rollups of every class into one training set.

    For each class label ``i`` the data directory is
    ``base_dir + 'Output' + str(i) + '/'``. A ``CP_Util`` is built on that
    directory, ``train_test_split()`` is called on it, and its nucleus and RBC
    rollups are outer-joined on their index. Nucleus columns get the prefix
    ``'Nuc_'`` and RBC columns the prefix ``'Rbc_'`` so the two feature sets
    cannot collide. A timestamp and the class label are printed per class.

    Parameters
    ----------
    classes : iterable of int, optional
        Class labels to load. Defaults to the notebook-level ``CLASSES``.
    base_dir : str, optional
        Directory prefix (with trailing slash) that contains the
        ``Output<i>/`` folders. Defaults to the notebook-level ``BASE_DIR``.

    Returns
    -------
    X : pandas.DataFrame
        Rows of all classes stacked in class order, with the original row
        index of each class preserved and every missing value replaced by 0.
    y : numpy.ndarray
        Float array with one label per row of ``X`` (e.g. ``3.0`` for class 3).

    Raises
    ------
    ValueError
        If ``classes`` is empty, since there would be nothing to train on.
    """
    if classes is None:
        classes = CLASSES
    if base_dir is None:
        base_dir = BASE_DIR

    class_ids = list(classes)
    if not class_ids:
        raise ValueError("load_all_classes: no classes were given to load")

    frames = []
    labels = []
    for i in class_ids:
        print(datetime.datetime.now())
        print("Process class",i)
        full_path = base_dir+'Output'+str(i)+'/'
        cp = CP_Util(full_path)
        # NOTE(review): presumably this restricts the rollups below to the
        # training partition -- confirm against CP_Util.
        cp.train_test_split()
        # add_prefix returns a renamed copy, so no shared global prefix state
        # is needed and the frames held by `cp` are left untouched.
        Xnuc = cp.get_nucleus_rollup().add_prefix('Nuc_')
        Xrbc = cp.get_RBC_rollup().add_prefix('Rbc_')
        # Outer join keeps rows that appear in only one of the two rollups;
        # the resulting gaps are filled with 0 after concatenation.
        Xi = Xnuc.join(Xrbc,how='outer')
        frames.append(Xi)
        # Labels stay floating point, matching np.ones(size) * i.
        labels.append(np.full(len(Xi), i, dtype=float))

    # Concatenate once instead of growing the frame inside the loop.
    X = pd.concat(frames)
    y = np.concatenate(labels)
    X = X.fillna(0)
    return X,y
# Build the combined feature table and label vector; with no arguments the
# loader iterates over the notebook-level CLASSES.
Xtrain,ytrain=load_all_classes()
# Bare last expression: the notebook renders Xtrain as this cell's output.
Xtrain

2022-06-09 14:28:24.179475
2022-06-09 14:28:24.179668
Process class 0
Num tumors in test/train sets: 23 94
Num patches in test/train sets: 8803 36163
2022-06-09 14:29:02.222411
Process class 1
Num tumors in test/train sets: 7 26
Num patches in test/train sets: 2734 10168
2022-06-09 14:29:15.207578
Process class 2
Num tumors in test/train sets: 7 30
Num patches in test/train sets: 2769 11181
2022-06-09 14:29:34.795910
Process class 3
Num tumors in test/train sets: 3 14
Num patches in test/train sets: 1091 5273
2022-06-09 14:29:45.382116
Process class 4
Num tumors in test/train sets: 2 6
Num patches in test/train sets: 813 2384
2022-06-09 14:29:53.088548
Process class 5
Num tumors in test/train sets: 1 3
Num patches in test/train sets: 396 1195


In [None]:
print(datetime.datetime.now())
print("Shuffle...")
# Fixed seed so that Restart Kernel -> Run All reproduces the same row order.
# Rows and labels are permuted together, so they stay aligned.
# Note: re-running only this cell shuffles the already-shuffled data again.
SHUFFLE_SEED = 42
Xtrain,ytrain=shuffle(Xtrain,ytrain,random_state=SHUFFLE_SEED)

In [None]:
# Fit on the complete training set (no validation split is held out here),
# then rank the features. Timestamps bracket the fit so its duration can be
# read off the cell output.
print(datetime.datetime.now())
print('Train on full training set...')
# RF_Util is the project-local wrapper imported from RandomForestUtil.
# NOTE(review): its hyperparameters and random seed are not visible in this
# notebook -- check RandomForestUtil if reproducibility matters.
rf3 = RF_Util()
rf3.set_train(Xtrain,ytrain)
rf3.fit()
print(datetime.datetime.now())
print('...and rank the features by importance.')
# `top` is sliced with .loc[:100] in the next cell, and `rf3` supplies the
# model saved in the last cell.
top = rf3.important_features()

In [None]:
# Lift pandas' row-display cap so the ranking below is rendered in full.
# This changes the option for the rest of the kernel session.
pd.options.display.max_rows = None
# Label-based slice: assuming `top` is a pandas object, .loc[:100] keeps
# every row up to and including index label 100.
top.loc[:100]

In [None]:
print(datetime.datetime.now())
# Despite its name, MODELS_DIR is the output *file* path passed to joblib.dump.
# Create its parent directory first so a missing folder cannot throw away
# the trained model after the full fit has already run.
model_parent_dir = os.path.dirname(MODELS_DIR)
if model_parent_dir:
    os.makedirs(model_parent_dir, exist_ok=True)
# Last expression: joblib.dump returns the list of files it wrote, which the
# notebook shows as the cell output.
joblib.dump(rf3.get_model(),MODELS_DIR)