# Random Forest
Use numpy arrays for training.

In [1]:
from datetime import datetime
# Print the wall-clock time at the start of the run. str(datetime) is defined
# as isoformat(' '), so this prints the same text as print(datetime.now()).
print(datetime.now().isoformat(sep=' '))

2023-06-03 12:36:28.337179


In [2]:
import tensorflow as tf
# Float type for Keras; defined once and reused so the two cannot drift apart.
dt='float32'
tf.keras.backend.set_floatx(dt)
tf.random.set_seed(42) # sets TensorFlow's global seed; intended to make results reproducible

# Report whether TensorFlow can see the first GPU.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))

# Decide where the data lives. Only a missing google.colab package means
# "not running on CoLab"; any other failure (for example a failed drive mount)
# is allowed to surface instead of silently switching to the local paths.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    DATA_DIR="/Users/jasonmiller/WVU/BAM_ML/"
    MODEL_DIR="/Users/jasonmiller/WVU/BAM_ML/Models/"
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/IRP2/'  # must end in "/"
    MODEL_DIR=PATH+'My Drive/data/IRP2/Models/'  # must end in "/"
print(DATA_DIR)
# NOTE(review): not read by any cell visible in this notebook section.
SAVE_MODEL_FILENAME = None

2023-06-03 12:36:28.402806: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU device not found
/Users/jasonmiller/WVU/BAM_ML/


2023-06-03 12:36:36.673140: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Record the interpreter and library versions alongside the results.
from platform import python_version
print('Python',python_version())
import numpy as np
# Seed numpy's global RNG. Per scikit-learn's documented random_state=None
# behaviour, estimators then draw from this global state, so the seed only
# gives repeatable results when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)
import time # sleep function
from os.path import isfile
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

from tensorflow import keras
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
# The classifier is referred to as RFC throughout the notebook.
from sklearn.ensemble import RandomForestClassifier as RFC

# NOTE(review): EPOCHS is not referenced by any cell visible here; presumably a
# leftover from a neural-network version of this notebook -- confirm before removing.
EPOCHS=150 

Python 3.10.0
sklearn 1.1.2


## Data Load

In [4]:
# Available input pairs, as (file 0, file 1) paths relative to DATA_DIR.
# Pick one with DATASET_CHOICE instead of overwriting the same constants.
DATASETS = {
    # Full dataset (may exceed RAM)
    'full': ('MxM_BR4/ml_stats.csv', 'SxS_BR4/ml_stats.csv'),
    # First million for testing
    'first_million': ('MxM_BR4/first_million_stats.csv', 'SxS_BR4/first_million_stats.csv'),
    # Tiny dataset for debugging
    'tiny': ('tiny_MxM/ml_stats.csv', 'tiny_SxS/ml_stats.csv'),
}
DATASET_CHOICE = 'tiny'
DATA_FILE_0, DATA_FILE_1 = DATASETS[DATASET_CHOICE]

print('Data file 0 %s'%DATA_FILE_0)
print('Data file 1 %s'%DATA_FILE_1)
TEST_PORTION=0.00   # set to 20% when using complete files (ml_stats)
VALID_PORTION=0.20  # fraction of the loaded examples held out for validation
MAX_LINES_TO_LOAD=1000000  # million
print('Maximum lines to load per file for training: %d'%MAX_LINES_TO_LOAD)

Data file 0 tiny_MxM/ml_stats.csv
Data file 1 tiny_SxS/ml_stats.csv
Maximum lines to load per file for training: 1000000


In [5]:
class DataLoader():
    '''Read two CSV files in lockstep and turn them into labeled examples.

    Rows from the first file are labeled 0 and rows from the second file are
    labeled 1, so the stored examples alternate 0,1,0,1,...
    Every row must be comma-separated, carry a 'P' or 'S' flag in columns
    0, 6, 12 and 18, and hold text that int() accepts in every other column.

    Parameters
    ----------
    filepath1, filepath2 : paths of the label-0 and label-1 files.
    verbose : when true, progress messages are printed.
    test_portion : fraction of the shorter file to leave unread at the end.
        None (the default) means use the module-level TEST_PORTION.
    valid_portion : fraction of the loaded examples returned as validation
        data by get_X_y. None (the default) means use the module-level
        VALID_PORTION.
    '''

    # Columns holding the 'P'/'S' flag; show_examples prints rows in groups of six.
    FLAG_COLUMNS = (0, 6, 12, 18)

    def __init__(self,filepath1,filepath2,verbose=True,
                 test_portion=None,valid_portion=None):
        self.files = [filepath1,filepath2]
        self.alignments=[]
        self.labels=[]
        self.is_primary={'P':1, 'S':0}
        self.verbose = verbose
        self.max_lines = None
        # Kept as given; None is resolved against the globals at use time so
        # that editing the notebook constants after construction still works.
        self.test_portion = test_portion
        self.valid_portion = valid_portion

    def set_max_lines(self,lines):
        '''Cap the number of lines read from each file.'''
        self.max_lines = lines
        if self.verbose:
            print('Maximum lines to load per file: %d'%lines)

    def _count_lines_(self):
        '''Return the line count of the shorter of the two files.'''
        counts = []
        for path in self.files:
            with open(path,'r') as handle:
                counts.append(sum(1 for _ in handle))
        if self.verbose:
            print('File0 size: %d %s'%(counts[0],self.files[0]))
            print('File1 size: %d %s'%(counts[1],self.files[1]))
        return min(counts)

    def _load_line_(self,row):
        '''Parse one CSV row into a list of ints and append it to alignments.

        Raises KeyError for a flag other than 'P' or 'S' and ValueError for a
        non-integer field.
        '''
        fields = row.strip().split(',')
        for column in self.FLAG_COLUMNS:
            fields[column] = self.is_primary[fields[column]]
        self.alignments.append([int(x) for x in fields])

    def load_full_train_set(self):
        '''Load the leading (1 - test portion) of each file, capped at max_lines.

        The same number of rows is taken from both files, based on the shorter
        one. Rows are appended to alignments and labels, so calling this twice
        on one loader accumulates examples.

        Raises Exception if the files cannot be counted, opened or parsed; the
        underlying error is printed and attached as the cause.
        '''
        try:
            minimum = self._count_lines_()
        except Exception as e:
            print(e)
            raise Exception('CANNOT COUNT LINES IN FILE!') from e
        test_portion = TEST_PORTION if self.test_portion is None else self.test_portion
        train_size = int(minimum - minimum * test_portion)
        if self.max_lines is not None:
            train_size = min(train_size,self.max_lines)
        if self.verbose:
            print('Trying to load %d lines per file...'%train_size)
        try:
            # The with-block closes both handles even when a row fails to parse.
            with open(self.files[0],'r') as handle0, open(self.files[1],'r') as handle1:
                # Associate label 0 with data from file 0. Same for 1.
                for _ in range(train_size):
                    self._load_line_(next(handle0))
                    self.labels.append(0)
                    self._load_line_(next(handle1))
                    self.labels.append(1)
        except Exception as e:
            print(e)
            raise Exception('CANNOT LOAD DATA FROM FILE!') from e

    def show_examples(self,head=6):
        '''Print up to head loaded examples, six fields per line.'''
        head = min(head,len(self.alignments))
        for i in range(head):
            print('From '+self.files[self.labels[i]])
            print('Primary,Score,Edit,Mismatch,GapOpen,GapExtend')
            print(self.alignments[i][0:6])
            print(self.alignments[i][6:12])
            print(self.alignments[i][12:18])
            print(self.alignments[i][18:24])

    def get_X_y(self):
        '''Split the loaded examples, in load order, into train and validation.

        Returns X_train, y_train, X_valid, y_valid as numpy arrays; the last
        validation portion of the examples becomes the validation set.
        '''
        valid_portion = VALID_PORTION if self.valid_portion is None else self.valid_portion
        loaded = len(self.alignments)
        divider = int(loaded - loaded * valid_portion)
        X_train = np.array(self.alignments[:divider])
        y_train = np.array(self.labels[:divider])
        X_valid = np.array(self.alignments[divider:])
        y_valid = np.array(self.labels[divider:])
        if self.verbose:
            print('Full train set size = '+str(len(self.alignments)))
            print('Training/Validation partition: %d/%d'%(len(y_train),len(y_valid)))
        return X_train,y_train, X_valid,y_valid

In [6]:
# Timestamps before and after loading show how long the read takes.
print(datetime.now())

# Build both absolute paths from the shared data directory.
filepath0, filepath1 = (DATA_DIR + relative for relative in (DATA_FILE_0, DATA_FILE_1))

loader = DataLoader(filepath0, filepath1)
loader.set_max_lines(MAX_LINES_TO_LOAD)
print('LOADING')
loader.load_full_train_set()
print(datetime.now())

# Print a few parsed rows as a sanity check.
loader.show_examples()

2023-06-03 12:36:38.396488
Maximum lines to load per file: 1000000
LOADING
File0 size: 4000 /Users/jasonmiller/WVU/BAM_ML/tiny_MxM/ml_stats.csv
File1 size: 4000 /Users/jasonmiller/WVU/BAM_ML/tiny_SxS/ml_stats.csv
Trying to load 4000 lines per file...
2023-06-03 12:36:38.457304
From /Users/jasonmiller/WVU/BAM_ML/tiny_MxM/ml_stats.csv
Primary,Score,Edit,Mismatch,GapOpen,GapExtend
[1, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
From /Users/jasonmiller/WVU/BAM_ML/tiny_SxS/ml_stats.csv
Primary,Score,Edit,Mismatch,GapOpen,GapExtend
[0, -25, 5, 5, 0, 0]
[0, -10, 2, 2, 0, 0]
[1, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0]
From /Users/jasonmiller/WVU/BAM_ML/tiny_MxM/ml_stats.csv
Primary,Score,Edit,Mismatch,GapOpen,GapExtend
[0, -5, 1, 1, 0, 0]
[0, -6, 2, 2, 0, 0]
[1, -5, 1, 1, 0, 0]
[1, -6, 2, 2, 0, 0]
From /Users/jasonmiller/WVU/BAM_ML/tiny_SxS/ml_stats.csv
Primary,Score,Edit,Mismatch,GapOpen,GapExtend
[0, -19, 4, 4, 0, 0]
[0, -24, 6, 6, 0, 0]
[1, 0, 0, 0, 0, 0]
[1, -4, 2, 2,

In [7]:
# Split the loaded examples into training and validation arrays.
X_train, y_train, X_valid, y_valid = loader.get_X_y()

# One caption line followed by one shape line for each array.
print('X train shape: ', np.shape(X_train), sep='\n')
print('y train shape: ', np.shape(y_train), sep='\n')
print('X valid shape: ', np.shape(X_valid), sep='\n')
print('y valid shape: ', np.shape(y_valid), sep='\n')

# Spot-check one training example together with its label.
print('X[5]=', X_train[5], sep='\n')
print('y[5]=', y_train[5], sep='\n')
#loader = None

Full train set size = 8000
Training/Validation partition: 6400/1600
X train shape: 
(6400, 24)
y train shape: 
(6400,)
X valid shape: 
(1600, 24)
y valid shape: 
(1600,)
X[5]=
[  0  -3   1   1   0   0   0 -11   6   6   0   0   1  -3   1   1   0   0
   1 -11   6   6   0   0]
y[5]=
1


## Model

In [8]:
def build_model(**hyperparams):
    '''Create an untrained random forest classifier.

    Keyword arguments are forwarded unchanged to the RFC constructor, so a
    caller can set hyper-parameters without editing this function.
    Called with no arguments it returns RFC() exactly as before.
    '''
    return RFC(**hyperparams)

In [9]:
# Instantiate the classifier and echo its configuration.
print(datetime.now().isoformat(sep=' '))
rfc_model = build_model()
print(str(rfc_model))

2023-06-03 12:36:38.529428
RandomForestClassifier()


## Training

In [10]:
# Bracket the fit with timestamps so the training time can be read off the output.
print(datetime.now().isoformat(sep=' '))
print('FIT')
# No sample_weight argument is passed to fit here.
rfc_model.fit(X_train, y_train)
print(datetime.now().isoformat(sep=' '))

2023-06-03 12:36:38.548558
FIT
2023-06-03 12:36:39.120654


In [11]:
print(datetime.now())
print("PREDICT")

# Hard 0/1 decisions and per-class probabilities for the validation rows.
yhat_classes = rfc_model.predict(X_valid)
yhat_pairs = rfc_model.predict_proba(X_valid)  # columns: [ prob of 0, prob of 1 ]
# Keep only the second column (probability of label 1) as a plain list.
yhat_pred = list(yhat_pairs[:, 1])

print('debug pred', yhat_pred[:3])
print('debug class', yhat_classes[:3])
print(datetime.now())

2023-06-03 12:36:39.132388
PREDICT
debug pred [0.5482040256387246, 1.0, 0.0]
debug class [1 1 0]
2023-06-03 12:36:39.245276


In [12]:
# Summary of the predicted probabilities for label 1.
print('Distrib of scores:',np.mean(yhat_pred),'mean',np.std(yhat_pred),'std')
print('Range of scores:',np.min(yhat_pred),'to',np.max(yhat_pred))

# Confusion matrices: raw counts, then fractions of all validation rows.
cm1 = confusion_matrix(y_valid,yhat_classes)
cm2 = confusion_matrix(y_valid,yhat_classes,normalize='all')
print('Confusion matrix\n',cm1)
print('Normalized matrix\n',cm2)

# Metrics computed from the hard 0/1 predictions, scaled to percent
# (MCC is left on its native -1..1 scale).
accuracy = 100.*accuracy_score(y_valid, yhat_classes)
precision = 100.*precision_score(y_valid, yhat_classes)
recall = 100.*recall_score(y_valid, yhat_classes)
f1 = 100.*f1_score(y_valid, yhat_classes)
mcc = matthews_corrcoef(y_valid, yhat_classes)

# Metrics computed from the predicted probabilities.
# precision_recall_curve returns (precision, recall, thresholds); the area is
# taken with recall on the x axis.
prc_Y, prc_X, prc_bins = precision_recall_curve(y_valid, yhat_pred)
auprc = 100.*auc(prc_X,prc_Y)
auroc = 100.*roc_auc_score(y_valid, yhat_pred)

print('Accuracy: %.2f%% Precision: %.2f%% Recall: %.2f%%' % (accuracy,precision,recall))
print('F1: %.2f%% MCC: %.4f' % (f1,mcc))
print('AUPRC: %.2f%% AUROC: %.2f%%' % (auprc,auroc))


Distrib of scores: 0.5077772668579573 mean 0.38195105222524495 std
Range of scores: 0.0 to 1.0
Confusion matrix
 [[612 188]
 [111 689]]
Normalized matrix
 [[0.3825   0.1175  ]
 [0.069375 0.430625]]
Accuracy: 81.31% Precision: 78.56% Recall: 86.12%
F1: 82.17% MCC: 0.6292
AUPRC: 89.23% AUROC: 90.00%
