# CNN + GlobalAvgPool
Run this model on all lncRNA cell lines using some theshold. This notebook tests zero.

* zero
* mean RCI
* Antilog threshold set by EM/GMM.

Slight disconnect: transcripts are filtered by length, but RCI threshold is computed with the RCI from all genes.

Ran on CoLab Pro. RAM 5.6/12.7 GB. GPU 1.9/15.0 GB.

In [22]:
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import time # sleep function
from os.path import isfile
from matplotlib import pyplot as plt 
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

2023-02-05 16:14:40.418978
Python 3.8.10
sklearn 1.0.2


We prevously used sklearn.model_selection.ShuffleSplit   
Now we avoid it due to this note in the 
[documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html):
Note: contrary to other cross-validation strategies, random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets.

In [23]:
import tensorflow as tf
dt='float32'
tf.keras.backend.set_floatx('float32')
tf.random.set_seed(42) 

from tensorflow import keras
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

from keras.models import Sequential
from keras.layers import Masking
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import GlobalAveragePooling1D
from keras.layers import GlobalMaxPooling1D
from keras.layers import AveragePooling1D
from keras.layers import MaxPooling1D
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.losses import BinaryCrossentropy

K=4
CFILTERS=64
FILTERSIZE=16
RCELLS=32
DCELLS=16
EPOCHS=3
FOLDS=5      
EMBED_DIMEN = 4 # arbitrary hyperparameter
BREAK = False   # break after first fold
MINLEN=200
MAXLEN=4000   
PRETTY_PICTURES = False

In [24]:
try:
    from google.colab import drive
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/Localization/TrainTest/'  # must end in "/"
except:
    IN_COLAB = False
    DATA_DIR = 'D:/Adjeroh/Localization/TrainTest/'   # Windows
    DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/'    # Mac
print(DATA_DIR)

Running on CoLab
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/data/Localization/TrainTest/


In [25]:
RCI_FILE =      'CNRCI_noncoding_train_RCI.gc42.csv'
SEQUENCE_FILE = 'CNRCI_noncoding_train_transcripts.gc42.csv'

In [26]:
def get_ordered_list():
    ordered_list = \
    ['A549','H1.hESC','HeLa.S3','HepG2','HT1080',\
      'HUVEC','MCF.7','NCI.H460','NHEK','SK.MEL.5',\
      'SK.N.DZ','SK.N.SH','GM12878','K562','IMR.90']
    return ordered_list
all_cell_lines = get_ordered_list()

EXCLUSIONS = [1,7]   # use these though they are clearly different

## Data Load

In [27]:
class DataLoader():
    def __init__(self):
        self.cache=dict() 
        self.vals = {'A':0, 'C':1, 'G':2, 'T':3}
        
    def load_gene_rci_values(self,filepath,cell_line):
        '''
        Load from RCI csv file.
        Return dict with keys=gene:str and values=RCI:float.
        '''
        gene_to_rci = {}
        overall_sum = 0
        with open (filepath,'r') as handle:
            header = None
            for row in handle:
                if header is None:
                    header = row # skip file's header line
                else:
                    line = row.strip()
                    fields = line.split(',')
                    gene_id = fields.pop(0)
                    rci_str = fields[cell_line]
                    if rci_str != "nan":
                        rci_val = float(rci_str)
                        gene_to_rci[gene_id] = rci_val
        print('Number of RCI values loaded',len(gene_to_rci.keys()))
        return gene_to_rci

    def get_threshold(self,cell_line,gene_to_rci):
        return 0   # IN THIS EXPERIMENT, ALL POSITIVE RCI IS CYTO
    
    def seq_to_kmer_values(self,rna,K):
        # The cache may represent more than one K. Probably not a problem.
        N_indicator = 0 # indicator value
        vec=[] # seq converted to list of K-mers 
        length = len(rna)
        for i in range(length-K+1):
            kmer = rna[i:i+K]
            if 'N' in kmer:
                value = N_indicator
            elif kmer in self.cache.keys():
                value = self.cache[kmer]
            else:
                value = 0
                for j in range(K):
                    value *= 4   
                    nextnuc = kmer[j] 
                    nucval = self.vals[nextnuc]
                    value += nucval
                value += 1   # NNN => 0, AAA => 1
                self.cache[kmer] = value
            vec.append(value)
        return vec

    def rci_to_label(self,rci,rci_threshold):
        CYTO_LABEL = 1
        NUCLEAR_LABEL = 0
        # cnrci = log (cyto-to-nuclear ratio)
        # rci > 0 implies cytoplasmic
        if rci > rci_threshold:
            return CYTO_LABEL
        return NUCLEAR_LABEL

    def load_sequence(self,filepath,rci_threshold):
        labels=[]
        allids=[]
        allseq=[]

        NREPEAT = str('N'*MAXLEN)
        with open (filepath,'r') as handle:
            header = None
            for row in handle:
                if header is None:
                    header = row
                else:
                    line    = row.strip()
                    fields  = line.split(',')
                    tran_id = fields[0]  # with version number
                    gene_id = fields[1]        # without version number
                    seq_len = int(fields[3])
                    seq_txt = fields[4]
                    if seq_len>=MINLEN and seq_len<=MAXLEN and gene_id in gene_to_rci.keys():
                        allids.append( (gene_id,tran_id) )
                        rci_val = gene_to_rci[gene_id]
                        rci_label = self.rci_to_label(rci_val,rci_threshold)
                        labels.append(rci_label)
                        if seq_len<MAXLEN:
                            seq_txt = seq_txt + NREPEAT
                            seq_txt = seq_txt[:MAXLEN]
                        hot_vec = self.seq_to_kmer_values(seq_txt,K)
                        allseq.append(hot_vec)
        return labels,allids,allseq

## Model

In [28]:
def build_model():
    ALPHABET=4**K+1  # NUMBER OF DISTINCT KMERS POSSIBLE, add one if N gets mask value
    ADJUST_LENGTH = MAXLEN-K+1  # fixed length sequences
    cnn = Sequential()
    embed_layer = Embedding(ALPHABET,EMBED_DIMEN,input_length=ADJUST_LENGTH,mask_zero=True)   
    cnn1_layer = Conv1D(CFILTERS, FILTERSIZE)
    pool_layer = MaxPooling1D(pool_size=FILTERSIZE, strides=FILTERSIZE//2)
    #pool_layer = GlobalAveragePooling1D()
    flat_layer = Flatten()
    dens_layer = Dense(DCELLS)
    drop_layer = Dropout(0.5)
    output_layer = Dense(1,activation='sigmoid',dtype=dt)

    cnn.add(embed_layer)
    cnn.add(cnn1_layer)
    cnn.add(pool_layer)
    cnn.add(flat_layer)
    cnn.add(dens_layer)
    cnn.add(drop_layer)
    cnn.add(output_layer)

    bc=BinaryCrossentropy(from_logits=False)
    print("COMPILE")
    cnn.compile(loss=bc, optimizer="Adam",metrics=["accuracy"])
    return cnn

In [29]:
print(datetime.now())
model=build_model()
print(model.summary())  # Print this only once
model=None

2023-02-05 16:14:41.422777
COMPILE
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3997, 4)           1028      
                                                                 
 conv1d (Conv1D)             (None, 3982, 64)          4160      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 496, 64)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 31744)             0         
                                                                 
 dense (Dense)               (None, 16)                507920    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                     

## Training

In [30]:
import gc
class CrossValidator():
    def __init__(self,epochs,folds,quick_test=False,score_threshold=0.5):
        self.epochs = epochs
        self.folds = folds
        self.quick_test = quick_test
        self.score_threshold = score_threshold
    
    def get_gene_subset(self,all_genes,sub_index):
        sub_genes = set()
        for index in sub_index:
            one_gene = all_genes[index]
            sub_genes.add(one_gene)
        return sub_genes
    
    def get_X_y(self,gene_set,allids,allX,allY):
        cnt = len(allids)
        subsetX=[]
        subsetY=[]
        if cnt != len(allX) or cnt!= len(allY):
            raise Exception('Lengths differ')
        for i in range(cnt):
            gene_id,tran_id = allids[i]
            if gene_id in gene_set:
                oneX = allX[i]
                oneY = allY[i]
                subsetX.append(oneX)
                subsetY.append(oneY)
        subsetX = np.array(subsetX)
        subsetY = np.array(subsetY).reshape((-1,1))
        return subsetX,subsetY
    
    def do_cross_validation(self):
        cv_accuracy=[]
        cv_precision=[]
        cv_recall=[]
        cv_f1=[]
        cv_auprc=[]
        cv_auroc=[]
        fold=0
        print(datetime.now())
        print('splitting')
        # KFold shuffles once before making the partitions
        splitter = KFold(n_splits=self.folds,shuffle=True,random_state=42)
        splits = splitter.split(all_genes)
        splitter = None
        for train_index,valid_index in splits:
            fold += 1
            print('Fold',fold)
            train_genes = self.get_gene_subset(all_genes,train_index)
            X_train,y_train = self.get_X_y(train_genes,allids,allseq,labels)
            valid_genes = self.get_gene_subset(all_genes,valid_index)
            X_valid,y_valid = self.get_X_y(valid_genes,allids,allseq,labels)

            #print('Training example')
            #print(X_train[0])
            print("BUILD MODEL")
            model=build_model()

            print("FIT")
            print(datetime.now())
            history=model.fit(X_train, y_train, # batch_size=10, default=32 works nicely
              epochs=self.epochs, verbose=0,  # verbose=1 for ascii art, verbose=0 for none
              validation_data=(X_valid,y_valid) )

            print("PREDICT")
            print(datetime.now())
            yhat_pred=model.predict(X_valid, verbose=0)             
            yhat_classes=np.where(yhat_pred > self.score_threshold, 1, 0)

            # accuracy: (tp + tn) / (p + n)
            accuracy = accuracy_score(y_valid, yhat_classes)*100.
            # precision tp / (tp + fp)
            precision = precision_score(y_valid, yhat_classes)*100.
            # recall: tp / (tp + fn)
            recall = recall_score(y_valid, yhat_classes)*100.
            # f1: 2 tp / (2 tp + fp + fn)
            f1 = f1_score(y_valid, yhat_classes)*100.
            # PRC
            prc_Y, prc_X, prc_bins = precision_recall_curve(y_valid, yhat_pred)
            auprc = auc(prc_X,prc_Y)*100.
            auroc = roc_auc_score(y_valid, yhat_pred)*100.

            if PRETTY_PICTURES:
                pd.DataFrame(history.history).plot(figsize=(8,5))
                plt.grid(True)
                plt.gca().set_ylim(0,1)
                plt.show()

                print('Train set ones/size',
                      np.count_nonzero(y_train),'/',len(y_train))
                print("Compute valiation accuracy")
                print('Valid sizes',X_valid.shape,y_valid.shape)
                print('Valid set ones/size',
                      np.count_nonzero(y_valid),'/',len(y_valid))
                print('Range of scores:',np.min(yhat_pred),'to',np.max(yhat_pred))
                print('Score threshold',self.score_threshold)
                print('Prediction set ones/size',
                      np.count_nonzero(yhat_classes),'/',len(yhat_classes))
            
                count_ones= len(y_valid[y_valid==1])
                count_zeros= len(y_valid[y_valid==0])
                guess = max(count_ones,count_zeros) / len(y_valid)
                plt.plot(prc_X, prc_Y, marker='.')
                plt.plot([0, 1], [guess,guess], linestyle='--')
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.show()
                # ROC
                fpr, tpr, roc_bins = roc_curve(y_valid, yhat_pred)
                plt.plot(fpr, tpr, marker='.')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.show()
            
                print('Accuracy: %.2f%% Precision: %.2f%% Recall: %.2f%%' % (accuracy,precision,recall)) 
                print('F1: %.2f%% AUPRC: %.2f%% AUROC: %.2f%%' % (f1,auprc,auroc)) 
                cv_accuracy.append(accuracy)
            cv_precision.append(precision)
            cv_recall.append(recall)
            cv_f1.append(f1)
            cv_auprc.append(auprc)
            cv_auroc.append(auroc)
            
            print(datetime.now())
            if self.quick_test:   
                print('Break -- this was for code testing only')
                break

            # There is a memory leak within the fit() command!
            # Each successive call to fit() consumes more memory.
            model = None
            history = None
            prc_Y = None
            prc_X = None
            prc_bins = None
            yhat_classes = None
            X_train = None
            y_train = None
            train_genes = None
            X_valid = None
            y_valid = None
            valid_genes = None
            tf.keras.backend.clear_session()
            gc.collect()
            time.sleep(1)  # hope gc kicks in
        print()
        return cv_accuracy, cv_precision, cv_recall, cv_f1, cv_auprc, cv_auroc

In [31]:
for CELL_LINE in range(15):
    print(datetime.now())
    print('Start cell Line',CELL_LINE,all_cell_lines[CELL_LINE])
    loader = DataLoader()
    filepath = DATA_DIR+RCI_FILE
    gene_to_rci = loader.load_gene_rci_values(filepath,CELL_LINE)
    print('Num RCI:', len(gene_to_rci.keys()))
    print('Example RCI:', list(gene_to_rci.items())[:3])
    all_genes = list(gene_to_rci.keys())
    RCI_THRESHOLD = loader.get_threshold(CELL_LINE,gene_to_rci)
    print('RCI threshold',RCI_THRESHOLD)
    print('Load sequence...')
    filepath = DATA_DIR+SEQUENCE_FILE
    labels,allids,allseq = loader.load_sequence(filepath,RCI_THRESHOLD)
    print('Num IDs:',len(allids))
    #print('Example IDs:',[allids[x] for x in [10, 20, 30, 40]] )
    #print('Num labels:',len(labels))
    #print('Example labels:',[labels[x] for x in [10, 20, 30, 40]] )
    #print('Num sequences:',len(allseq))
    #print('Example sequence:',allseq[3])
    loader = None  # drop K-mer cache to save RAM

    print("Cross validation...")
    cvdo = CrossValidator(EPOCHS,FOLDS,BREAK)
    cv_accuracy, cv_precision, cv_recall, cv_f1, cv_auprc, cv_auroc = cvdo.do_cross_validation()   
    cvdo = None
    print("Completed cross validation %d folds %d epochs" % (FOLDS,EPOCHS)) 
    print(" accuracy mean %.2f%% +/- %.2f" % (np.mean(cv_accuracy), np.std(cv_accuracy)))
    print(" precision mean %.2f%% +/- %.2f" % (np.mean(cv_precision), np.std(cv_precision)))
    print(" recall mean %.2f%% +/- %.2f" % (np.mean(cv_recall), np.std(cv_recall)))
    print(" F1 mean %.2f%% +/- %.2f" % (np.mean(cv_f1), np.std(cv_f1)))
    print(" AUPRC mean %.2f%% +/- %.2f" % (np.mean(cv_auprc), np.std(cv_auprc)))
    print(" AUROC mean %.2f%% +/- %.2f" % (np.mean(cv_auroc), np.std(cv_auroc)))
    print('Finished cell Line',CELL_LINE,all_cell_lines[CELL_LINE])
    print()
print(datetime.now())

2023-02-05 16:14:41.585083
Start cell Line 0 A549
Number of RCI values loaded 1447
Num RCI: 1447
Example RCI: [('ENSG00000116652', -1.848), ('ENSG00000117242', -0.25673), ('ENSG00000125462', 0.13734)]
RCI threshold 0
Load sequence...
Num IDs: 8474
Cross validation...
2023-02-05 16:14:51.936307
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:14:53.748967
PREDICT
2023-02-05 16:15:04.769808
2023-02-05 16:15:05.076531
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:15:08.115694
PREDICT
2023-02-05 16:15:19.037609
2023-02-05 16:15:19.353259
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:15:22.353666
PREDICT
2023-02-05 16:15:33.251492
2023-02-05 16:15:33.658978
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:15:36.663576
PREDICT
2023-02-05 16:15:47.548090
2023-02-05 16:15:47.950079
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:15:50.924755
PREDICT
2023-02-05 16:16:01.811043
2023-02-05 16:16:02.222417

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 56.15%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 16145
Cross validation...
2023-02-05 16:16:21.254920
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:16:24.529254
PREDICT
2023-02-05 16:16:45.927446
2023-02-05 16:16:46.435001
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:16:51.019737
PREDICT
2023-02-05 16:17:01.972585
2023-02-05 16:17:02.412571
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:17:06.986127
PREDICT
2023-02-05 16:17:28.221397
2023-02-05 16:17:28.974960
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:17:33.582866
PREDICT
2023-02-05 16:17:44.565498
2023-02-05 16:17:45.013471
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:17:49.640056
PREDICT
2023-02-05 16:18:00.631811
2023-02-05 16:18:01.383794

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 44.06% +/- 3.23
 recall mean 33.64% +/- 10.56
 F1 mean 37.30% +/- 6.77
 AUPRC mean 43.59% +/- 3.45
 AUROC mean 56.72% +/- 2.24
Finished cell Line 1 H1.hESC

2023-02-05 16:18:02.842258
Start cell Line 2 HeLa.S3
Number of RCI values loaded 9

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 5699
Cross validation...
2023-02-05 16:18:09.168816
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:10.439791
PREDICT
2023-02-05 16:18:16.205316
2023-02-05 16:18:16.448204
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:18.860090
PREDICT
2023-02-05 16:18:24.601125
2023-02-05 16:18:24.844030
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:27.262852
PREDICT
2023-02-05 16:18:31.877947
2023-02-05 16:18:32.287904
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:34.679104
PREDICT
2023-02-05 16:18:39.209544
2023-02-05 16:18:39.454464
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:41.860498
PREDICT
2023-02-05 16:18:46.311310
2023-02-05 16:18:46.553048

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 37.43% +/- 32.29
 recall mean 14.29% +/- 14.74
 F1 mean 13.87% +/- 9.63
 AUPRC mean 28.30% +/- 11.02
 AUROC mean 52.11% +/- 6.99
Finished cell Line 2 HeLa.S3

2023-02-05 16:18:47.809175
Start cell Line 3 HepG2
Number of RCI values loaded 13

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 8220
Cross validation...
2023-02-05 16:18:56.848362
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:18:58.579700
PREDICT
2023-02-05 16:19:09.444550
2023-02-05 16:19:09.864972
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:19:12.820447
PREDICT
2023-02-05 16:19:19.043636
2023-02-05 16:19:19.453503
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:19:22.420985
PREDICT
2023-02-05 16:19:28.230942
2023-02-05 16:19:28.559045
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:19:31.530486
PREDICT
2023-02-05 16:19:37.846971
2023-02-05 16:19:38.260694
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:19:41.207770
PREDICT
2023-02-05 16:19:52.101222
2023-02-05 16:19:52.505851

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 28.37% +/- 9.09
 recall mean 17.74% +/- 8.75
 F1 mean 21.10% +/- 7.87
 AUPRC mean 28.41% +/- 8.19
 AUROC mean 49.66% +/- 1.61
Finished cell Line 3 HepG2

2023-02-05 16:19:53.816052
Start cell Line 4 HT1080
Number of RCI values loaded 936
Nu

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 6895
Cross validation...
2023-02-05 16:20:01.546477
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:03.045050
PREDICT
2023-02-05 16:20:08.252007
2023-02-05 16:20:08.499989
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:11.184999
PREDICT
2023-02-05 16:20:16.930317
2023-02-05 16:20:17.178900
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:19.828494
PREDICT
2023-02-05 16:20:25.582311
2023-02-05 16:20:25.828655
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:28.470836
PREDICT
2023-02-05 16:20:34.168798
2023-02-05 16:20:34.417685
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:37.096705
PREDICT
2023-02-05 16:20:42.412294
2023-02-05 16:20:42.665526

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 44.43% +/- 11.67
 recall mean 60.13% +/- 14.15
 F1 mean 49.54% +/- 10.35
 AUPRC mean 47.49% +/- 11.66
 AUROC mean 51.20% +/- 4.68
Finished cell Line 4 HT1080

2023-02-05 16:20:43.966488
Start cell Line 5 HUVEC
Number of RCI values loaded 14

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 9317
Cross validation...
2023-02-05 16:20:54.365196
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:20:56.257794
PREDICT
2023-02-05 16:21:07.176628
2023-02-05 16:21:07.500517
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:21:10.648428
PREDICT
2023-02-05 16:21:21.596839
2023-02-05 16:21:21.947708
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:21:25.120908
PREDICT
2023-02-05 16:21:31.870700
2023-02-05 16:21:32.301270
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:21:35.535753
PREDICT
2023-02-05 16:21:46.435254
2023-02-05 16:21:46.761336
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:21:49.926717
PREDICT
2023-02-05 16:22:00.828136
2023-02-05 16:22:01.125051

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 38.65% +/- 9.72
 recall mean 24.08% +/- 13.13
 F1 mean 27.06% +/- 8.72
 AUPRC mean 35.47% +/- 6.93
 AUROC mean 57.80% +/- 2.77
Finished cell Line 5 HUVEC

2023-02-05 16:22:02.457930
Start cell Line 6 MCF.7
Number of RCI values loaded 2156
N

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 11276
Cross validation...
2023-02-05 16:22:14.993409
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:22:17.243806
PREDICT
2023-02-05 16:22:28.193379
2023-02-05 16:22:28.614196
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:22:32.166198
PREDICT
2023-02-05 16:22:39.942388
2023-02-05 16:22:40.297059
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:22:43.922051
PREDICT
2023-02-05 16:22:51.695603
2023-02-05 16:22:52.053873
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:22:55.619028
PREDICT
2023-02-05 16:23:03.831248
2023-02-05 16:23:04.129898
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:07.812640
PREDICT
2023-02-05 16:23:15.745917
2023-02-05 16:23:16.068733

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 32.51% +/- 5.91
 recall mean 15.91% +/- 9.49
 F1 mean 19.40% +/- 8.71
 AUPRC mean 30.52% +/- 4.63
 AUROC mean 52.51% +/- 2.95
Finished cell Line 6 MCF.7

2023-02-05 16:23:17.436305
Start cell Line 7 NCI.H460
Number of RCI values loaded 610

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 5067
Cross validation...
2023-02-05 16:23:23.208202
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:24.338246
PREDICT
2023-02-05 16:23:28.226691
2023-02-05 16:23:28.471521
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:30.749077
PREDICT
2023-02-05 16:23:36.487510
2023-02-05 16:23:36.727678
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:39.070843
PREDICT
2023-02-05 16:23:44.800011
2023-02-05 16:23:45.041496
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:47.316056
PREDICT
2023-02-05 16:23:51.414802
2023-02-05 16:23:51.606962
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:23:53.868786
PREDICT
2023-02-05 16:23:57.961698
2023-02-05 16:23:58.197421

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 17.69% +/- 7.79
 recall mean 5.91% +/- 3.50
 F1 mean 7.98% +/- 4.17
 AUPRC mean 21.42% +/- 11.37
 AUROC mean 50.96% +/- 5.15
Finished cell Line 7 NCI.H460

2023-02-05 16:23:59.458280
Start cell Line 8 NHEK
Number of RCI values loaded 1080
N

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 7916
Cross validation...
2023-02-05 16:24:08.280315
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:24:10.020457
PREDICT
2023-02-05 16:24:20.878861
2023-02-05 16:24:21.199057
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:24:24.065502
PREDICT
2023-02-05 16:24:34.952016
2023-02-05 16:24:35.357861
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:24:38.287270
PREDICT
2023-02-05 16:24:49.158014
2023-02-05 16:24:49.569637
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:24:52.414739
PREDICT
2023-02-05 16:25:03.290827
2023-02-05 16:25:03.695342
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:06.560687
PREDICT
2023-02-05 16:25:12.566255
2023-02-05 16:25:12.831647

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 39.85% +/- 7.45
 recall mean 32.43% +/- 16.19
 F1 mean 32.07% +/- 8.79
 AUPRC mean 40.27% +/- 6.18
 AUROC mean 52.60% +/- 3.29
Finished cell Line 8 NHEK

2023-02-05 16:25:14.137234
Start cell Line 9 SK.MEL.5
Number of RCI values loaded 534


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 5015
Cross validation...
2023-02-05 16:25:19.832186
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:20.899055
PREDICT
2023-02-05 16:25:26.631089
2023-02-05 16:25:26.854135
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:29.145298
PREDICT
2023-02-05 16:25:33.225325
2023-02-05 16:25:33.459863
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:35.725435
PREDICT
2023-02-05 16:25:41.439604
2023-02-05 16:25:41.678516
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:43.940678
PREDICT
2023-02-05 16:25:49.663442
2023-02-05 16:25:49.880752
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:25:52.144481
PREDICT
2023-02-05 16:25:56.251581
2023-02-05 16:25:56.431662

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 18.10% +/- 13.46
 recall mean 5.95% +/- 6.28
 F1 mean 7.32% +/- 6.21
 AUPRC mean 21.41% +/- 3.43
 AUROC mean 48.90% +/- 7.68
Finished cell Line 9 SK.MEL.5

2023-02-05 16:25:57.694360
Start cell Line 10 SK.N.DZ
Number of RCI values loaded 60

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 5574
Cross validation...
2023-02-05 16:26:04.017270
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:05.178040
PREDICT
2023-02-05 16:26:09.562566
2023-02-05 16:26:09.815165
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:12.187764
PREDICT
2023-02-05 16:26:16.452360
2023-02-05 16:26:16.694056
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:19.058317
PREDICT
2023-02-05 16:26:24.765720
2023-02-05 16:26:24.984651
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:27.380574
PREDICT
2023-02-05 16:26:33.115872
2023-02-05 16:26:33.355027
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:35.723775
PREDICT
2023-02-05 16:26:41.449568
2023-02-05 16:26:41.698851

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 32.92% +/- 7.61
 recall mean 18.59% +/- 9.91
 F1 mean 22.81% +/- 9.04
 AUPRC mean 34.85% +/- 5.28
 AUROC mean 48.68% +/- 6.74
Finished cell Line 10 SK.N.DZ

2023-02-05 16:26:42.967903
Start cell Line 11 SK.N.SH
Number of RCI values loaded 1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 10071
Cross validation...
2023-02-05 16:26:54.158164
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:26:56.150605
PREDICT
2023-02-05 16:27:07.093814
2023-02-05 16:27:07.410705
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:27:10.850649
PREDICT
2023-02-05 16:27:17.667179
2023-02-05 16:27:18.041186
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:27:21.356728
PREDICT
2023-02-05 16:27:32.253511
2023-02-05 16:27:32.623763
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:27:36.007480
PREDICT
2023-02-05 16:27:43.475737
2023-02-05 16:27:43.779651
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:27:47.089553
PREDICT
2023-02-05 16:27:54.656346
2023-02-05 16:27:55.065153

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 45.56% +/- 8.09
 recall mean 21.32% +/- 14.74
 F1 mean 26.62% +/- 14.26
 AUPRC mean 40.54% +/- 5.66
 AUROC mean 60.31% +/- 3.46
Finished cell Line 11 SK.N.SH

2023-02-05 16:27:56.401115
Start cell Line 12 GM12878
Number of RCI values loade

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 9578
Cross validation...
2023-02-05 16:28:06.935597
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:28:08.899652
PREDICT
2023-02-05 16:28:15.851245
2023-02-05 16:28:16.266025
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:28:19.470378
PREDICT
2023-02-05 16:28:30.361346
2023-02-05 16:28:30.699769
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:28:33.910897
PREDICT
2023-02-05 16:28:44.788294
2023-02-05 16:28:45.170193
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:28:48.370156
PREDICT
2023-02-05 16:28:59.282010
2023-02-05 16:28:59.692235
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:02.899028
PREDICT
2023-02-05 16:29:13.778925
2023-02-05 16:29:14.194366

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 38.54% +/- 10.33
 recall mean 28.51% +/- 16.47
 F1 mean 29.44% +/- 11.03
 AUPRC mean 38.76% +/- 7.14
 AUROC mean 56.02% +/- 1.85
Finished cell Line 12 GM12878

2023-02-05 16:29:15.528185
Start cell Line 13 K562
Number of RCI values loaded 9

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 6173
Cross validation...
2023-02-05 16:29:22.381864
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:23.685369
PREDICT
2023-02-05 16:29:28.556039
2023-02-05 16:29:28.795212
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:31.317843
PREDICT
2023-02-05 16:29:37.029537
2023-02-05 16:29:37.445862
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:39.975996
PREDICT
2023-02-05 16:29:44.906380
2023-02-05 16:29:45.151830
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:47.735688
PREDICT
2023-02-05 16:29:52.542387
2023-02-05 16:29:52.791857
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:29:55.300839
PREDICT
2023-02-05 16:30:01.055482
2023-02-05 16:30:01.300978

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 35.49% +/- 7.96
 recall mean 12.82% +/- 8.72
 F1 mean 17.49% +/- 8.89
 AUPRC mean 34.46% +/- 6.27
 AUROC mean 53.58% +/- 5.86
Finished cell Line 13 K562

2023-02-05 16:30:02.573122
Start cell Line 14 IMR.90
Number of RCI values loaded 390
N

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


Num IDs: 2386
Cross validation...
2023-02-05 16:30:05.308518
splitting
Fold 1
BUILD MODEL
COMPILE
FIT
2023-02-05 16:30:05.868814
PREDICT
2023-02-05 16:30:09.045650
2023-02-05 16:30:09.197430
Fold 2
BUILD MODEL
COMPILE
FIT
2023-02-05 16:30:10.920712
PREDICT
2023-02-05 16:30:14.066588
2023-02-05 16:30:14.218007
Fold 3
BUILD MODEL
COMPILE
FIT
2023-02-05 16:30:15.928593
PREDICT
2023-02-05 16:30:18.240794
2023-02-05 16:30:18.394315
Fold 4
BUILD MODEL
COMPILE
FIT
2023-02-05 16:30:20.100453
PREDICT
2023-02-05 16:30:22.462773
2023-02-05 16:30:22.614193
Fold 5
BUILD MODEL
COMPILE
FIT
2023-02-05 16:30:24.351087
PREDICT
2023-02-05 16:30:27.481270
2023-02-05 16:30:27.625225

Completed cross validation 5 folds 3 epochs
 accuracy mean nan% +/- nan
 precision mean 46.02% +/- 12.76
 recall mean 28.29% +/- 13.97
 F1 mean 31.17% +/- 14.05
 AUPRC mean 43.08% +/- 9.48
 AUROC mean 57.74% +/- 8.19
Finished cell Line 14 IMR.90

2023-02-05 16:30:28.828724


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
