# Accuracy on Diploid
Run each aligner on the diploid and trust its primary alignment.

In [1]:
# Print the current local date and time (same text that str() of the datetime gives).
from datetime import datetime
print(datetime.now().isoformat(sep=' '))

2023-07-12 12:16:15.019681


In [2]:
import tensorflow as tf
# One float precision is used for the Keras backend; `dt` keeps the name available to later cells.
dt='float32'
tf.keras.backend.set_floatx(dt)
tf.random.set_seed(42) # supposedly leads to reproducible results

# Report whether TensorFlow sees a GPU at the expected device name.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))

# Choose data/model directories depending on whether the Colab package is importable.
# Only the import is guarded: a failure while mounting the drive now surfaces as an
# error instead of silently falling back to the local (Mac) paths.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print('Running on Mac')
    DATA_DIR="/Users/jasonmiller/WVU/BAM_ML/"
    MODEL_DIR="/Users/jasonmiller/WVU/BAM_ML/Models/"
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/IRP2/'  # must end in "/"
    MODEL_DIR=PATH+'My Drive/data/IRP2/Models/'  # must end in "/"
SAVE_MODEL_FILENAME = None

2023-07-12 12:16:15.090335: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU device not found
Running on Mac


2023-07-12 12:16:24.431521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Imports, version report, and NumPy seeding for the notebook.
# NOTE(review): most of the names imported here (random, pandas, gzip, pyplot, keras,
# the sklearn metrics, RFC) are not referenced in the cells visible in this section.
from platform import python_version
print('Python',python_version())
import random
import numpy as np
np.random.seed(42) # seeds NumPy's global RNG; per scikit-learn's docs, estimators use it when random_state is None
import pandas as pd  # NOTE(review): original note said "for plotting" -- confirm intended use
import time # sleep function
from os.path import isfile
import gzip
from matplotlib import pyplot as plt
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)

from tensorflow import keras
# consider sklearn.metrics.classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RFC

# Presumably the number of training epochs; not used by the cells visible here -- TODO confirm.
EPOCHS=150

Python 3.10.0
sklearn 1.1.2


## Comparisons

In [4]:
def show_performance(correct1,total1,correct2,total2):
    """Print binary-classification metrics computed from per-class tallies.

    Class 1 is treated as the negative class and class 2 as the positive
    class, so ``correct1`` is the true-negative count and ``correct2`` is
    the true-positive count.

    Args:
        correct1: Number of class-1 (negative) items classified correctly.
        total1: Total number of class-1 items.
        correct2: Number of class-2 (positive) items classified correctly.
        total2: Total number of class-2 items.

    Returns:
        None. Three lines are printed: accuracy / F1 / MCC, then
        precision / recall / AUPRC, then sensitivity / specificity / AUROC.

    Notes:
        * A percentage whose denominator is zero is printed as ``nan``
          rather than raising ``ZeroDivisionError``.
        * MCC is printed as 0 when its denominator is zero (the usual
          convention for a degenerate confusion matrix).
        * AUPRC and AUROC cannot be derived from four counts; they are
          always printed as 0 placeholders so the layout stays fixed.
    """
    def _percent(numer, denom):
        # Undefined ratio (empty denominator) is reported as NaN, not an exception.
        return 100 * numer / denom if denom else float('nan')

    # class 1 is negative, class 2 is positive
    TN = correct1
    TP = correct2
    FP = total1 - correct1
    FN = total2 - correct2
    accuracy = _percent(TP+TN, total1+total2)
    f1 = _percent(2*TP, 2*TP+FP+FN)
    sensitivity = _percent(TP, TP+FN)
    recall = sensitivity
    specificity = _percent(TN, TN+FP)
    precision = _percent(TP, TP+FP)
    numer = TP*TN - FP*FN
    square = (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)
    # Convert to float before the square root: with integer tallies the product
    # of four counts can exceed 64 bits, and np.sqrt cannot handle such Python ints.
    mcc = numer / np.sqrt(float(square)) if square else 0.0
    # Placeholders only: these need ranked scores, which this function never sees.
    auprc = 0
    auroc = 0
    print('Accuracy: %.2f%% F1: %.2f%% MCC: %.4f' % (accuracy,f1,mcc))
    print('Precision: %.2f%% Recall: %.2f%% AUPRC: %.2f%%' % (precision,recall,auprc))
    print('Sensitivity: %.2f%% Specificity: %.2f%% AUROC: %.2f%%' % (sensitivity,specificity,auroc))

def show_ppm(correct1,correct2,total=1000000):
    """Print performance for two classes that each contain ``total`` items.

    Convenience wrapper around ``show_performance`` for the case where both
    classes have the same size.

    Args:
        correct1: Number of class-1 items classified correctly.
        correct2: Number of class-2 items classified correctly.
        total: Size of each class. Defaults to 1000000, the value that was
            previously hard-coded, so existing two-argument calls behave
            exactly as before.

    Returns:
        None; the metrics are printed by ``show_performance``.
    """
    show_performance(correct1,total,correct2,total)

In [5]:
# Small hand-checkable example: 10 of 20 class-1 items and 15 of 20 class-2 items correct.
show_performance(correct1=10, total1=20, correct2=15, total2=20)

Accuracy: 62.50% F1: 66.67% MCC: 0.2582
Precision: 60.00% Recall: 75.00% AUPRC: 0.00%
Sensitivity: 75.00% Specificity: 50.00% AUROC: 0.00%


## Arabidopsis

In [6]:
# Arabidopsis, Salmon: lyrata is scored as class 1 (negative), halleri as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
lyrata, halleri = 1480670/2, 1538246/2
print('Arabidopsis: Salmon')
show_ppm(lyrata, halleri)

Arabidopsis: Salmon
Accuracy: 75.47% F1: 75.82% MCC: 0.5097
Precision: 74.76% Recall: 76.91% AUPRC: 0.00%
Sensitivity: 76.91% Specificity: 74.03% AUROC: 0.00%


In [7]:
# Arabidopsis, Bowtie: lyrata is scored as class 1 (negative), halleri as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
lyrata, halleri = 1617258/2, 1692966/2
print('Arabidopsis: Bowtie')
show_ppm(lyrata, halleri)

Arabidopsis: Bowtie
Accuracy: 82.76% F1: 83.08% MCC: 0.6556
Precision: 81.56% Recall: 84.65% AUPRC: 0.00%
Sensitivity: 84.65% Specificity: 80.86% AUROC: 0.00%


In [8]:
# Arabidopsis, STAR RNA: lyrata is scored as class 1 (negative), halleri as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
lyrata, halleri = 1603576/2, 1632714/2
print('Arabidopsis: STAR RNA')
show_ppm(lyrata, halleri)

Arabidopsis: STAR RNA
Accuracy: 80.91% F1: 81.05% MCC: 0.6182
Precision: 80.46% Recall: 81.64% AUPRC: 0.00%
Sensitivity: 81.64% Specificity: 80.18% AUROC: 0.00%


In [9]:
# Arabidopsis, HiSat: lyrata is scored as class 1 (negative), halleri as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
lyrata, halleri = 1907012/2, 766564/2
print('Arabidopsis: HiSat')
show_ppm(lyrata, halleri)

Arabidopsis: HiSat
Accuracy: 66.84% F1: 53.61% MCC: 0.4100
Precision: 89.18% Recall: 38.33% AUPRC: 0.00%
Sensitivity: 38.33% Specificity: 95.35% AUROC: 0.00%


In [10]:
# Arabidopsis, STAR DNA: lyrata is scored as class 1 (negative), halleri as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
lyrata, halleri = 1889359/2, 808975/2
print('Arabidopsis: STAR DNA')
show_ppm(lyrata, halleri)

Arabidopsis: STAR DNA
Accuracy: 67.46% F1: 55.42% MCC: 0.4149
Precision: 87.97% Recall: 40.45% AUPRC: 0.00%
Sensitivity: 40.45% Specificity: 94.47% AUROC: 0.00%


## Brassica

In [11]:
# Brassica, Salmon: rapa is scored as class 1 (negative), oleracea as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
rapa, oleracea = 1606314/2, 1777328/2
print('Brassica: Salmon')
show_ppm(rapa, oleracea)

Brassica: Salmon
Accuracy: 84.59% F1: 85.22% MCC: 0.6944
Precision: 81.87% Recall: 88.87% AUPRC: 0.00%
Sensitivity: 88.87% Specificity: 80.32% AUROC: 0.00%


In [12]:
# Brassica, Bowtie: rapa is scored as class 1 (negative), oleracea as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
rapa, oleracea = 1702108/2, 1846870/2
print('Brassica: Bowtie')
show_ppm(rapa, oleracea)

Brassica: Bowtie
Accuracy: 88.72% F1: 89.12% MCC: 0.7765
Precision: 86.11% Recall: 92.34% AUPRC: 0.00%
Sensitivity: 92.34% Specificity: 85.11% AUROC: 0.00%


In [13]:
# Brassica, STAR RNA: rapa is scored as class 1 (negative), oleracea as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
rapa, oleracea = 1741396/2, 1867873/2
print('Brassica: STAR RNA')
show_ppm(rapa, oleracea)

Brassica: STAR RNA
Accuracy: 90.23% F1: 90.53% MCC: 0.8062
Precision: 87.84% Recall: 93.39% AUPRC: 0.00%
Sensitivity: 93.39% Specificity: 87.07% AUROC: 0.00%


In [14]:
# Brassica, HiSat: rapa is scored as class 1 (negative), oleracea as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
rapa, oleracea = 1771726/2, 1901984/2
print('Brassica: HiSat')
show_ppm(rapa, oleracea)

Brassica: HiSat
Accuracy: 91.84% F1: 92.10% MCC: 0.8386
Precision: 89.28% Recall: 95.10% AUPRC: 0.00%
Sensitivity: 95.10% Specificity: 88.59% AUROC: 0.00%


In [15]:
# Brassica, STAR DNA: rapa is scored as class 1 (negative), oleracea as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
rapa, oleracea = 1801146/2, 1920009/2
print('Brassica: STAR DNA')
show_ppm(rapa, oleracea)

Brassica: STAR DNA
Accuracy: 93.03% F1: 93.23% MCC: 0.8621
Precision: 90.62% Recall: 96.00% AUPRC: 0.00%
Sensitivity: 96.00% Specificity: 90.06% AUROC: 0.00%


## Mouse

In [16]:
# Mouse, Bowtie: B6 is scored as class 1 (negative), D2 as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
B6, D2 = 1291778/2, 926958/2
print('Mouse: Bowtie')
show_ppm(B6, D2)

Mouse: Bowtie
Accuracy: 55.47% F1: 51.00% MCC: 0.1112
Precision: 56.69% Recall: 46.35% AUPRC: 0.00%
Sensitivity: 46.35% Specificity: 64.59% AUROC: 0.00%


In [17]:
# Mouse, HiSat: B6 is scored as class 1 (negative), D2 as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
B6, D2 = 1191870/2, 912994/2
print('Mouse: HiSat')
show_ppm(B6, D2)

Mouse: HiSat
Accuracy: 52.62% F1: 49.07% MCC: 0.0529
Precision: 53.05% Recall: 45.65% AUPRC: 0.00%
Sensitivity: 45.65% Specificity: 59.59% AUROC: 0.00%


## Equus

In [18]:
# Equus, Bowtie: asinus is scored as class 1 (negative), caballus as class 2 (positive).
# Each tally is halved before scoring (reason not stated in this notebook -- TODO confirm).
asinus, caballus = 1312344/2, 1563156/2
print('Equus: Bowtie')
show_ppm(asinus, caballus)

Equus: Bowtie
Accuracy: 71.89% F1: 73.55% MCC: 0.4412
Precision: 69.45% Recall: 78.16% AUPRC: 0.00%
Sensitivity: 78.16% Specificity: 65.62% AUROC: 0.00%
