# Accuracy on Diploid
Run each aligner on the diploid and trust its primary alignment.

From every BAM file, take the first 2 million lines = 1 million read pairs.

Count how many of those aligned to the true parent.

In [1]:
from datetime import datetime
# Print the current timestamp as a record of when this cell was executed.
print(datetime.now())

2023-07-22 13:43:03.039725


In [2]:
# Choose data/model directories for the current environment.
# Only the import is guarded: a missing google.colab package is the signal
# that we are not on Colab. Any other failure (e.g. the Drive mount) is
# allowed to propagate instead of silently selecting the local paths.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print('Running on Mac')
    # NOTE(review): absolute, user-specific paths; adjust when running elsewhere.
    DATA_DIR="/Users/jasonmiller/WVU/BAM_ML/"
    MODEL_DIR="/Users/jasonmiller/WVU/BAM_ML/Models/"
else:
    IN_COLAB = True
    print('Running on CoLab')
    PATH='/content/drive/'
    drive.mount(PATH)
    DATA_DIR=PATH+'My Drive/data/IRP2/'  # must end in "/"
    MODEL_DIR=PATH+'My Drive/data/IRP2/Models/'  # must end in "/"
SAVE_MODEL_FILENAME = None

Running on Mac


In [3]:
from platform import python_version
# Report the interpreter version next to the results.
print('Python',python_version())
import random
import numpy as np
np.random.seed(42) # seeds NumPy's global RNG; author's note: supposedly this also sets scikit-learn
import pandas as pd  # for plotting
import time # sleep function
from os.path import isfile
import gzip
from matplotlib import pyplot as plt
import sklearn   # pip install --upgrade scikit-learn
# Report the scikit-learn version next to the results.
print('sklearn',sklearn.__version__)

from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix

Python 3.10.0
sklearn 1.1.2


## Comparisons

In [4]:
def show_performance(correct1,total1,correct2,total2):
    """Print a confusion matrix and summary metrics for a two-class tally.

    Class 1 is treated as the negative class and class 2 as the positive
    class: correct class-1 calls are true negatives, correct class-2 calls
    are true positives, and the remainder of each class is counted as
    false positives / false negatives respectively.

    Parameters
    ----------
    correct1, total1 : int or float
        Correctly assigned count and overall count for class 1 (negative).
    correct2, total2 : int or float
        Correctly assigned count and overall count for class 2 (positive).

    Returns
    -------
    None
        The report is printed; nothing is returned.

    Notes
    -----
    * A metric whose denominator is zero is reported as ``nan`` instead of
      raising ``ZeroDivisionError`` (e.g. precision when nothing was called
      positive, or MCC when any confusion-matrix margin is empty).
    * Counts are printed with ``%d``, which truncates fractional values, so
      half-counts can make the printed totals look off by one.
    * AUPRC and AUROC are placeholders fixed at 0; they are not computed.
    """
    def _ratio(numer, denom):
        # Undefined ratios become NaN rather than raising.
        return numer / denom if denom else float('nan')

    # class 1 is negative, class 2 is positive
    TN = correct1
    FP = total1-correct1
    TP = correct2
    FN = total2-correct2
    grand_total = total1+total2

    accuracy = _ratio(100 * (TP+TN), grand_total)
    f1 = _ratio(100 * 2*TP, 2*TP+FP+FN)
    sensitivity = _ratio(100 * TP, TP+FN)
    recall = sensitivity
    specificity = _ratio(100 * TN, TN+FP)
    precision = _ratio(100 * TP, TP+FP)

    numer = TP*TN - FP*FN
    square = (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)
    # Convert to float before np.sqrt: with integer counts near a million the
    # product exceeds 64 bits, and np.sqrt cannot handle such a Python int.
    denom = np.sqrt(float(square))
    mcc = _ratio(numer, denom)

    auprc = 0  # placeholder, not computed
    auroc = 0  # placeholder, not computed
    preference = _ratio(100*(TP+FP), TP+FP+TN+FN) # % positive class
    print('TP FP TOT: %6d %6d %7d' % (TP,FP,TP+FP))
    print('FN TN TOT: %6d %6d %7d' % (FN,TN,FN+TN))
    print('Pos pref:    %.2f%%' % (preference) )
    print('Accuracy:    %.2f%% F1-score:    %.2f%% MCC:   %.4f' % (accuracy,f1,mcc))
    print('Precision:   %.2f%% Recall:      %.2f%% AUPRC: %.2f%%' % (precision,recall,auprc))
    print('Sensitivity: %.2f%% Specificity: %.2f%% AUROC: %.2f%%' % (sensitivity,specificity,auroc))

def show_ppm(correct1,correct2,total=1000000):
    """Report performance when both classes have the same number of items.

    Parameters
    ----------
    correct1 : int or float
        Correctly assigned count for class 1 (the negative class).
    correct2 : int or float
        Correctly assigned count for class 2 (the positive class).
    total : int or float, optional
        Size of each class. Defaults to one million, the value that was
        previously hard-coded here.

    Returns
    -------
    None
        Delegates to show_performance, which prints the report.
    """
    show_performance(correct1,total,correct2,total)

In [5]:
# Small hand-checkable example: 10 of 20 negatives and 15 of 20 positives correct.
show_performance(correct1=10, total1=20, correct2=15, total2=20)

TP FP TOT:     15     10      25
FN TN TOT:      5     10      15
Pos pref:    62.50%
Accuracy:    62.50% F1-score:    66.67% MCC:   0.2582
Precision:   60.00% Recall:      75.00% AUPRC: 0.00%
Sensitivity: 75.00% Specificity: 50.00% AUROC: 0.00%


## Arabidopsis

In [6]:
# Arabidopsis / Salmon. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# lyrata is the negative class, halleri the positive class.
print('Arabidopsis: Salmon')
lyrata, halleri = 1480670/2, 1538246/2
show_ppm(correct1=lyrata, correct2=halleri)

Arabidopsis: Salmon
TP FP TOT: 769123 259665 1028788
FN TN TOT: 230877 740335  971212
Pos pref:    51.44%
Accuracy:    75.47% F1-score:    75.82% MCC:   0.5097
Precision:   74.76% Recall:      76.91% AUPRC: 0.00%
Sensitivity: 76.91% Specificity: 74.03% AUROC: 0.00%


In [7]:
# Arabidopsis / Bowtie. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# lyrata is the negative class, halleri the positive class.
print('Arabidopsis: Bowtie')
lyrata, halleri = 1617258/2, 1692966/2
show_ppm(correct1=lyrata, correct2=halleri)

Arabidopsis: Bowtie
TP FP TOT: 846483 191371 1037854
FN TN TOT: 153517 808629  962146
Pos pref:    51.89%
Accuracy:    82.76% F1-score:    83.08% MCC:   0.6556
Precision:   81.56% Recall:      84.65% AUPRC: 0.00%
Sensitivity: 84.65% Specificity: 80.86% AUROC: 0.00%


In [8]:
# Arabidopsis / STAR RNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# lyrata is the negative class, halleri the positive class.
print('Arabidopsis: STAR RNA')
lyrata, halleri = 1603576/2, 1632714/2
show_ppm(correct1=lyrata, correct2=halleri)

Arabidopsis: STAR RNA
TP FP TOT: 816357 198212 1014569
FN TN TOT: 183643 801788  985431
Pos pref:    50.73%
Accuracy:    80.91% F1-score:    81.05% MCC:   0.6182
Precision:   80.46% Recall:      81.64% AUPRC: 0.00%
Sensitivity: 81.64% Specificity: 80.18% AUROC: 0.00%


In [9]:
# Arabidopsis / HiSat. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# lyrata is the negative class, halleri the positive class.
print('Arabidopsis: HiSat')
lyrata, halleri = 1907012/2, 766564/2
show_ppm(correct1=lyrata, correct2=halleri)

Arabidopsis: HiSat
TP FP TOT: 383282  46494  429776
FN TN TOT: 616718 953506 1570224
Pos pref:    21.49%
Accuracy:    66.84% F1-score:    53.61% MCC:   0.4100
Precision:   89.18% Recall:      38.33% AUPRC: 0.00%
Sensitivity: 38.33% Specificity: 95.35% AUROC: 0.00%


In [10]:
# Arabidopsis / STAR DNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# lyrata is the negative class, halleri the positive class.
# NOTE(review): both tallies are odd, so halving leaves .5 values that the
# %d columns of the report truncate.
print('Arabidopsis: STAR DNA')
lyrata, halleri = 1889359/2, 808975/2
show_ppm(correct1=lyrata, correct2=halleri)

Arabidopsis: STAR DNA
TP FP TOT: 404487  55320  459808
FN TN TOT: 595512 944679 1540192
Pos pref:    22.99%
Accuracy:    67.46% F1-score:    55.42% MCC:   0.4149
Precision:   87.97% Recall:      40.45% AUPRC: 0.00%
Sensitivity: 40.45% Specificity: 94.47% AUROC: 0.00%


## Brassica

In [11]:
# Brassica / Salmon. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# rapa is the negative class, oleracea the positive class.
print('Brassica: Salmon')
rapa, oleracea = 1606314/2, 1777328/2
show_ppm(correct1=rapa, correct2=oleracea)

Brassica: Salmon
TP FP TOT: 888664 196843 1085507
FN TN TOT: 111336 803157  914493
Pos pref:    54.28%
Accuracy:    84.59% F1-score:    85.22% MCC:   0.6944
Precision:   81.87% Recall:      88.87% AUPRC: 0.00%
Sensitivity: 88.87% Specificity: 80.32% AUROC: 0.00%


In [12]:
# Brassica / Bowtie. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# rapa is the negative class, oleracea the positive class.
print('Brassica: Bowtie')
rapa, oleracea = 1702108/2, 1846870/2
show_ppm(correct1=rapa, correct2=oleracea)

Brassica: Bowtie
TP FP TOT: 923435 148946 1072381
FN TN TOT:  76565 851054  927619
Pos pref:    53.62%
Accuracy:    88.72% F1-score:    89.12% MCC:   0.7765
Precision:   86.11% Recall:      92.34% AUPRC: 0.00%
Sensitivity: 92.34% Specificity: 85.11% AUROC: 0.00%


In [13]:
# Brassica / STAR RNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# rapa is the negative class, oleracea the positive class.
# NOTE(review): the oleracea tally is odd, so halving leaves a .5 value that
# the %d columns of the report truncate.
print('Brassica: STAR RNA')
rapa, oleracea = 1741396/2, 1867873/2
show_ppm(correct1=rapa, correct2=oleracea)

Brassica: STAR RNA
TP FP TOT: 933936 129302 1063238
FN TN TOT:  66063 870698  936761
Pos pref:    53.16%
Accuracy:    90.23% F1-score:    90.53% MCC:   0.8062
Precision:   87.84% Recall:      93.39% AUPRC: 0.00%
Sensitivity: 93.39% Specificity: 87.07% AUROC: 0.00%


In [14]:
# Brassica / HiSat. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# rapa is the negative class, oleracea the positive class.
print('Brassica: HiSat')
rapa, oleracea = 1771726/2, 1901984/2
show_ppm(correct1=rapa, correct2=oleracea)

Brassica: HiSat
TP FP TOT: 950992 114137 1065129
FN TN TOT:  49008 885863  934871
Pos pref:    53.26%
Accuracy:    91.84% F1-score:    92.10% MCC:   0.8386
Precision:   89.28% Recall:      95.10% AUPRC: 0.00%
Sensitivity: 95.10% Specificity: 88.59% AUROC: 0.00%


In [15]:
# Brassica / STAR DNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# rapa is the negative class, oleracea the positive class.
# NOTE(review): the oleracea tally is odd, so halving leaves a .5 value that
# the %d columns of the report truncate.
print('Brassica: STAR DNA')
rapa, oleracea = 1801146/2, 1920009/2
show_ppm(correct1=rapa, correct2=oleracea)

Brassica: STAR DNA
TP FP TOT: 960004  99427 1059431
FN TN TOT:  39995 900573  940568
Pos pref:    52.97%
Accuracy:    93.03% F1-score:    93.23% MCC:   0.8621
Precision:   90.62% Recall:      96.00% AUPRC: 0.00%
Sensitivity: 96.00% Specificity: 90.06% AUROC: 0.00%


## Mouse

In [16]:
# Mouse / Salmon. Each strain's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per strain;
# B6 is the negative class, D2 the positive class.
print('Mouse: Salmon')
B6, D2 = 1361274/2, 692334/2
show_ppm(correct1=B6, correct2=D2)

Mouse: Salmon
TP FP TOT: 346167 319363  665530
FN TN TOT: 653833 680637 1334470
Pos pref:    33.28%
Accuracy:    51.34% F1-score:    41.57% MCC:   0.0284
Precision:   52.01% Recall:      34.62% AUPRC: 0.00%
Sensitivity: 34.62% Specificity: 68.06% AUROC: 0.00%


In [17]:
# Mouse / Bowtie. Each strain's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per strain;
# B6 is the negative class, D2 the positive class.
print('Mouse: Bowtie')
B6, D2 = 1291778/2, 926958/2
show_ppm(correct1=B6, correct2=D2)

Mouse: Bowtie
TP FP TOT: 463479 354111  817590
FN TN TOT: 536521 645889 1182410
Pos pref:    40.88%
Accuracy:    55.47% F1-score:    51.00% MCC:   0.1112
Precision:   56.69% Recall:      46.35% AUPRC: 0.00%
Sensitivity: 46.35% Specificity: 64.59% AUROC: 0.00%


In [18]:
# Mouse / STAR RNA. Each strain's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per strain;
# B6 is the negative class, D2 the positive class.
# NOTE(review): the D2 tally is odd, so halving leaves a .5 value that the
# %d columns of the report truncate.
print('Mouse: STAR RNA')
B6, D2 = 1213650/2, 929285/2
show_ppm(correct1=B6, correct2=D2)

Mouse: STAR RNA
TP FP TOT: 464642 393175  857817
FN TN TOT: 535357 606825 1142182
Pos pref:    42.89%
Accuracy:    53.57% F1-score:    50.02% MCC:   0.0722
Precision:   54.17% Recall:      46.46% AUPRC: 0.00%
Sensitivity: 46.46% Specificity: 60.68% AUROC: 0.00%


In [19]:
# Mouse / HiSat. Each strain's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per strain;
# B6 is the negative class, D2 the positive class.
print('Mouse: HiSat')
B6, D2 = 1191870/2, 912994/2
show_ppm(correct1=B6, correct2=D2)

Mouse: HiSat
TP FP TOT: 456497 404065  860562
FN TN TOT: 543503 595935 1139438
Pos pref:    43.03%
Accuracy:    52.62% F1-score:    49.07% MCC:   0.0529
Precision:   53.05% Recall:      45.65% AUPRC: 0.00%
Sensitivity: 45.65% Specificity: 59.59% AUROC: 0.00%


In [20]:
# Mouse / STAR DNA. Each strain's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per strain;
# B6 is the negative class, D2 the positive class.
# NOTE(review): the D2 tally is odd, so halving leaves a .5 value that the
# %d columns of the report truncate.
print('Mouse: STAR DNA')
B6, D2 = 1207618/2, 893403/2
show_ppm(correct1=B6, correct2=D2)

Mouse: STAR DNA
TP FP TOT: 446701 396191  842892
FN TN TOT: 553298 603809 1157107
Pos pref:    42.14%
Accuracy:    52.53% F1-score:    48.48% MCC:   0.0511
Precision:   53.00% Recall:      44.67% AUPRC: 0.00%
Sensitivity: 44.67% Specificity: 60.38% AUROC: 0.00%


## Equus

In [21]:
# Equus / Salmon. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# asinus is the negative class, caballus the positive class.
print('Equus: Salmon')
asinus, caballus = 1050422/2, 1496400/2
show_ppm(correct1=asinus, correct2=caballus)

Equus: Salmon
TP FP TOT: 748200 474789 1222989
FN TN TOT: 251800 525211  777011
Pos pref:    61.15%
Accuracy:    63.67% F1-score:    67.31% MCC:   0.2805
Precision:   61.18% Recall:      74.82% AUPRC: 0.00%
Sensitivity: 74.82% Specificity: 52.52% AUROC: 0.00%


In [22]:
# Equus / Bowtie. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# asinus is the negative class, caballus the positive class.
print('Equus: Bowtie')
asinus, caballus = 1312344/2, 1563156/2
show_ppm(correct1=asinus, correct2=caballus)

Equus: Bowtie
TP FP TOT: 781578 343828 1125406
FN TN TOT: 218422 656172  874594
Pos pref:    56.27%
Accuracy:    71.89% F1-score:    73.55% MCC:   0.4412
Precision:   69.45% Recall:      78.16% AUPRC: 0.00%
Sensitivity: 78.16% Specificity: 65.62% AUROC: 0.00%


In [23]:
# Equus / STAR RNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# asinus is the negative class, caballus the positive class.
# NOTE(review): the caballus tally is odd, so halving leaves a .5 value that
# the %d columns of the report truncate.
print('Equus: STAR RNA')
asinus, caballus = 1266262/2, 1579987/2
show_ppm(correct1=asinus, correct2=caballus)

Equus: STAR RNA
TP FP TOT: 789993 366869 1156862
FN TN TOT: 210006 633131  843137
Pos pref:    57.84%
Accuracy:    71.16% F1-score:    73.25% MCC:   0.4284
Precision:   68.29% Recall:      79.00% AUPRC: 0.00%
Sensitivity: 79.00% Specificity: 63.31% AUROC: 0.00%


In [24]:
# Equus / HiSat. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# asinus is the negative class, caballus the positive class.
print('Equus: Hisat')
asinus, caballus = 1473588/2, 1448398/2
show_ppm(correct1=asinus, correct2=caballus)

Equus: Hisat
TP FP TOT: 724199 263206  987405
FN TN TOT: 275801 736794 1012595
Pos pref:    49.37%
Accuracy:    73.05% F1-score:    72.88% MCC:   0.4610
Precision:   73.34% Recall:      72.42% AUPRC: 0.00%
Sensitivity: 72.42% Specificity: 73.68% AUROC: 0.00%


In [25]:
# Equus / STAR DNA. Each parent's tally is halved (presumably alignment
# lines -> read pairs) and scored out of one million per parent;
# asinus is the negative class, caballus the positive class.
print('Equus: STAR DNA')
asinus, caballus = 1443462/2, 1420902/2
show_ppm(correct1=asinus, correct2=caballus)

Equus: STAR DNA
TP FP TOT: 710451 278269  988720
FN TN TOT: 289549 721731 1011280
Pos pref:    49.44%
Accuracy:    71.61% F1-score:    71.45% MCC:   0.4322
Precision:   71.86% Recall:      71.05% AUPRC: 0.00%
Sensitivity: 71.05% Specificity: 72.17% AUROC: 0.00%
