# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import csv


## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
name: relative_transcript_abundance

purpose: calculate relative transcript abundance

input: a dataframe with a ref_gene_id column identifying the transcript gene of origin and a cov columns with 
the coverage for the transcripts.

output: the same dataframe with a relative abundance column added
'''



def relative_transcript_abundance(df):
    
    ## Group by gene_id and get total expression for each gene (not counting introns)
    df_sums = df[["gene_id", "total_CPM"]].groupby("gene_id").sum()
    df_sums["total_CPM_gene"] = df_sums["total_CPM"]
    df_sums.drop(columns="total_CPM", inplace=True)
    
    ## Merge dataframe with total gene level CPM with regular transcript level CPM dataframe
    merged_df = pd.merge(df, df_sums, how='inner', on="gene_id")
    
    ## Calculater relative percent abundance for each transcript within its gene
    merged_df["relative_abundance_percent"] = ((merged_df["total_CPM"]/merged_df["total_CPM_gene"]) * 100)
    
    ## Rename total_CPM for transcript column
    merged_df["total_CPM_transcript"] = merged_df["total_CPM"]
    merged_df.drop(columns="total_CPM", inplace=True)


    return merged_df

In [3]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    
    ## Check if this is a gene counts object
    if is_gene:
        
        ## Get count column names and create list of new column names
        count_columns = df.columns.tolist()
        list_new_names = ["gene_id"]
        
        ## gene_id comes in as index for gene counts data, make it into the first column instead
        df["gene_id"] = df.index
        cols = list(df.columns)
        cols = [cols[-1]] + cols[:-1]
        df = df[cols]
        df.reset_index(inplace=True, drop=True)
    
    ## If it is a transcript dataset
    else:
        ## Set count columns and create list of new names
        count_columns = df.columns[2:].tolist()
        list_new_names = [ "transcript_id", "gene_id"]
    
    ## Fix names one by one and add to list of new names
    for col in count_columns:
        col = col.split("_mapped")[0] + "_counts"
        list_new_names.append(col)
    
    ## Rename columns
    df.columns = list_new_names
    
    return df 

In [4]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containining "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False, delete_other=True):

    if is_ref:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)
        
        ## Get gene names
        df["gene_name"] = df["other"].str.split("gene_name \"", expand=True)[1].str.split('\";', expand=True)[0]
        
        ## Get get transcript biotype
        df["gene_biotype"] = df["other"].str.split('gene_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            df["transcript_biotype"] = df["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
            
            ## If is prot get protein_id
            if is_prot:
                df["protein_id"] = df["other"].str.split('protein_id "', expand=True)[1].str.split('"', expand=True)[0]
                df["ccds_id"] = df["other"].str.split('ccds_id "', expand=True)[1].str.split('"', expand=True)[0]
                df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)
        

    else:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)

        ## Get transcript ids
        df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]

        ## Get exon number
        df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        if delete_other:
            df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.NaN
        

    return df

In [5]:
'''
function name: calculate_cpm

purpose: Calculate CPM for the each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):

    ## Set count columns if dataframe is gene counts
    if is_gene:
        count_columns = df.columns[1:].tolist()
    
    ## Set count columns if dataframe is transcript counts
    else:
        count_columns = df.columns[2:].tolist()

    ## Loop through counts columns to calculate CPM and add to the dataframe
    for col in count_columns:
        
        df[col] = round(df[col], 2)
        cpm_name = col.replace("_counts", "_CPM")
        df[cpm_name] = round(((df[col]/(df[col].sum())) * 1000000), 2)
    
    return df  

In [6]:
## define ggplot colors
ggplot2_colors = ["#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#C77CFF", "#FF61CC"]

## - Read references

In [7]:
## Open original reference
original_ref = pd.read_csv("../../../references/Homo_sapiens.GRCh38.107_ERCC.gtf", header=None, delimiter="\t", low_memory=False, 
                       names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"], comment="#")

original_ref = original_ref.loc[~original_ref["chr"].str.startswith("ERCC-")]

## Parse through reference to get gene names and ids
orig_ref = original_ref.loc[original_ref["type"]=="gene"].copy()
orig_ref = parse_df_columns(orig_ref, is_ref=True)

In [8]:
## Import and parse through extended annotations
ref = pd.read_csv("../../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

ref = ref.loc[~ref["chr"].str.startswith("ERCC-")]

ref = parse_df_columns(ref, is_ref=False)

ref_transcripts = ref.loc[ref["type"] == "transcript"].copy()

In [9]:
## Create Category for transcripts and store values

ref_transcripts = ref_transcripts.loc[ref_transcripts["transcript_id"].str.startswith("BambuTx")].copy()

ref_transcripts.loc[ref_transcripts["gene_id"].str.startswith("BambuGene"), "discovery_type"] = "New transcript from new gene body"
ref_transcripts.loc[~(ref_transcripts["gene_id"].str.startswith("BambuGene")), "discovery_type"] = "New transcript from known gene body"
ref_transcripts.loc[ref_transcripts["chr"] == "MT", "discovery_type"] = "New mitochondrially encoded spliced transcript"

## - Make output table

In [10]:
## Download data

leung = pd.read_csv("../../../data/processed/1st_rebuttal/discovery_comparison_bambu/tracking_files/OURS_ALL_vs_leung.tracking", sep="\t", header=None, usecols=[2,3,4],
                         names=["leung_name", "category", "our_name"])

glinos = pd.read_csv("../../../data/processed/1st_rebuttal/discovery_comparison_bambu/tracking_files/OURS_ALL_vs_glinos.tracking", sep="\t", header=None, usecols=[2,3,4],
                         names=["glinos_name", "category", "our_name"])

In [11]:
## Was same transcript idenpendently discovered in Glinos or Leung paper?

leung.loc[leung["category"] == "=", "is_in_leung"] = True
leung.loc[~(leung["category"] == "="), "is_in_leung"] = False

glinos.loc[glinos["category"] == "=", "is_in_glinos"] = True
glinos.loc[~(glinos["category"] == "="), "is_in_glinos"] = False

In [12]:
## Drop category column
leung.drop(columns="category", inplace=True)
glinos.drop(columns="category", inplace=True)

In [13]:
## Merge dataframes
df = leung.merge(glinos, on="our_name", how="inner")

In [14]:
## Create gene_id and transcript_id
df["gene_id"] = df["our_name"].str.split(":", expand=True)[1].str.split("|", expand=True)[0]
df["transcript_id"] = df["our_name"].str.split(":", expand=True)[1].str.split("|", expand=True)[1]

## Drop our name
df.drop(columns="our_name", inplace=True)

In [15]:
## Create flag to determine if transcript is in either, or in both
df.loc[((df["is_in_leung"] == True) & (df["is_in_glinos"] == True)), "is_in_both"] = True
df.loc[~((df["is_in_leung"] == True) & (df["is_in_glinos"] == True)), "is_in_both"] = False

df.loc[((df["is_in_leung"] == True) | (df["is_in_glinos"] == True)), "is_in_either"] = True
df.loc[~((df["is_in_leung"] == True) | (df["is_in_glinos"] == True)), "is_in_either"] = False

In [16]:
## Import high-confidence annotation
high_confidence_annotation = pd.read_csv("../../../references/high_confidence_transcripts.tsv", sep="\t")

In [17]:
## Annotate high-confidence transcripts
df.loc[df["transcript_id"].isin(high_confidence_annotation["transcript_id"]), "is_high-confidence"] = True
df.loc[~(df["transcript_id"].isin(high_confidence_annotation["transcript_id"])), "is_high-confidence"] = False

In [18]:
## Add gene name and chromosome
df = df.merge(orig_ref[["gene_id", "gene_name", "chr"]], on="gene_id", how="left")
df['gene_name'].fillna(df['gene_id'], inplace=True)

In [19]:
## Add type of new transcript
df = df.merge(ref_transcripts[["transcript_id", "discovery_type"]], on="transcript_id", how="inner")

In [20]:
df.columns

Index(['leung_name', 'is_in_leung', 'glinos_name', 'is_in_glinos', 'gene_id',
       'transcript_id', 'is_in_both', 'is_in_either', 'is_high-confidence',
       'gene_name', 'chr', 'discovery_type'],
      dtype='object')

In [21]:
## Reorder columns
df = df[['chr', 'gene_id', 'gene_name', 'transcript_id', "leung_name", "glinos_name", "is_in_glinos", "is_in_leung",
         "is_in_either", "is_in_both", "is_high-confidence", "discovery_type"]].copy()

In [22]:
## Save file
df.to_csv("../../../data/processed/1st_rebuttal/discovery_comparison_bambu/ours_ALL_vs_glinos_and_leung.tsv",
                               sep="\t", index=False)

## - Open tracking data

In [23]:
## Open data
dff = pd.read_csv("../../../data/processed/1st_rebuttal/discovery_comparison_bambu/ours_ALL_vs_glinos_and_leung.tsv",
                               sep="\t")

In [24]:
## Number of new transcripts, high-confidence & no filter (all)
all_number_ours = dff.shape[0]

hf_number_ours = dff.loc[dff["is_high-confidence"] == True].shape[0]

In [25]:
## Number of transcripts in glinos, leung, in both, and in neither
number_all_three = dff.loc[dff["is_in_both"] == True].shape[0]

number_leung_and_ours = dff.loc[dff["is_in_leung"] == True].shape[0]

number_glinos_and_ours = dff.loc[dff["is_in_glinos"] == True].shape[0]
only_ours = dff.loc[dff["is_in_either"] == False].shape[0]
is_in_either = dff.loc[dff["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos = dff.loc[dff["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung = dff.loc[dff["is_in_leung"] == False].shape[0]

In [26]:
## Create high-confidence data frame
dff_hf = dff.loc[dff["is_high-confidence"] == True].copy()

In [27]:
## Number of transcripts in glinos, leung, in both, and in neither
number_all_three_hf = dff_hf.loc[dff_hf["is_in_both"] == True].shape[0]

number_leung_and_ours_hf = dff_hf.loc[dff_hf["is_in_leung"] == True].shape[0]

number_glinos_and_ours_hf = dff_hf.loc[dff_hf["is_in_glinos"] == True].shape[0]
only_ours_hf = dff_hf.loc[dff_hf["is_in_either"] == False].shape[0]
is_in_either_hf = dff_hf.loc[dff["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_hf = dff_hf.loc[dff_hf["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_hf = dff_hf.loc[dff_hf["is_in_leung"] == False].shape[0]

In [28]:
## Number of transcripts in glinos, leung, in both, and in neither
dff_nfk= dff.loc[dff["discovery_type"] == "New transcript from known gene body"].copy()


number_all_three_nfk = dff_nfk.loc[dff_nfk["is_in_both"] == True].shape[0]

number_leung_and_ours_nfk = dff_nfk.loc[dff_nfk["is_in_leung"] == True].shape[0]

number_glinos_and_ours_nfk = dff_nfk.loc[dff_nfk["is_in_glinos"] == True].shape[0]
only_ours_nfk = dff_nfk.loc[dff_nfk["is_in_either"] == False].shape[0]
is_in_either_nfk = dff_nfk.loc[dff_nfk["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_nfk = dff_nfk.loc[dff_nfk["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_nfk = dff_nfk.loc[dff_nfk["is_in_leung"] == False].shape[0]


## Create high-confidence data frame
dff_nfk_hf = dff_nfk.loc[dff_nfk["is_high-confidence"] == True].copy()

## Number of transcripts in glinos, leung, in both, and in neither
number_all_three_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_both"] == True].shape[0]

number_leung_and_ours_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_leung"] == True].shape[0]

number_glinos_and_ours_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_glinos"] == True].shape[0]
only_ours_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_either"] == False].shape[0]
is_in_either_hf_nfk = dff_nfk_hf.loc[dff_nfk["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_hf_nfk = dff_nfk_hf.loc[dff_nfk_hf["is_in_leung"] == False].shape[0]

In [29]:
## Number of transcripts in glinos, leung, in both, and in neither
dff_nfn= dff.loc[dff["discovery_type"] == "New transcript from new gene body"].copy()


number_all_three_nfn = dff_nfn.loc[dff_nfn["is_in_both"] == True].shape[0]

number_leung_and_ours_nfn = dff_nfn.loc[dff_nfn["is_in_leung"] == True].shape[0]

number_glinos_and_ours_nfn = dff_nfn.loc[dff_nfn["is_in_glinos"] == True].shape[0]
only_ours_nfn = dff_nfn.loc[dff_nfn["is_in_either"] == False].shape[0]
is_in_either_nfn = dff_nfn.loc[dff_nfn["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_nfn = dff_nfn.loc[dff_nfn["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_nfn = dff_nfn.loc[dff_nfn["is_in_leung"] == False].shape[0]


## Create high-confidence data frame
dff_nfn_hf = dff_nfn.loc[dff_nfn["is_high-confidence"] == True].copy()

## Number of transcripts in glinos, leung, in both, and in neither
number_all_three_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_both"] == True].shape[0]

number_leung_and_ours_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_leung"] == True].shape[0]

number_glinos_and_ours_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_glinos"] == True].shape[0]
only_ours_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_either"] == False].shape[0]
is_in_either_hf_nfn = dff_nfn_hf.loc[dff_nfn["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_hf_nfn = dff_nfn_hf.loc[dff_nfn_hf["is_in_leung"] == False].shape[0]

In [30]:
## Number of transcripts in glinos, leung, in both, and in neither
dff_nfm= dff.loc[dff["discovery_type"] == "New mitochondrially encoded spliced transcript"].copy()


number_all_three_nfm = dff_nfm.loc[dff_nfm["is_in_both"] == True].shape[0]

number_leung_and_ours_nfm = dff_nfm.loc[dff_nfm["is_in_leung"] == True].shape[0]

number_glinos_and_ours_nfm = dff_nfm.loc[dff_nfm["is_in_glinos"] == True].shape[0]
only_ours_nfm = dff_nfm.loc[dff_nfm["is_in_either"] == False].shape[0]
is_in_either_nfm = dff_nfm.loc[dff_nfm["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_nfm = dff_nfm.loc[dff_nfm["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_nfm = dff_nfm.loc[dff_nfm["is_in_leung"] == False].shape[0]


## Create high-confidence data frame
dff_nfm_hf = dff_nfm.loc[dff_nfm["is_high-confidence"] == True].copy()

## Number of transcripts in glinos, leung, in both, and in neither
number_all_three_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_both"] == True].shape[0]

number_leung_and_ours_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_leung"] == True].shape[0]

number_glinos_and_ours_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_glinos"] == True].shape[0]
only_ours_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_either"] == False].shape[0]
is_in_either_hf_nfm = dff_nfm_hf.loc[dff_nfm["is_in_either"] == True].shape[0]

number_in_ours_not_in_glinos_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_glinos"] == False].shape[0]
number_in_ours_not_in_leung_hf_nfm = dff_nfm_hf.loc[dff_nfm_hf["is_in_leung"] == False].shape[0]

## - Display results (All transcripts)

In [31]:
print("In ours (no filter) total:", ref_transcripts.shape[0])
print("In ours (no filter) and in leung:", number_leung_and_ours)
print("Only in ours:", number_in_ours_not_in_leung)
print()
print("Percentage ours (No filter) validated in leung:", str(round((number_leung_and_ours/ref_transcripts.shape[0])*100, 1)) + "%")

In ours (no filter) total: 3428
In ours (no filter) and in leung: 150
Only in ours: 3278

Percentage ours (No filter) validated in leung: 4.4%


In [32]:
print("In ours (high-confidence) total:", 700)
print("In ours (high-confidence) and in leung:", number_leung_and_ours_hf)
print("Only in ours:", number_in_ours_not_in_leung_hf)
print()
print("Percentage ours (high-confidence) validated in leung:", str(round((number_leung_and_ours_hf/700)*100, 1)) + "%")

In ours (high-confidence) total: 700
In ours (high-confidence) and in leung: 103
Only in ours: 597

Percentage ours (high-confidence) validated in leung: 14.7%


In [33]:
print("In ours (no filter) total:", ref_transcripts.shape[0])
print("In ours (no filter) and in glinos:", number_glinos_and_ours_hf)
print("Only in ours:", number_in_ours_not_in_glinos_hf)
print()
print("Percentage ours (no filter) validated in glinos:", str(round((number_glinos_and_ours/ref_transcripts.shape[0])*100, 1)) + "%")

In ours (no filter) total: 3428
In ours (no filter) and in glinos: 65
Only in ours: 635

Percentage ours (no filter) validated in glinos: 3.6%


In [34]:
print("In ours (high-confidence) total:", 700)
print("In ours (high-confidence) and in glinos:", number_glinos_and_ours_hf)
print("Only in ours:", number_in_ours_not_in_glinos_hf)
print()
print("Percentage ours (high-confidence) validated in glinos:", str(round((number_glinos_and_ours_hf/700)*100, 1)) + "%")

In ours (high-confidence) total: 700
In ours (high-confidence) and in glinos: 65
Only in ours: 635

Percentage ours (high-confidence) validated in glinos: 9.3%


In [35]:
print("In either (no filter)", is_in_either)
print("Only in all three (no filter):", number_all_three)

print()
print("Percentage ours (no filter) validated in either:", str(round((is_in_either/ref_transcripts.shape[0])*100, 1)) + "%")
print("Percentage ours (no filter) validated in both:", str(round((number_all_three/ref_transcripts.shape[0])*100, 1)) + "%")

In either (no filter) 229
Only in all three (no filter): 45

Percentage ours (no filter) validated in either: 6.7%
Percentage ours (no filter) validated in both: 1.3%


In [36]:
print("In either (high-confidence)", is_in_either_hf)
print("Only in all three (high-confidence):", number_all_three_hf)

print()
print("Percentage ours (high-confidence) validated in either:", str(round((is_in_either_hf/700)*100, 1)) + "%")
print("Percentage ours (high-confidence) validated in both:", str(round((number_all_three_hf/700)*100, 1)) + "%")

In either (high-confidence) 136
Only in all three (high-confidence): 32

Percentage ours (high-confidence) validated in either: 19.4%
Percentage ours (high-confidence) validated in both: 4.6%


## - Display results (New from Known)

In [37]:
print("In ours NFK (no filter) total:", 1534)
print("In ours NFK (no filter) and in leung:", number_leung_and_ours_nfk)
print("Only in ours NFK:", number_in_ours_not_in_leung_nfk)
print()
print("Percentage ours NFK (No filter) validated in leung:", str(round((number_leung_and_ours_nfk/1534)*100, 1)) + "%")

In ours NFK (no filter) total: 1534
In ours NFK (no filter) and in leung: 141
Only in ours NFK: 1393

Percentage ours NFK (No filter) validated in leung: 9.2%


In [38]:
print("In ours NFK (high-confidence) total:", 428)
print("In ours NFK (high-confidence) and in leung:", number_leung_and_ours_hf_nfk)
print("Only in ours NFK:", number_in_ours_not_in_leung_hf_nfk)
print()
print("Percentage ours NFK (high-confidence) validated in leung:", str(round((number_leung_and_ours_hf_nfk/428)*100, 1)) + "%")

In ours NFK (high-confidence) total: 428
In ours NFK (high-confidence) and in leung: 98
Only in ours NFK: 330

Percentage ours NFK (high-confidence) validated in leung: 22.9%


In [39]:
print("In ours NFK (no filter) total:", 1534)
print("In ours NFK (no filter) and in glinos:", number_glinos_and_ours_nfk)
print("Only in ours NFK:", number_in_ours_not_in_glinos_nfk)
print()
print("Percentage ours NFK (No filter) validated in glinos:", str(round((number_glinos_and_ours_nfk/1534)*100, 1)) + "%")

In ours NFK (no filter) total: 1534
In ours NFK (no filter) and in glinos: 124
Only in ours NFK: 1410

Percentage ours NFK (No filter) validated in glinos: 8.1%


In [40]:
print("In ours NFK (high-confidence) total:", 428)
print("In ours NFK (high-confidence) and in glinos:", number_glinos_and_ours_hf_nfk)
print("Only in ours NFK:", number_in_ours_not_in_glinos_hf_nfk)
print()
print("Percentage ours NFK (high-confidence) validated in glinos:", str(round((number_glinos_and_ours_hf_nfk/428)*100, 1)) + "%")

In ours NFK (high-confidence) total: 428
In ours NFK (high-confidence) and in glinos: 65
Only in ours NFK: 363

Percentage ours NFK (high-confidence) validated in glinos: 15.2%


In [41]:
print("In either NFK (no filter)", is_in_either_nfk)
print("Only in all three NFK (no filter):", number_all_three_nfk)

print()
print("Percentage ours NFK (no filter) validated in either:", str(round((is_in_either_nfk/1534)*100, 1)) + "%")
print("Percentage ours NFK (no filter) validated in both:", str(round((number_all_three_nfk/1534)*100, 1)) + "%")

In either NFK (no filter) 220
Only in all three NFK (no filter): 45

Percentage ours NFK (no filter) validated in either: 14.3%
Percentage ours NFK (no filter) validated in both: 2.9%


In [42]:
print("In either NFK (high-confidence)", is_in_either_hf_nfk)
print("Only in all three NFK (high-confidence):", number_all_three_hf_nfk)

print()
print("Percentage ours NFK (high-confidence) validated in either:", str(round((is_in_either_hf_nfk/428)*100, 1)) + "%")
print("Percentage ours NFK (high-confidence) validated in both:", str(round((number_all_three_hf_nfk/428)*100, 1)) + "%")

In either NFK (high-confidence) 131
Only in all three NFK (high-confidence): 32

Percentage ours NFK (high-confidence) validated in either: 30.6%
Percentage ours NFK (high-confidence) validated in both: 7.5%


## - Display results (New from New)

In [43]:
print("In ours NFN (no filter) total:", 1860)
print("In ours NFN (no filter) and in leung:", number_leung_and_ours_nfn)
print("Only in ours NFN:", number_in_ours_not_in_leung_nfn)
print()
print("Percentage ours NFN (No filter) validated in leung:", str(round((number_leung_and_ours_nfn/1860)*100, 1)) + "%")

In ours NFN (no filter) total: 1860
In ours NFN (no filter) and in leung: 9
Only in ours NFN: 1851

Percentage ours NFN (No filter) validated in leung: 0.5%


In [44]:
print("In ours NFN (high-confidence) total:", 267)
print("In ours NFN (high-confidence) and in leung:", number_leung_and_ours_hf_nfn)
print("Only in ours NFN:", number_in_ours_not_in_leung_hf_nfn)
print()
print("Percentage ours NFN (high-confidence) validated in leung:", str(round((number_leung_and_ours_hf_nfn/267)*100, 1)) + "%")

In ours NFN (high-confidence) total: 267
In ours NFN (high-confidence) and in leung: 5
Only in ours NFN: 262

Percentage ours NFN (high-confidence) validated in leung: 1.9%


In [45]:
print("In ours NFN (no filter) total:", 1860)
print("In ours NFN (no filter) and in glinos:", number_glinos_and_ours_nfn)
print("Only in ours NFN:", number_in_ours_not_in_glinos_nfn)
print()
print("Percentage ours NFN (No filter) validated in glinos:", str(round((number_glinos_and_ours_nfn/1860)*100, 1)) + "%")

In ours NFN (no filter) total: 1860
In ours NFN (no filter) and in glinos: 0
Only in ours NFN: 1860

Percentage ours NFN (No filter) validated in glinos: 0.0%


In [46]:
print("In ours NFN (high-confidence) total:", 267)
print("In ours NFN (high-confidence) and in glinos:", number_glinos_and_ours_hf_nfn)
print("Only in ours NFN:", number_in_ours_not_in_glinos_hf_nfn)
print()
print("Percentage ours NFN (high-confidence) validated in glinos:", str(round((number_glinos_and_ours_hf_nfn/267)*100, 1)) + "%")

In ours NFN (high-confidence) total: 267
In ours NFN (high-confidence) and in glinos: 0
Only in ours NFN: 267

Percentage ours NFN (high-confidence) validated in glinos: 0.0%


In [47]:
print("In either NFN (no filter)", is_in_either_nfn)
print("Only in all three NFN (no filter):", number_all_three_nfn)

print()
print("Percentage ours NFN (no filter) validated in either:", str(round((is_in_either_nfn/1860)*100, 1)) + "%")
print("Percentage ours NFN (no filter) validated in both:", str(round((number_all_three_nfn/1860)*100, 1)) + "%")

In either NFN (no filter) 9
Only in all three NFN (no filter): 0

Percentage ours NFN (no filter) validated in either: 0.5%
Percentage ours NFN (no filter) validated in both: 0.0%


In [48]:
print("In either NFN (high-confidence)", is_in_either_hf_nfn)
print("Only in all three NFN (high-confidence):", number_all_three_hf_nfn)

print()
print("Percentage ours NFN (high-confidence) validated in either:", str(round((is_in_either_hf_nfn/267)*100, 1)) + "%")
print("Percentage ours NFN (high-confidence) validated in both:", str(round((number_all_three_hf_nfn/267)*100, 1)) + "%")

In either NFN (high-confidence) 5
Only in all three NFN (high-confidence): 0

Percentage ours NFN (high-confidence) validated in either: 1.9%
Percentage ours NFN (high-confidence) validated in both: 0.0%


## - Display results (New from Mito)

In [49]:
print("In ours NFM (no filter) total:", 34)
print("In ours NFM (no filter) and in leung:", number_leung_and_ours_nfm)
print("Only in ours NFM:", number_in_ours_not_in_leung_nfm)
print()
print("Percentage ours NFM (No filter) validated in leung:", str(round((number_leung_and_ours_nfm/34)*100, 1)) + "%")

In ours NFM (no filter) total: 34
In ours NFM (no filter) and in leung: 0
Only in ours NFM: 34

Percentage ours NFM (No filter) validated in leung: 0.0%


In [50]:
print("In ours NFM (high-confidence) total:", 5)
print("In ours NFM (high-confidence) and in leung:", number_leung_and_ours_hf_nfm)
print("Only in ours NFM:", number_in_ours_not_in_leung_hf_nfm)
print()
print("Percentage ours NFM (high-confidence) validated in leung:", str(round((number_leung_and_ours_hf_nfm/5)*100, 1)) + "%")

In ours NFM (high-confidence) total: 5
In ours NFM (high-confidence) and in leung: 0
Only in ours NFM: 5

Percentage ours NFM (high-confidence) validated in leung: 0.0%


In [51]:
print("In ours NFM (no filter) total:", 34)
print("In ours NFM (no filter) and in glinos:", number_glinos_and_ours_nfm)
print("Only in ours NFM:", number_in_ours_not_in_glinos_nfm)
print()
print("Percentage ours NFM (No filter) validated in glinos:", str(round((number_glinos_and_ours_nfm/34)*100, 1)) + "%")

In ours NFM (no filter) total: 34
In ours NFM (no filter) and in glinos: 0
Only in ours NFM: 34

Percentage ours NFM (No filter) validated in glinos: 0.0%


In [52]:
print("In ours NFM (high-confidence) total:", 5)
print("In ours NFM (high-confidence) and in glinos:", number_glinos_and_ours_hf_nfm)
print("Only in ours NFM:", number_in_ours_not_in_glinos_hf_nfm)
print()
print("Percentage ours NFM (high-confidence) validated in glinos:", str(round((number_glinos_and_ours_hf_nfm/5)*100, 1)) + "%")

In ours NFM (high-confidence) total: 5
In ours NFM (high-confidence) and in glinos: 0
Only in ours NFM: 5

Percentage ours NFM (high-confidence) validated in glinos: 0.0%


In [53]:
print("In either NFM (no filter)", is_in_either_nfm)
print("Only in all three NFM (no filter):", number_all_three_nfm)

print()
print("Percentage ours NFM (no filter) validated in either:", str(round((is_in_either_nfm/34)*100, 1)) + "%")
print("Percentage ours NFM (no filter) validated in both:", str(round((number_all_three_nfm/34)*100, 1)) + "%")

In either NFM (no filter) 0
Only in all three NFM (no filter): 0

Percentage ours NFM (no filter) validated in either: 0.0%
Percentage ours NFM (no filter) validated in both: 0.0%


In [54]:
print("In either NFM (high-confidence)", is_in_either_hf_nfm)
print("Only in all three NFM (high-confidence):", number_all_three_hf_nfm)

print()
print("Percentage ours NFM (high-confidence) validated in either:", str(round((is_in_either_hf_nfm/5)*100, 1)) + "%")
print("Percentage ours NFM (high-confidence) validated in both:", str(round((number_all_three_hf_nfm/5)*100, 1)) + "%")

In either NFM (high-confidence) 0
Only in all three NFM (high-confidence): 0

Percentage ours NFM (high-confidence) validated in either: 0.0%
Percentage ours NFM (high-confidence) validated in both: 0.0%
