# Table of contents

### - Import libraries and define functions + Initial setup

### - Data pre-processing (Our Data)

### - Data pre-processing (GTEX Data)

### - Data pre-processing (Merge Data)

### - Make figures

# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import csv

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from scipy import stats
from wordcloud import WordCloud

%matplotlib inline


## Lift the pandas display limits: no cap on printed rows or on column width
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
'''
name: relative_transcript_abundance

purpose: calculate relative transcript abundance

input: a dataframe with a gene_id column identifying the transcript gene of origin and a total_CPM column with 
the total CPM for each transcript.

output: a copy of the dataframe with total_CPM_gene and relative_abundance_percent columns added and total_CPM renamed to total_CPM_transcript
'''



def relative_transcript_abundance(df):
    """Express each transcript's total CPM as a percentage of its gene's total CPM.

    input: dataframe with a "gene_id" column and a "total_CPM" column (one row per transcript).

    output: new dataframe (the input is not modified) in which "total_CPM" has been replaced by
    "total_CPM_gene", "relative_abundance_percent" and "total_CPM_transcript".
    """

    ## Total CPM per gene, indexed by gene_id and already carrying its final column name
    gene_totals = (
        df[["gene_id", "total_CPM"]]
        .groupby("gene_id")
        .sum()
        .rename(columns={"total_CPM": "total_CPM_gene"})
    )

    ## Attach the gene level total to every transcript row of that gene
    out = pd.merge(df.copy(), gene_totals, how="inner", on="gene_id")

    ## Share of the gene's expression contributed by each transcript, in percent
    out["relative_abundance_percent"] = (out["total_CPM"] / out["total_CPM_gene"]) * 100

    ## Keep the transcript level CPM under an explicit name and drop the ambiguous one
    out["total_CPM_transcript"] = out["total_CPM"]
    out = out.drop(columns="total_CPM")

    return out

In [3]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    """Give a raw counts dataframe short, consistent column names.

    input: raw counts dataframe. With is_gene=True the gene ids are read from the index and every
    column is a count column; otherwise the first two columns are taken to be the transcript id
    and the gene id and the remaining columns are count columns.

    output: new dataframe whose id columns are named "gene_id" (and "transcript_id") and whose
    count columns are named "<text before '_mapped'>_counts".
    """

    out = df.copy()

    if is_gene:
        ## Every incoming column is a count column; the gene ids live in the index
        sample_columns = out.columns.tolist()
        id_names = ["gene_id"]

        ## Copy the index into a column, rotate that column to the front, then discard the index
        out["gene_id"] = out.index
        ordered = list(out.columns)
        out = out[ordered[-1:] + ordered[:-1]]
        out = out.reset_index(drop=True)
    else:
        ## The first two columns hold the ids, everything after them is a count column
        sample_columns = out.columns[2:].tolist()
        id_names = ["transcript_id", "gene_id"]

    ## Trim each sample name at "_mapped" and tag it as a counts column
    out.columns = id_names + [name.split("_mapped")[0] + "_counts" for name in sample_columns]

    return out

In [4]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def _extract_gtf_attribute(attributes, key):
    """Return the quoted value that follows `key` in each attribute string (NaN where it is absent)."""
    return attributes.str.extract(key + ' "([^"]*)"', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False, delete_other=True):
    """Split the aggregate attribute column ("other") of a GTF dataframe into separate columns.

    input:
        df: dataframe holding a "raw" gtf; must contain the columns "other", "dot_1" and "dot_2".
        is_ref: True for the reference annotation layout (adds gene_name and gene_biotype),
            False for the discovery annotation layout (adds transcript_id and exon_number).
        is_transcript: only used when is_ref is True; also adds transcript_id and transcript_biotype.
        is_prot: only used when is_ref and is_transcript are True; also adds protein_id, ccds_id
            and exon_number.
        delete_other: only used when is_ref is False; when True the "other", "dot_1" and "dot_2"
            columns are dropped. With is_ref=True those columns are always dropped.

    output: new dataframe (the input is not modified) with the parsed columns added. Attributes
    that a row does not carry are left as NaN, including the case where no row carries them.
    """

    dff = df.copy()
    attributes = dff["other"]

    ## The gene id is read by position: it is the last quote/space free token of the first
    ## '";'-terminated attribute, so the attribute string is expected to start with gene_id
    dff["gene_id"] = attributes.str.split('";', n=1).str[0].str.extract(r'([^ "]*$)', expand=False)

    if is_ref:

        ## Get gene names and gene biotypes
        dff["gene_name"] = _extract_gtf_attribute(attributes, "gene_name")
        dff["gene_biotype"] = _extract_gtf_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            dff["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")
            dff["transcript_biotype"] = _extract_gtf_attribute(attributes, "transcript_biotype")

            ## If is prot get protein id, ccds id and exon number
            if is_prot:
                dff["protein_id"] = _extract_gtf_attribute(attributes, "protein_id")
                dff["ccds_id"] = _extract_gtf_attribute(attributes, "ccds_id")
                dff["exon_number"] = _extract_gtf_attribute(attributes, "exon_number")

        ## Drop the raw attribute column and the two placeholder columns
        dff.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    else:

        ## Get transcript ids and exon numbers
        dff["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")
        dff["exon_number"] = _extract_gtf_attribute(attributes, "exon_number")

        ## Drop the raw attribute column and the two placeholder columns
        if delete_other:
            dff.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalize every missing value to NaN (np.nan: the np.NaN alias no longer exists in NumPy 2)
    for col in dff.columns:
        dff.loc[dff[col].isnull(), col] = np.nan

    return dff

In [5]:
'''
function name: calculate_cpm

purpose: Calculate CPM for each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):
    """Add a counts-per-million column next to every count column.

    input: counts dataframe whose count columns start after the id column(s): after the first
    column when is_gene is True, after the first two columns otherwise.

    output: new dataframe (the input is not modified) in which each count column is rounded to
    two decimals and a matching column, named by replacing "_counts" with "_CPM", holds the
    counts divided by the column sum and multiplied by one million, rounded to two decimals.
    """

    out = df.copy()

    ## Gene level tables carry one id column, transcript level tables carry two
    first_count_position = 1 if is_gene else 2

    ## The list of count columns is fixed before any CPM column is appended
    for count_name in out.columns[first_count_position:].tolist():

        rounded_counts = out[count_name].round(2)
        out[count_name] = rounded_counts

        ## The column sum of the rounded counts is used as the library size
        library_size = rounded_counts.sum()
        out[count_name.replace("_counts", "_CPM")] = ((rounded_counts / library_size) * 1000000).round(2)

    return out

In [6]:
## Color palette used for the figures (eight hex colors, the first one is used by the plots below)
ggplot2_colors = [
    "#F8766D",
    "#CD9600",
    "#7CAE00",
    "#00BE67",
    "#00BFC4",
    "#00A9FF",
    "#C77CFF",
    "#FF61CC",
]

In [7]:
## Read the original reference annotation (GTF: tab separated, no header row, "#" comment lines)
original_ref = pd.read_csv(
    "../../references/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    comment="#",
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    low_memory=False,
)

## Leave out the ERCC entries
original_ref = original_ref[~original_ref["chr"].str.startswith("ERCC-")]

## Keep the gene rows only and parse their attributes (gene ids, gene names, gene biotypes)
orig_ref = parse_df_columns(original_ref[original_ref["type"] == "gene"].copy(), is_ref=True)

## Disease relevant genes
disease_relevant_genes = pd.read_csv("../../references/medically_relevant_genes_02-04-2023_UPDATED.tsv", sep="\t")

## Brain disease genes (ids only, and ids with their disease annotation)
brain_disease_gene_ids = pd.read_csv("../../references/brain_disease_genes_only_IDs.tsv", sep="\t")
brain_disease_annotations = pd.read_csv("../../references/brain_disease_genes_with_disease.tsv", sep="\t")

## AD genes
ad_names = pd.read_csv("../../references/AD_gwas_genes.tsv", sep="\t")

## Disease relevant genes together with the chromosome taken from the reference
disease_relevant_genes_annotated = pd.merge(
    disease_relevant_genes,
    orig_ref[["gene_id", "gene_name", "chr"]],
    how="inner",
    on=["gene_id", "gene_name"],
)

## Reference genes whose biotype is protein_coding
protein_coding_ref = orig_ref[orig_ref["gene_biotype"] == "protein_coding"].copy()

## Read the extended annotation, leave out the ERCC entries and parse its attributes
ref = pd.read_csv(
    "../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf",
    sep="\t",
    header=None,
    comment="#",
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    low_memory=False,
)

ref = ref[~ref["chr"].str.startswith("ERCC-")]

ref = parse_df_columns(ref, is_ref=False)

## Transcript rows of the extended annotation
ref_transcripts = ref[ref["type"] == "transcript"].copy()

# - Data pre-processing (Our Data)

In [57]:
## Read the transcript level counts (tab separated, first row is the header) and tidy the column names
df_ours = fix_column_names(
    pd.read_csv(
        "../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt",
        sep="\t",
        header=0,
        low_memory=False,
    ),
    is_gene=False,
)

In [58]:
## Total counts per transcript: row-wise sum over every column whose name contains "count"
df_ours["total_counts"] = df_ours.filter(regex='count').sum(axis=1)

In [59]:
## Add the CPM columns, then discard every column whose name contains "counts"
df_ours = calculate_cpm(df_ours, is_gene=False)

df_ours = df_ours.drop(columns=list(df_ours.filter(regex='counts').columns)).copy()

In [60]:
## Remove the rows whose gene id starts with "ERCC"
df_ours = df_ours[~df_ours["gene_id"].str.startswith("ERCC")].copy()

In [61]:
## Median CPM per transcript over the columns whose name has a digit right before "_CPM"
df_ours["median_CPM"] = df_ours.filter(regex='[0-9]_CPM').median(axis=1)

In [62]:
## Annotate transcripts in the counts matrix with their chromosome (only annotated transcripts are kept)
df_ours = pd.merge(
    df_ours,
    ref_transcripts[["gene_id", "transcript_id", "chr"]],
    how="inner",
    on=["gene_id", "transcript_id"],
)

In [63]:
## Add gene level total CPM and relative transcript abundance columns
df_ours = df_ours.pipe(relative_transcript_abundance)

In [64]:
## Keep only the rows with a median CPM above 1
df_ours = df_ours[df_ours["median_CPM"].gt(1)].copy()

In [65]:
## Set aside the high confidence mitochondrial transcripts
df_ours_mito_high = df_ours[
    df_ours["transcript_id"].isin([
        "BambuTx1845",
        "BambuTx1846",
        "BambuTx1847",
        "BambuTx1848",
        "BambuTx1850",
    ])
].copy()

## Remove every BambuTx transcript located on chromosome MT ...
df_ours = df_ours[
    ~((df_ours["chr"] == "MT") & (df_ours["transcript_id"].str.startswith("BambuTx")))
].copy()

## ... and append the high confidence ones that were set aside
df_ours = pd.concat([df_ours, df_ours_mito_high])

In [68]:
## Keep the relevant columns only and tag the value columns as coming from our data
df_ours = df_ours[
    [
        "transcript_id",
        "gene_id",
        "chr",
        "median_CPM",
        "total_CPM_gene",
        "total_CPM_transcript",
        "relative_abundance_percent",
    ]
].rename(
    columns={
        "median_CPM": "median_CPM_OURS",
        "total_CPM_gene": "total_CPM_gene_OURS",
        "total_CPM_transcript": "total_CPM_transcript_OURS",
        "relative_abundance_percent": "relative_abundance_percent_OURS",
    }
)

In [66]:
## Split the dataframe in three, based on which ids start with "Bambu"

## known: the transcript id does not start with "Bambu"
df_ours_known = df_ours[~df_ours["transcript_id"].str.startswith("Bambu")].copy()

## new_from_known: the transcript id starts with "Bambu" but the gene id does not
df_ours_new_from_known = df_ours[
    df_ours["transcript_id"].str.startswith("Bambu") & ~df_ours["gene_id"].str.startswith("Bambu")
].copy()

## new_from_new: both the transcript id and the gene id start with "Bambu"
df_ours_new_from_new = df_ours[
    df_ours["transcript_id"].str.startswith("Bambu") & df_ours["gene_id"].str.startswith("Bambu")
].copy()

# - Data pre-processing (GTEX BA9 data)

In [29]:
## Read the GTEX transcript level counts (tab separated, first row is the header)
df_gtex = pd.read_csv(
    "../../data/raw/nextflow_GTEX_pipeline_output/bambu_discovery/counts_transcript.txt",
    sep="\t",
    header=0,
    low_memory=False,
)

## Keep the id columns and the BA9 brain samples only
df_gtex = df_gtex[
    [
        "TXNAME",
        "GENEID",
        "GTEX-1192X-0011-R10a-SM-4RXXZ",
        "GTEX-13X6J-0011-R10b-SM-5CEKT",
        "GTEX-14BIL-0011-R10a-SM-5EQV4",
        "GTEX-15DCD-0011-R10b-SM-5S51M",
        "GTEX-QDT8-0011-R10A-SM-2FKJB",
        "GTEX-T5JC-0011-R10A-SM-2TT23",
    ]
]

## Tidy the column names
df_gtex = fix_column_names(df_gtex, is_gene=False)

In [30]:
## Total counts per transcript: row-wise sum over every column whose name contains "count"
df_gtex["total_counts"] = df_gtex.filter(regex='count').sum(axis=1)

In [31]:
## Add the CPM columns, then discard every column whose name contains "counts"
df_gtex = calculate_cpm(df_gtex, is_gene=False)

df_gtex = df_gtex.drop(columns=list(df_gtex.filter(regex='counts').columns)).copy()

In [32]:
## Remove the rows whose gene id starts with "ERCC"
df_gtex = df_gtex[~df_gtex["gene_id"].str.startswith("ERCC")].copy()

In [33]:
## Calculate median CPM across the per-sample CPM columns
## Several GTEX sample names end in a letter (e.g. "...SM-4RXXZ"), so a '[0-9]_CPM' pattern would
## only match the samples ending in a digit; take every "_CPM" column except the total instead
gtex_sample_cpm_columns = [col for col in df_gtex.columns if col.endswith("_CPM") and col != "total_CPM"]
df_gtex["median_CPM"] = df_gtex[gtex_sample_cpm_columns].median(axis=1)

In [34]:
## Annotate transcripts in the counts matrix with their chromosome (only annotated transcripts are kept)
df_gtex = pd.merge(
    df_gtex,
    ref_transcripts[["gene_id", "transcript_id", "chr"]],
    how="inner",
    on=["gene_id", "transcript_id"],
)

In [35]:
## Add gene level total CPM and relative transcript abundance columns
df_gtex = df_gtex.pipe(relative_transcript_abundance)

In [None]:
## Keep the relevant columns only and tag the value columns as coming from the GTEX data
df_gtex = df_gtex[
    [
        "transcript_id",
        "gene_id",
        "chr",
        "median_CPM",
        "total_CPM_gene",
        "total_CPM_transcript",
        "relative_abundance_percent",
    ]
].rename(
    columns={
        "median_CPM": "median_CPM_GTEX",
        "total_CPM_gene": "total_CPM_gene_GTEX",
        "total_CPM_transcript": "total_CPM_transcript_GTEX",
        "relative_abundance_percent": "relative_abundance_percent_GTEX",
    }
)

# - Merge ours and GTEX data

In [None]:
## Pair each of our three transcript groups with the GTEX values for the same transcript;
## the inner join keeps only transcripts present in both datasets
df_known = pd.merge(
    df_ours_known, df_gtex, how="inner", on=["transcript_id", "gene_id", "chr"]
)

df_new_from_known = pd.merge(
    df_ours_new_from_known, df_gtex, how="inner", on=["transcript_id", "gene_id", "chr"]
)

df_new_from_new = pd.merge(
    df_ours_new_from_new, df_gtex, how="inner", on=["transcript_id", "gene_id", "chr"]
)

# - Make figures (Relative Abundance)

In [None]:
## Known transcripts: relative abundance in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_known, x="relative_abundance_percent_GTEX", y="relative_abundance_percent_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX relative abundance")
plt.ylabel("OURS relative abundance")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_known["relative_abundance_percent_GTEX"],
                                             df_known["relative_abundance_percent_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_known["relative_abundance_percent_GTEX"],
                                                               df_known["relative_abundance_percent_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")



In [None]:
## New transcripts from known genes: relative abundance in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_known, x="relative_abundance_percent_GTEX", y="relative_abundance_percent_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX relative abundance")
plt.ylabel("OURS relative abundance")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_known["relative_abundance_percent_GTEX"],
                                             df_new_from_known["relative_abundance_percent_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_known["relative_abundance_percent_GTEX"],
                                                               df_new_from_known["relative_abundance_percent_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")


In [None]:
## New transcripts from new genes: relative abundance in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_new, x="relative_abundance_percent_GTEX", y="relative_abundance_percent_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX relative abundance")
plt.ylabel("OURS relative abundance")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_new["relative_abundance_percent_GTEX"],
                                             df_new_from_new["relative_abundance_percent_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_new["relative_abundance_percent_GTEX"],
                                                               df_new_from_new["relative_abundance_percent_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")


# - Make figures (median CPM)

In [None]:
## Known transcripts: median CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_known, x="median_CPM_GTEX", y="median_CPM_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX median CPM")
plt.ylabel("OURS median CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_known["median_CPM_GTEX"],
                                             df_known["median_CPM_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_known["median_CPM_GTEX"],
                                                               df_known["median_CPM_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")


In [None]:
## New transcripts from known genes: median CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_known, x="median_CPM_GTEX", y="median_CPM_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX median CPM")
plt.ylabel("OURS median CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_known["median_CPM_GTEX"],
                                             df_new_from_known["median_CPM_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_known["median_CPM_GTEX"],
                                                               df_new_from_known["median_CPM_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")


In [None]:
## New transcripts from new genes: median CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_new, x="median_CPM_GTEX", y="median_CPM_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX median CPM")
plt.ylabel("OURS median CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_new["median_CPM_GTEX"],
                                             df_new_from_new["median_CPM_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_new["median_CPM_GTEX"],
                                                               df_new_from_new["median_CPM_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")


# - Make figures (total CPM)

In [None]:
## Known transcripts: total transcript CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_known, x="total_CPM_transcript_GTEX", y="total_CPM_transcript_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX total CPM")
plt.ylabel("OURS total CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_known["total_CPM_transcript_GTEX"],
                                             df_known["total_CPM_transcript_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_known["total_CPM_transcript_GTEX"],
                                                               df_known["total_CPM_transcript_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")

In [None]:
## New transcripts from known genes: total transcript CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_known, x="total_CPM_transcript_GTEX", y="total_CPM_transcript_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX total CPM")
plt.ylabel("OURS total CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_known["total_CPM_transcript_GTEX"],
                                             df_new_from_known["total_CPM_transcript_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_known["total_CPM_transcript_GTEX"],
                                                               df_new_from_known["total_CPM_transcript_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")

In [None]:
## New transcripts from new genes: total transcript CPM in GTEX (x axis) against our data (y axis)
sns.regplot(data=df_new_from_new, x="total_CPM_transcript_GTEX", y="total_CPM_transcript_OURS",
            color=ggplot2_colors[0], scatter_kws={"alpha": 0.5, "s": 5}, line_kws={"linewidth": 1})

## Axis labels follow the columns plotted above: GTEX on x, OURS on y
plt.xlabel("GTEX total CPM")
plt.ylabel("OURS total CPM")

## Rank correlation and linear regression (scipy.stats), with GTEX as x and OURS as y
spearman_coeff, spearman_p = stats.spearmanr(df_new_from_new["total_CPM_transcript_GTEX"],
                                             df_new_from_new["total_CPM_transcript_OURS"])

slope, intercept, r_value, p_value, std_err = stats.linregress(df_new_from_new["total_CPM_transcript_GTEX"],
                                                               df_new_from_new["total_CPM_transcript_OURS"])

print("Spearman coefficient:", str(round(spearman_coeff, 2)))
print("Spearman R-squared:", str(round((spearman_coeff * spearman_coeff), 2)))
print("Spearman p-value:", str(spearman_p))

print("\nPearson coefficient:", str(round(r_value, 2)))
print("Pearson R-squared:", str(round(r_value*r_value,2)))
print("Pearson Slope:", str(round(slope,2)))
print("Pearson Intercept:", str(round(intercept,2)))
print("Pearson P-value:", str(p_value))
print("Pearson Standard Error:", str(round(std_err,4)), "\n")

## NOTE(review): the output file name below looks copied from another figure; rename before enabling
#plt.savefig('../../figures/paper_figures/supplement/ercc_correlation_by_mean_cpm.pdf',
#            dpi=600, transparent=True, bbox_inches="tight")