# Table of contents

### - Import libraries and define functions + Initial setup

### - Get factoids

# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import csv


## Display all rows and the full (untruncated) contents of each column of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
def relative_transcript_abundance(df):
    '''
    name: relative_transcript_abundance

    purpose: express the CPM of every transcript as a percentage of the summed CPM of its gene

    input: a dataframe with a gene_id column identifying the transcript gene of origin and a total_CPM
    column with the transcript level CPM. The input dataframe itself is not modified.

    output: a new dataframe where total_CPM has been renamed to total_CPM_transcript (it becomes the last
    column) and the columns total_CPM_gene and relative_abundance_percent have been added
    '''

    ## Sum the transcript level CPM within each gene, giving one total per gene_id
    gene_totals = df.groupby("gene_id")["total_CPM"].sum().rename("total_CPM_gene").to_frame()

    ## Attach the gene level total to every transcript row belonging to that gene
    combined = df.merge(gene_totals, how="inner", on="gene_id")

    ## Share of the gene's CPM contributed by each transcript, as a percentage
    transcript_cpm = combined["total_CPM"]
    gene_cpm = combined["total_CPM_gene"]
    combined["relative_abundance_percent"] = (transcript_cpm / gene_cpm) * 100

    ## Re-add total_CPM as the last column under its transcript level name
    combined["total_CPM_transcript"] = combined.pop("total_CPM")

    return combined

In [3]:
def fix_column_names(df, is_gene=False):
    '''
    function name: fix_column_names

    purpose: Fixing the column names, making them smaller, informative, and consistent

    input: The raw counts dataframe for either genes or transcripts
           - is_gene=True: every column is a count column and the gene ids are the index
           - is_gene=False: the first two columns are the transcript id and the gene id (in that order)
             and every remaining column is a count column

    output: Dataframe whose id columns are named "transcript_id"/"gene_id" and whose count columns are
    named "<text before '_mapped'>_counts". For gene counts a new dataframe is returned and the input is
    left untouched; for transcript counts the input is renamed in place and returned.
    '''

    ## Check if this is a gene counts object
    if is_gene:

        ## Every incoming column is a count column; record them before gene_id is added
        count_columns = df.columns.tolist()
        id_columns = ["gene_id"]

        ## Work on a copy so the caller's dataframe does not silently gain a "gene_id" column
        df = df.copy()

        ## gene_id comes in as index for gene counts data, make it into the first column instead
        df.insert(0, "gene_id", df.index)
        df.reset_index(inplace=True, drop=True)

    ## If it is a transcript dataset
    else:
        ## The first two columns hold the ids, everything after them is a count column
        count_columns = df.columns[2:].tolist()
        id_columns = ["transcript_id", "gene_id"]

    ## Keep the sample label that precedes "_mapped" and mark the column as counts
    df.columns = id_columns + [col.split("_mapped")[0] + "_counts" for col in count_columns]

    return df

In [4]:
def _gtf_attribute(attributes, key):
    '''Return, per row, the text that follows `key "` up to the next double quote (NaN if the key is absent).'''
    return attributes.str.extract(key + ' "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

    input: dataframe containing "raw" gtf/gff with the columns "other" (attribute string), "dot_1" and "dot_2"
           - is_ref=True extracts gene_name and gene_biotype, is_ref=False extracts transcript_id and exon_number
           - is_transcript (only used when is_ref=True) also extracts transcript_id and transcript_biotype
           - is_prot (only used when is_ref=True and is_transcript=True) also extracts protein_id, ccds_id
             and exon_number

    output: the same dataframe, modified in place, with "other", "dot_1" and "dot_2" removed and the parsed
    columns added. gene_id is always extracted. An attribute that is missing from a row, or from every row,
    yields NaN instead of raising.
    '''

    attributes = df["other"]

    ## Get gene ids: the value of the first attribute in the string (text after the last space/quote
    ## of everything that precedes the first '";')
    first_attribute = attributes.str.split('";', n=1).str[0]
    df["gene_id"] = first_attribute.str.extract(r'([^ "]*$)', expand=False)

    ## Decide which quoted attributes to pull out; the order here is the order of the new columns
    if is_ref:
        wanted = ["gene_name", "gene_biotype"]

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            wanted += ["transcript_id", "transcript_biotype"]

            ## If is prot get protein id, ccds id and exon number
            if is_prot:
                wanted += ["protein_id", "ccds_id", "exon_number"]
    else:
        wanted = ["transcript_id", "exon_number"]

    for key in wanted:
        df[key] = _gtf_attribute(attributes, key)

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every missing value to NaN (np.nan; the np.NaN alias no longer exists in NumPy 2).
    ## Columns without missing values are skipped so their dtype is never touched.
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [5]:
def calculate_cpm(df, is_gene=False):
    '''
    function name: calculate_cpm

    purpose: Calculate counts per million (CPM) for every count column of the dataframe

    input: Counts dataset. Gene level data has one leading id column, transcript level data has two;
    every column after the id columns is treated as a count column.

    output: The same dataframe, modified in place, with the count columns rounded to two decimals and one
    CPM column appended per count column ("_counts" in the column name replaced by "_CPM")
    '''

    ## Number of leading id columns to skip: gene_id only, or transcript_id plus gene_id
    n_id_columns = 1 if is_gene else 2

    ## Take the list of count columns up front, because CPM columns get appended while looping
    for counts_name in df.columns[n_id_columns:].tolist():

        ## Round the counts first; the CPM below is computed from the rounded values
        df[counts_name] = df[counts_name].round(2)
        rounded = df[counts_name]

        ## Each value as a fraction of the column total, scaled to one million
        per_million = (rounded / rounded.sum()) * 1000000
        df[counts_name.replace("_counts", "_CPM")] = per_million.round(2)

    return df

In [6]:
## Define ggplot2 colors as a list of eight hex color codes
ggplot2_colors = ["#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#C77CFF", "#FF61CC"]

In [7]:
## Open original reference (tab separated, no header row, lines starting with "#" are skipped)
original_ref = pd.read_csv("../../references/Homo_sapiens.GRCh38.107_ERCC.gtf", header=None, delimiter="\t", low_memory=False, 
                       names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"], comment="#")

## Remove every entry whose chromosome name starts with "ERCC-"
original_ref = original_ref.loc[~original_ref["chr"].str.startswith("ERCC-")]

## Parse through reference to get gene names and ids (gene level entries only)
orig_ref = original_ref.loc[original_ref["type"]=="gene"].copy()
orig_ref = parse_df_columns(orig_ref, is_ref=True)

## Import disease relevant genes
disease_relevant_genes = pd.read_csv("../../references/medically_relevant_genes_02-04-2023_UPDATED.tsv", sep="\t")

## Brain disease genes
brain_disease_gene_ids = pd.read_csv("../../references/brain_disease_genes_only_IDs.tsv", sep="\t")
brain_disease_annotations = pd.read_csv("../../references/brain_disease_genes_with_disease.tsv", sep="\t")

## Import AD Genes
ad_names = pd.read_csv("../../references/AD_gwas_genes.tsv", sep="\t")

## Create disease relevant list including chromosome
## (inner join: only genes whose gene_id and gene_name both match the reference are kept)
disease_relevant_genes_annotated = disease_relevant_genes.merge(orig_ref[["gene_id", "gene_name", "chr"]], 
                                                               how="inner", on=["gene_id", "gene_name"])

## Create list of protein coding genes
protein_coding_ref = orig_ref.loc[orig_ref["gene_biotype"] == "protein_coding"].copy()

## Import and parse through extended annotations
ref = pd.read_csv("../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Remove every entry whose chromosome name starts with "ERCC-"
ref = ref.loc[~ref["chr"].str.startswith("ERCC-")]

## is_ref=False: only gene_id, transcript_id and exon_number are extracted
ref = parse_df_columns(ref, is_ref=False)

## Keep only the transcript level entries of the extended annotation
ref_transcripts = ref.loc[ref["type"] == "transcript"].copy()

In [8]:
## Parse through original GTF 107 Ensembl reference to get transcript biotypes
## (transcript level entries only; is_transcript=True adds transcript_id and transcript_biotype)
orig_ref_types = original_ref.loc[original_ref["type"]=="transcript"].copy()
orig_ref_types = parse_df_columns(orig_ref_types, is_ref=True, is_transcript=True)

In [9]:
## Make reference only including protein coding transcripts with a defined CDS
## (CDS entries only; is_prot=True adds protein_id, ccds_id and exon_number)
orig_ref_cds = original_ref.loc[original_ref["type"]=="CDS"].copy()
orig_ref_cds = parse_df_columns(orig_ref_cds, is_ref=True, is_transcript=True, is_prot=True)

## Keep only CDS entries whose transcript biotype is protein_coding
orig_ref_cds = orig_ref_cds.loc[orig_ref_cds["transcript_biotype"] == "protein_coding"].copy()

In [10]:
## Create unique identifier for protein id
## NOTE(review): this empty dataframe is overwritten by the groupby/transform below and looks unnecessary
unique_protein_id = pd.DataFrame()
## Cast to str so the values of each column can be concatenated with '-'.join
string_orig_ref_cds = orig_ref_cds[['protein_id', 'start', 'end']].copy().astype(str)

## Within each protein_id group, join all values of every column with "-" and give each row of the
## group that joined string (the original row index is kept)
unique_protein_id = string_orig_ref_cds.groupby(['protein_id'])[["protein_id", "start", "end"]].transform(lambda x: '-'.join(x))

## protein_id was joined with itself; keep the piece before the first "-" to recover the id
## (assumes protein ids contain no "-" -- TODO confirm)
unique_protein_id["protein_id"] = unique_protein_id["protein_id"].str.split("-", expand=True)[0]
## The identifier text is built only from the joined start and end coordinates of the protein's CDS entries
unique_protein_id["unique_identifier"] = "Start coordinates: " + unique_protein_id["start"].copy() + "   End coordinates:" + unique_protein_id["end"].copy()
unique_protein_id = unique_protein_id[["protein_id", "unique_identifier"]].copy()
## Keep the first row of each distinct (protein_id, unique_identifier) pair, then look up its transcript_id by row index
unique_protein_id.drop_duplicates(inplace=True)
unique_protein_id = unique_protein_id.merge(orig_ref_cds["transcript_id"], left_index=True, right_index=True)

## Attach the identifier to the CDS table and use it as the new protein_id
orig_ref_cds = orig_ref_cds.merge(unique_protein_id, on=["protein_id", "transcript_id"], how="inner")
orig_ref_cds["protein_id"] = orig_ref_cds["unique_identifier"].copy()
orig_ref_cds.drop(columns="unique_identifier", inplace=True)

In [11]:
## Only keep entries that have a protein_id

ref_cds = orig_ref_cds.loc[~orig_ref_cds["protein_id"].isna()].copy()

## Reduce to one row per distinct (transcript_id, protein_id) pair
ref_cds = ref_cds[["transcript_id", "protein_id"]].copy().drop_duplicates()

In [12]:
## Preview the first rows of the transcript level extended annotation
ref_transcripts.head()

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number
0,1,Bambu,transcript,11869.0,14409.0,+,ENSG00000223972,ENST00000456328,
2,1,Bambu,transcript,12010.0,13670.0,+,ENSG00000223972,ENST00000450305,
11,1,Bambu,transcript,14404.0,15040.0,-,ENSG00000227232,BambuTx1,
12,1,Bambu,transcript,14404.0,15943.0,-,ENSG00000227232,BambuTx2,
13,1,Bambu,transcript,14404.0,29570.0,-,ENSG00000227232,ENST00000488147,


In [13]:
## Keep transcripts whose gene_id belongs to a protein coding gene in the original reference
ref_prot_gene_transcripts = ref_transcripts.loc[ref_transcripts["gene_id"].isin(protein_coding_ref["gene_id"])].copy()

## Number of annotated transcripts per protein coding gene
ref_prot_gene_transcripts_num_isoforms = ref_prot_gene_transcripts["gene_id"].value_counts()

In [14]:
## Summary statistics of the number of transcripts per gene
ref_prot_gene_transcripts_num_isoforms.describe()

count    20023.000000
mean         8.479149
std          9.255858
min          1.000000
25%          3.000000
50%          6.000000
75%         11.000000
max        192.000000
Name: gene_id, dtype: float64

In [15]:
## Add the gene_id of each transcript to the transcript/protein table
ref_cds_gene = ref_cds.merge(ref_transcripts[["gene_id", "transcript_id"]], on="transcript_id", how="inner")

In [16]:
## Keep only the first row for every distinct protein_id
ref_cds_gene.drop_duplicates(subset="protein_id", keep="first", inplace=True)

In [17]:
## Number of distinct protein_id values per gene
ref_cds_gene_number = ref_cds_gene["gene_id"].value_counts()

In [18]:
## Summary statistics of the number of distinct protein_id values per gene
ref_cds_gene_number.describe()

count    19673.000000
mean         3.854928
std          3.758709
min          1.000000
25%          1.000000
50%          3.000000
75%          5.000000
max         79.000000
Name: gene_id, dtype: float64

In [19]:
## Restrict the protein coding gene transcripts to the genes listed in the AD gene table
ref_prot_gene_transcripts_ad = ref_prot_gene_transcripts.merge(ad_names, on="gene_id", how="inner")

In [20]:
## Number of annotated transcripts per AD gene
ref_prot_gene_transcripts_ad_number = ref_prot_gene_transcripts_ad["gene_id"].value_counts()

In [21]:
## Summary statistics of the number of transcripts per AD gene
ref_prot_gene_transcripts_ad_number.describe()

count    80.000000
mean     13.750000
std      14.146163
min       1.000000
25%       5.000000
50%      10.000000
75%      17.250000
max      88.000000
Name: gene_id, dtype: float64

In [22]:
## Restrict the per-gene protein table to the genes listed in the AD gene table
ref_cds_gene_ad = ref_cds_gene.merge(ad_names, on="gene_id", how="inner")

In [23]:
## Number of distinct protein_id values per AD gene
ref_cds_gene_ad_number = ref_cds_gene_ad["gene_id"].value_counts()

In [24]:
## Summary statistics of the number of distinct protein_id values per AD gene
ref_cds_gene_ad_number.describe()

count    80.000000
mean      5.287500
std       4.137491
min       1.000000
25%       2.750000
50%       4.000000
75%       7.000000
max      24.000000
Name: gene_id, dtype: float64

In [25]:
## Number of rows (transcripts) and columns in the protein coding gene transcript table
ref_prot_gene_transcripts.shape

(169778, 9)

In [26]:
## Exon level entries of the original reference
orig_ref_exons = original_ref.loc[original_ref["type"] == "exon"].copy()

In [27]:
## Parse a copy (parse_df_columns modifies its input in place) to get gene and transcript ids per exon
orig_ref_exons_parse = parse_df_columns(orig_ref_exons.copy(), is_ref=True, is_transcript=True)

In [28]:
## Keep exons whose transcript_id is among the protein coding gene transcripts
orig_ref_exons_parse_prot = orig_ref_exons_parse.loc[orig_ref_exons_parse["transcript_id"].isin(ref_prot_gene_transcripts["transcript_id"])].copy()

In [29]:
## Number of exons per transcript
orig_ref_exons_parse_prot_num = orig_ref_exons_parse_prot["transcript_id"].value_counts()

In [30]:
## Summary statistics of the number of exons per transcript
orig_ref_exons_parse_prot_num.describe()

count    169101.000000
mean          8.068988
std           7.897253
min           1.000000
25%           3.000000
50%           5.000000
75%          10.000000
max         363.000000
Name: transcript_id, dtype: float64

In [31]:
## Transcripts with the most exons (value_counts sorts in descending order)
orig_ref_exons_parse_prot_num.head()

ENST00000589042    363
ENST00000591111    313
ENST00000342992    312
ENST00000460472    191
ENST00000359218    191
Name: transcript_id, dtype: int64

## Get number of counts for a median CPM = 1

In [10]:
## Load the transcript level counts (tab separated)
df = pd.read_csv("../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt", sep="\t")

In [11]:
## Rename columns to transcript_id, gene_id and <sample>_counts
df = fix_column_names(df, is_gene=False)

In [12]:
## Calculate total counts (row-wise sum over every column whose name contains "count")
df["total_counts"] = df[df.filter(regex='count').columns].sum(axis=1)

In [13]:
## Add a CPM column for every count column, including total_CPM derived from total_counts
df = calculate_cpm(df, is_gene=False)

In [14]:
## Keep transcripts whose total CPM (already rounded to two decimals) is exactly 1
## NOTE(review): the section heading mentions a median CPM of 1 but the filter uses total_CPM -- confirm intent
df_cpm_1 = df.loc[df["total_CPM"] == 1].copy()

In [16]:
## Mean over the count columns whose name has a digit directly before "_counts"
## (this pattern does not match total_counts)
df_cpm_1["average_counts"] = df_cpm_1[df_cpm_1.filter(regex='[0-9]_counts').columns].mean(axis=1)

In [19]:
## Average of those per-transcript means across all transcripts with total CPM = 1
df_cpm_1["average_counts"].mean()

24.48487654320988