In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv


## Lift the pandas display limits: no cap on displayed rows or on column width
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
'''
name: relative_transcript_abundance

purpose: calculate relative transcript abundance

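input: a dataframe with a gene_id column identifying each transcript's gene of origin, one or more
<sample>_CPM columns and, optionally, <sample>_counts columns.

output: a copy of the dataframe with <sample>_total_gene_CPM, <sample>_relative_abundance (percent of the
gene's total CPM) and <sample>_total_gene_counts columns added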
'''



def relative_transcript_abundance(df):
    """
    Add per-gene totals and relative transcript abundance columns to an expression table.

    For every column whose name matches the regex ``[0-9]_CPM`` (``<prefix>`` is the
    column name up to ``_CP``) two columns are added:

    * ``<prefix>_total_gene_CPM``     -- sum of that column over all rows sharing the row's ``gene_id``
    * ``<prefix>_relative_abundance`` -- the row's value divided by that sum, times 100

    For every column whose name matches ``[0-9]_count`` (``<prefix>`` is the column
    name up to ``_count``) one column is added:

    * ``<prefix>_total_gene_counts``  -- sum of that column over all rows sharing the row's ``gene_id``

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``gene_id`` column plus the CPM / count columns described above.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns above appended. Every missing value in the
        frame is replaced by 0 (this includes the 0/0 relative abundance of genes
        whose total CPM is 0). Rows whose ``gene_id`` has no entry in the per-gene
        totals are removed by the inner merge.
    """

    dff = df.copy()

    ## Start from the copy so the function still works when no CPM column is present
    ## (previously the result variable was only created inside the CPM loop)
    merged_dff = dff

    for col in dff.filter(regex='[0-9]_CPM').columns:

        prefix = col.split("_CP")[0]
        col_gene_name = prefix + "_total_gene_CPM"
        col_relative_abundance = prefix + "_relative_abundance"

        ## Total CPM per gene, indexed by gene_id
        gene_totals = dff[["gene_id", col]].groupby("gene_id").sum().rename(columns={col: col_gene_name})

        merged_dff = pd.merge(merged_dff, gene_totals, how='inner', on="gene_id")
        merged_dff[col_relative_abundance] = ((merged_dff[col] / merged_dff[col_gene_name]) * 100)

    ## The column list is evaluated once, before the loop starts adding new columns
    for col_count in merged_dff.filter(regex='[0-9]_count').columns:

        col_gene_name = col_count.split("_count")[0] + "_total_gene_counts"

        ## Total counts per gene, indexed by gene_id
        gene_totals = (merged_dff[["gene_id", col_count]].groupby("gene_id").sum()
                       .rename(columns={col_count: col_gene_name}))

        merged_dff = pd.merge(merged_dff, gene_totals, how='inner', on="gene_id")

    ## Replace all remaining missing values (e.g. 0/0 relative abundance) with 0
    merged_dff = merged_dff.fillna(value=0)

    return merged_dff

In [3]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    """
    Return a copy of a counts table with short, consistent column names.

    The leading identifier columns are renamed by position and every remaining
    column is treated as a per-sample count column: its name is cut at the first
    ``"_mapped"`` and suffixed with ``"_counts"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Counts table. For gene tables the first column holds the gene identifier;
        for transcript tables the first two columns hold the transcript and gene
        identifiers, in that order.
    is_gene : bool, default False
        True for a gene-level table (one id column, renamed ``gene_id``),
        False for a transcript-level table (two id columns, renamed
        ``transcript_id`` and ``gene_id``).

    Returns
    -------
    pandas.DataFrame
        Renamed copy; the input is not modified. For gene tables the row index is
        also reset to a fresh default index.
    """

    dff = df.copy()

    if is_gene:
        ## The id column is renamed by position. The earlier copy-then-drop approach
        ## lost the id column whenever it was already called "gene_id" and labelled
        ## the last count column "gene_id" instead.
        dff = dff.reset_index(drop=True)
        list_new_names = ["gene_id"]

    else:
        list_new_names = ["transcript_id", "gene_id"]

    ## Everything after the id columns is a count column
    count_columns = dff.columns[len(list_new_names):].tolist()
    list_new_names = list_new_names + [col.split("_mapped")[0] + "_counts" for col in count_columns]

    ## Rename columns
    dff.columns = list_new_names

    return dff

In [4]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True, is_transcript=False, is_exon=False):
    """
    Split the aggregate GTF attribute column ("other") into separate columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw GTF rows with at least the columns ``other``, ``dot_1`` and ``dot_2``.
        The input frame is not modified.
    is_ref : bool, default True
        True: extract ``gene_id``, ``gene_name`` and ``gene_biotype`` (plus the
        transcript/exon fields selected by the flags below).
        False: extract ``gene_id``, ``transcript_id`` and ``exon_number`` only.
    is_transcript : bool, default False
        With ``is_ref``: also extract ``transcript_id`` and ``transcript_biotype``.
    is_exon : bool, default False
        With ``is_ref``: also extract ``transcript_id``, ``transcript_biotype``
        and ``exon_number``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extracted columns appended and the ``other``,
        ``dot_1`` and ``dot_2`` columns removed. An attribute that is absent from
        a row is NaN in that row.
    """

    dff = df.copy()

    def attribute_value(key, terminator='"'):
        ## Text following `key "` up to the first `terminator` (or the end of the string).
        ## str.extract yields NaN for rows without the key and, unlike indexing the
        ## result of split(expand=True), does not fail when no row has the key at all.
        pattern = key + ' "(.*?)(?:' + terminator + '|$)'
        return dff["other"].str.extract(pattern, expand=False)

    ## Get gene ids: last run of non-space, non-quote characters of the first attribute
    first_attribute = dff["other"].str.split('";').str[0]
    dff["gene_id"] = first_attribute.str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names (value ends at the closing '";')
        dff["gene_name"] = attribute_value("gene_name", terminator='";')

        ## Get gene biotype
        dff["gene_biotype"] = attribute_value("gene_biotype")

        ## Transcript level fields are shared by transcript and exon rows
        if is_transcript or is_exon:
            dff["transcript_id"] = attribute_value("transcript_id")
            dff["transcript_biotype"] = attribute_value("transcript_biotype")

        if is_exon:
            dff["exon_number"] = attribute_value("exon_number")

    else:

        ## Get transcript ids
        dff["transcript_id"] = attribute_value("transcript_id")

        ## Get exon number
        dff["exon_number"] = attribute_value("exon_number")

    ## Drop the aggregate column and the two placeholder columns
    dff = dff.drop(columns=["other", "dot_1", "dot_2"])

    ## Normalise every missing value to NaN (np.nan: the np.NaN alias no longer exists in NumPy 2)
    for col in dff.columns:
        dff.loc[dff[col].isnull(), col] = np.nan

    return dff

In [5]:
'''
function name: calculate_cpm

purpose: Calculate CPM (counts per million) for each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):
    """
    Append one CPM column for every count column of a counts table.

    Parameters
    ----------
    df : pandas.DataFrame
        Counts table; every column after the leading id column(s) is treated as a
        count column. The input frame is not modified.
    is_gene : bool, default False
        True: one leading id column (gene table).
        False: two leading id columns (transcript table).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with, for each count column, a column named like the count
        column with ``"_counts"`` replaced by ``"_CPM"`` that holds
        ``value / column_sum * 1000000``.
    """

    table = df.copy()

    ## Gene tables start with one id column, transcript tables with two
    n_id_columns = 1 if is_gene else 2

    ## The list of count columns is fixed before any CPM column is appended
    for count_col in table.columns[n_id_columns:].tolist():

        column_total = table[count_col].sum()
        table[count_col.replace("_counts", "_CPM")] = ((table[count_col] / column_total) * 1000000)

    return table

In [6]:
## Import Data

## Column names given to the nine tab-separated GTF fields
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

## Original reference annotation ("#" is treated as the comment character)
original_ref = pd.read_csv(
    "../../references/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t", header=None, names=gtf_columns, comment="#", low_memory=False,
)

## Bambu reference with novel and annotated transcripts
bambu_ref = pd.read_csv(
    "../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf",
    sep="\t", header=None, names=gtf_columns, low_memory=False,
)

## New transcript events
df_events = pd.read_csv("../../data/processed/paper_figures/novel_events.tsv", sep="\t")

## Bambu counts matrix (first line holds the column names)
df = pd.read_csv(
    "../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt",
    sep="\t", header=0, low_memory=False,
)

In [7]:
## Get all transcript IDs from ENSEMBL 94 GTF

## Read the GTF, keep only the "transcript" rows and parse their attributes
df_ensembl_94 = pd.read_csv(
    "../../references/Homo_sapiens.GRCh38.94.gtf",
    sep="\t", header=None, low_memory=False,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
)
df_ensembl_94 = parse_df_columns(
    df_ensembl_94.loc[df_ensembl_94["type"] == "transcript"].copy(),
    is_ref=True,
    is_transcript=True,
)

## Independent copy of the transcript_id column, used later for membership tests
ensembl_94_transcript_ids = df_ensembl_94["transcript_id"].copy()

In [8]:
## Parse through original reference to get all needed info

## Transcript rows: parse attributes, drop "source" and add an empty exon_number
## column so the column exists in both tables before they are concatenated
is_transcript_row = original_ref["type"] == "transcript"
orig_ref_transcripts = parse_df_columns(
    original_ref.loc[is_transcript_row].copy(), is_ref=True, is_transcript=True
).drop(columns=["source"])
orig_ref_transcripts["exon_number"] = np.nan

## Exon and CDS rows: parse attributes (including exon_number) and drop "source"
is_exon_or_cds_row = original_ref["type"].isin(["exon", "CDS"])
orig_ref_exons = parse_df_columns(
    original_ref.loc[is_exon_or_cds_row].copy(), is_ref=True, is_exon=True
).drop(columns=["source"])

## Stack both tables; "type" is the only key sorted in descending order
orig_ref = pd.concat([orig_ref_transcripts, orig_ref_exons])
orig_ref = orig_ref.sort_values(by=["chr", "start", "type", "end"], ascending=[True, True, False, True])

## Flag whether each row's transcript_id is present in the ENSEMBL 94 transcript list
in_ensembl_94 = orig_ref["transcript_id"].isin(ensembl_94_transcript_ids)
orig_ref["annotation_status"] = "NOT annotated in ENSEMBL 94 (2019)"
orig_ref.loc[in_ensembl_94, "annotation_status"] = "Annotated in ENSEMBL 94 (2019)"

## Every row of the original reference gets the same discovery category
orig_ref["discovery_category"] = "Annotated"

In [9]:
## Cleanup df_events: keep only the transcript name and its class description.
## .copy() makes this an independent frame, so the edit below does not raise
## SettingWithCopyWarning.
df_events = df_events[["TXNAME", "txClassDescription"]].copy()

## Map each txClassDescription value to the category reported downstream.
## Values that are not listed here are left unchanged.
tx_class_labels = {
    ## New gene body
    "newGene-spliced": "New Gene Body",

    ## Known exons/junctions in a new combination, new exons, all-new transcripts
    "newWithin": "New combination of known exons/junctions",
    "newFirstJunction:newFirstExon": "New exon",
    "newLastJunction:newLastExon": "New exon",
    "allNew": "All new exons & junctions",

    ## New junction only
    "newFirstJunction": "New junction",
    "newLastJunction:newJunction": "New junction",
    "newFirstJunction:newJunction": "New junction",
    "newLastJunction": "New junction",
    "newJunction": "New junction",
    "newLastJunction:newFirstJunction:newJunction": "New junction",

    ## New exon together with a new junction
    "newLastJunction:newJunction:newLastExon": "New exon & new junction",
    "newFirstJunction:newJunction:newFirstExon": "New exon & new junction",
    "newLastJunction:newFirstJunction:newJunction:newFirstExon:newLastExon": "New exon & new junction",
    "newLastJunction:newFirstJunction:newJunction:newFirstExon": "New exon & new junction",
    "New first exon & new junction": "New exon & new junction",
    "newLastJunction:newFirstJunction:newJunction:newLastExon": "New exon & new junction",
}
df_events["txClassDescription"] = df_events["txClassDescription"].replace(tx_class_labels)

## Change column names
df_events.columns = ["transcript_id", "discovery_category"]

In [10]:
## Parse through bambu reference
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Only keep rows whose transcript_id starts with "BambuTx".
## na=False counts rows without a transcript_id as non-matching (a NaN in the mask
## would make .loc raise); .copy() makes the result an independent frame so the
## column assignments below do not raise SettingWithCopyWarning.
bambu_ref = bambu_ref.loc[bambu_ref["transcript_id"].str.startswith("BambuTx", na=False)].copy()

## Label the kept rows; discovery_category comes from the merge with df_events,
## which (inner join) keeps only transcripts that are present in df_events
bambu_ref["annotation_status"] = "Unnanotated (newly discovered)"
bambu_ref["transcript_biotype"] = "Unnanotated (newly discovered)"
bambu_ref = bambu_ref.merge(df_events, on="transcript_id", how="inner").drop_duplicates()

## Get gene names from the "gene" rows of the original reference
gene_names = original_ref.loc[original_ref["type"]=="gene"].copy()
gene_names = parse_df_columns(gene_names, is_ref=True)
gene_names = gene_names[["gene_id", "gene_name"]]

## Add gene_name (left join: rows without a matching gene_id are kept) and drop "source"
bambu_ref = bambu_ref.merge(gene_names, on="gene_id", how="left").drop_duplicates().drop(columns="source")

In [11]:
## Final reference: original-reference rows stacked with the bambu rows,
## sorted by chr, start, type (descending) and end
final_ref = pd.concat([orig_ref, bambu_ref])
final_ref = final_ref.sort_values(by=["chr", "start", "type", "end"], ascending=[True, True, False, True])

In [12]:
### Fix transcript biotypes

## Create cds_not_defined classification: processed_transcript rows of protein_coding genes
is_coding_gene = final_ref["gene_biotype"] == "protein_coding"
is_processed_transcript = final_ref["transcript_biotype"] == "processed_transcript"
final_ref.loc[(is_coding_gene & is_processed_transcript), "transcript_biotype"] = "cds_not_defined"

## Create other classification: every biotype outside this list becomes "other"
kept_biotypes = ["protein_coding", "nonsense_mediated_decay", "lncRNA", "retained_intron",
                 "cds_not_defined", "processed_transcript", "Unnanotated (newly discovered)"]
final_ref.loc[~final_ref["transcript_biotype"].isin(kept_biotypes), "transcript_biotype"] = "other"

## Drop gene_biotype column
final_ref = final_ref.drop(columns="gene_biotype")

## Only keep CDS regions for protein_coding transcripts
is_cds_row = final_ref["type"] == "CDS"
final_ref_cds = final_ref.loc[is_cds_row].copy()
final_ref_cds_prot = final_ref_cds.loc[final_ref_cds["transcript_biotype"] == "protein_coding"].copy()
final_ref_2 = final_ref.loc[~is_cds_row].copy()

## Non-CDS rows first, then the retained CDS rows, re-sorted like the other references
final_final_ref = pd.concat([final_ref_2, final_ref_cds_prot])
final_final_ref = final_final_ref.sort_values(by=["chr", "start", "type", "end"],
                                              ascending=[True, True, False, True])

In [13]:
## Save the final annotation as a tab-separated file, without the index column
annotation_path = "../../data/processed/other/annotation_for_maddy_r_shiny_app/annotation_r_shiny.tsv"
final_final_ref.to_csv(annotation_path, sep="\t", index=False)

In [14]:
## Bambu counts matrix (tab-separated, first line holds the column names)
df = pd.read_csv(
    "../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt",
    sep="\t", header=0, low_memory=False,
)

In [15]:
## Fix expression matrix: tidy the column names, then append the CPM columns
df = calculate_cpm(fix_column_names(df, is_gene=False), is_gene=False)

In [16]:
## Add per-gene totals and relative abundance columns to the expression matrix
df = relative_transcript_abundance(df=df)

In [17]:
## Save the expression matrix as a tab-separated file, without the index column
expression_matrix_path = "../../data/processed/other/annotation_for_maddy_r_shiny_app/expression_matrix_r_shiny.tsv"
df.to_csv(expression_matrix_path, sep="\t", index=False)

# Create gene-level file with median counts and median CPM

In [18]:
## Import transcript level counts and tidy the column names
df = fix_column_names(
    pd.read_csv(
        "../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt",
        sep="\t", header=0, low_memory=False,
    ),
    is_gene=False,
)

## Sum the counts of all transcripts of each gene; gene_id becomes a regular column again
gene_level_sums = df.drop(columns="transcript_id").groupby("gene_id").sum()
df_gene = gene_level_sums.reset_index()

In [19]:
## Add CPM columns, then replace the per-sample count columns with their row-wise median
df_gene = calculate_cpm(df_gene, is_gene=True)

## Columns whose name contains "counts" (collected before median_counts is added)
counts_columns = list(df_gene.filter(regex='counts').columns)
df_gene["median_counts"] = df_gene[counts_columns].median(axis=1)

df_gene = df_gene.drop(columns=counts_columns)

In [20]:
## Replace the per-sample CPM columns with their row-wise median (median_cpm)
cpm_cols = list(df_gene.filter(regex='CPM').columns)
df_gene["median_cpm"] = df_gene[cpm_cols].median(axis=1)
df_gene = df_gene.drop(columns=cpm_cols)

In [21]:
## Save the gene-level table as a tab-separated file, without the index column
gene_matrix_path = "../../data/processed/other/annotation_for_maddy_r_shiny_app/expression_matrix_r_shiny_GENE.tsv"
df_gene.to_csv(gene_matrix_path, sep="\t", index=False)

# Create annotations to check for protein sequences for Maddy

In [22]:
## Column names given to the nine tab-separated GTF fields
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

## Open original reference ("#" is treated as the comment character)
original_ref = pd.read_csv(
    "../../references/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t", header=None, names=gtf_columns, comment="#", low_memory=False,
)

## Bambu reference with novel and annotated transcripts
bambu_ref = pd.read_csv(
    "../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf",
    sep="\t", header=None, names=gtf_columns, low_memory=False,
)


In [23]:
## Filter only new transcripts: rows whose attribute text contains a BambuTx transcript_id
is_new_transcript = bambu_ref["other"].str.contains('transcript_id "BambuTx')
bambu_ref = bambu_ref.loc[is_new_transcript]

## Filter only protein coding transcripts: rows whose attribute text carries that transcript_biotype
is_protein_coding = original_ref["other"].str.contains('transcript_biotype "protein_coding";')
original_ref = original_ref.loc[is_protein_coding]



In [24]:
## Save both filtered annotations as tab-separated GTF files: no index, no header row,
## and csv.QUOTE_NONE so that no field is wrapped in quotes
gtf_outputs = [
    (bambu_ref, "../../data/processed/other/annotation_for_maddy_r_shiny_app/only_new_transcripts_no_filter.gtf"),
    (original_ref, "../../data/processed/other/annotation_for_maddy_r_shiny_app/only_protein_coding_transcripts_no_filter.gtf"),
]
for annotation, gtf_path in gtf_outputs:
    annotation.to_csv(gtf_path, sep="\t", index=False, header=False, quoting=csv.QUOTE_NONE)