In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv


## Lift the pandas display limits on row count and column width
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
def calculate_cpm(df, is_gene=False):
    '''
    Add a counts-per-million (CPM) column for every count column of a counts table.

    input:
        df: counts dataframe. The leading identifier columns are skipped: the
            first column when is_gene is True, the first two columns otherwise.
            Every remaining column is treated as a count column.
        is_gene: True for a gene counts table, False for a transcript counts table.

    output:
        The same dataframe object (it is modified in place) with the count
        columns rounded to 2 decimals and one extra CPM column per count column.
        The CPM column is named by replacing "_counts" with "_CPM"; when the
        count column name does not contain "_counts", "_CPM" is appended instead
        so that the count column is never overwritten by its own CPM values.
    '''

    ## Identifier columns come first: one for gene tables, two for transcript tables
    first_count_position = 1 if is_gene else 2
    count_columns = df.columns[first_count_position:].tolist()

    ## Loop through counts columns to calculate CPM and add to the dataframe
    for col in count_columns:

        df[col] = df[col].round(2)

        ## Never reuse the count column name for the CPM column
        if "_counts" in col:
            cpm_name = col.replace("_counts", "_CPM")
        else:
            cpm_name = col + "_CPM"

        ## CPM = count / column total * 1,000,000
        df[cpm_name] = ((df[col] / df[col].sum()) * 1000000).round(2)

    return df

In [3]:
def fix_column_names(df, is_gene=False):
    '''
    Give a raw counts table short, consistent column names.

    input:
        df: raw counts dataframe.
            - Gene table (is_gene=True): every column is a count column and the
              gene ids are stored in the index.
            - Transcript table (is_gene=False): the first two columns are
              renamed to "transcript_id" and "gene_id"; the rest are count columns.
        is_gene: True for a gene counts table, False for a transcript counts table.

    output:
        Dataframe whose count columns are renamed to
        "<text before '_mapped'>_counts".
        - Gene table: a new dataframe with "gene_id" as the first column and a
          fresh 0..n-1 index; the dataframe passed in is left untouched, so the
          function can be called again on the same raw table.
        - Transcript table: the dataframe passed in, with its columns renamed
          in place.

    raises:
        ValueError: if a gene table already contains a "gene_id" column.
    '''

    ## Check if this is a gene counts object
    if is_gene:

        ## Every column of the raw gene table is a count column
        count_columns = df.columns.tolist()
        list_new_names = ["gene_id"]

        ## gene_id comes in as index for gene counts data, make it into the first column instead.
        ## Work on a copy so the caller's dataframe does not gain a stray "gene_id" column.
        df = df.copy()
        df.insert(0, "gene_id", df.index)
        df = df.reset_index(drop=True)

    ## If it is a transcript dataset
    else:
        ## The first two columns are identifiers, the rest are counts
        count_columns = df.columns[2:].tolist()
        list_new_names = ["transcript_id", "gene_id"]

    ## Keep the part of each name before "_mapped" and tag it as counts
    list_new_names.extend(col.split("_mapped")[0] + "_counts" for col in count_columns)

    ## Rename columns
    df.columns = list_new_names

    return df

In [4]:
def _extract_gtf_attribute(attributes, key, terminator='"'):
    '''
    Pull the value that follows `key "` out of each attribute string.

    The value runs up to the first `terminator` (or to the end of the string).
    Rows without the key get NaN, and a key that is absent from every row gives
    an all-NaN column instead of raising. `key` and `terminator` are inserted
    into a regular expression verbatim, so they must not contain regex
    metacharacters.
    '''
    pattern = key + ' "(.*?)(?:' + terminator + "|$)"
    return attributes.str.extract(pattern, expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_exon=False):
    '''
    Parse the aggregate attribute column of a raw gtf/gff table into separate
    columns and drop the columns that are no longer needed.

    input:
        df: dataframe containing the "raw" gtf/gff; it must have the columns
            "other" (attribute string), "dot_1" and "dot_2".
        is_ref: True to extract gene_id, gene_name and gene_biotype (plus the
            transcript/exon columns requested below); False to extract gene_id,
            transcript_id and exon_number.
        is_transcript: with is_ref=True, also extract transcript_id and
            transcript_biotype.
        is_exon: with is_ref=True, also extract transcript_id,
            transcript_biotype and exon_number.

    output:
        The same dataframe object (modified in place) with the parsed columns
        added, "other", "dot_1" and "dot_2" removed, and null values set to NaN.
    '''

    attributes = df["other"]

    ## Get gene ids: trailing token of the text before the first '";'
    df["gene_id"] = attributes.str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names (value ends at '";')
        df["gene_name"] = _extract_gtf_attribute(attributes, "gene_name", terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _extract_gtf_attribute(attributes, "gene_biotype")

        ## Transcript-level columns are wanted for both transcript and exon tables
        if is_transcript or is_exon:
            df["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_gtf_attribute(attributes, "transcript_biotype")

        if is_exon:
            df["exon_number"] = _extract_gtf_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_gtf_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_gtf_attribute(attributes, "exon_number")

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every null value to NaN (np.NaN was removed in NumPy 2.0, use np.nan)
    ## NOTE(review): on some pandas versions this assignment appears to upcast integer
    ## columns to float even when nothing is null - confirm whether that is intended
    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.nan

    return df

In [5]:
## Bambu reference with novel and annotated transcripts
bambu_ref = pd.read_csv("../../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Drop entries whose chromosome name starts with "ERCC-".
## Take an explicit copy: parse_df_columns adds and drops columns on the frame it is given,
## and doing that on a filtered slice raises SettingWithCopyWarning.
bambu_ref = bambu_ref.loc[~bambu_ref["chr"].str.startswith("ERCC-")].copy()

## Parse through bambu reference
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Only keep exons
bambu_ref = bambu_ref.loc[bambu_ref["type"] == "exon"].copy()

## Change start and end columns to string so they can be concatenated into text keys later
bambu_ref["start"] = bambu_ref["start"].astype("str")
bambu_ref["end"] = bambu_ref["end"].astype("str")

In [6]:
## Load the transcript counts matrix and give it consistent column names
df = fix_column_names(
    pd.read_csv("../../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/counts_transcript.txt",
                delimiter="\t", low_memory=False, header=0),
    is_gene=False,
)

## Add the CPM columns, then remove every column whose name contains "counts"
df = calculate_cpm(df, is_gene=False)
count_column_names = list(df.filter(regex='counts'))
df = df[df.columns.drop(count_column_names)].copy()

## Remove rows whose gene_id starts with "ERCC"
is_ercc = df["gene_id"].str.startswith("ERCC")
df = df.loc[~is_ercc].copy()

## Median over the columns matching '[0-9]_CPM'; keep rows with median CPM > 1
sample_cpm_columns = df.filter(regex='[0-9]_CPM').columns
df["median_CPM"] = df[sample_cpm_columns].median(axis=1)
df = df.loc[df["median_CPM"] > 1].copy()

In [7]:
## Restrict the reference to transcripts that survived the median CPM > 1 filter
expressed_transcript_ids = df["transcript_id"]
bambu_ref = bambu_ref.loc[bambu_ref["transcript_id"].isin(expressed_transcript_ids)].copy()

In [8]:
## Count the reference rows of each transcript and attach that count as "last_exon"

exons_per_transcript = bambu_ref["transcript_id"].value_counts()

df_last_exons = (
    exons_per_transcript
    .rename("last_exon")
    .to_frame()
    .assign(transcript_id=exons_per_transcript.index)
    .reset_index(drop=True)
)

bambu_ref = bambu_ref.merge(df_last_exons, on="transcript_id", how="inner")

In [9]:
## Collect the ids of transcripts that have more than one exon row
exon_rows = bambu_ref.loc[bambu_ref["type"] == "exon"]
bambu_ref_num_exons_per_transcript = exon_rows["transcript_id"].value_counts()
has_multiple_exons = bambu_ref_num_exons_per_transcript > 1
bambu_ref_multi_exon_transcripts = bambu_ref_num_exons_per_transcript.loc[has_multiple_exons].index

In [10]:
## Label every exon row and split the reference into novel transcripts, novel genes and known transcripts.
## The order of the three assignments matters: a later label overwrites an earlier one on the same row.

bambu_ref.loc[bambu_ref["transcript_id"].str.startswith("BambuTx"), "Novel Status"] = "Novel Transcript"
bambu_ref.loc[bambu_ref["gene_id"].str.startswith("BambuGene"), "Novel Status"] = "Novel Gene"
bambu_ref.loc[bambu_ref["transcript_id"].str.startswith("ENS"), "Novel Status"] = "Known"

bambu_new_transcripts = bambu_ref.loc[bambu_ref["Novel Status"] == "Novel Transcript"].copy()

## Separate the "MT" chromosome rows from the rest. Both halves are explicit copies so that
## adding columns to them later does not raise SettingWithCopyWarning.
is_mitochondrial = bambu_new_transcripts["chr"] == "MT"
bambu_new_transcripts_mt = bambu_new_transcripts.loc[is_mitochondrial].copy()
bambu_new_transcripts = bambu_new_transcripts.loc[~is_mitochondrial].copy()

bambu_new_genes = bambu_ref.loc[bambu_ref["Novel Status"] == "Novel Gene"].copy()
bambu_known_transcripts = bambu_ref.loc[bambu_ref["Novel Status"] == "Known"].copy()

In [11]:
## Keep only a fixed, hand-picked set of novel MT transcripts
mt_transcripts_to_keep = ["BambuTx1848", "BambuTx1847", "BambuTx1850", "BambuTx1845", "BambuTx1846"]
is_selected_mt_transcript = bambu_new_transcripts_mt["transcript_id"].isin(mt_transcripts_to_keep)
bambu_new_transcripts_mt = bambu_new_transcripts_mt.loc[is_selected_mt_transcript].copy()

In [12]:
## Show the retained novel MT transcript exon rows as this cell's output
bambu_new_transcripts_mt

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,last_exon,Novel Status
267672,MT,Bambu,exon,1671.0,2169.0,+,ENSG00000210082,BambuTx1845,1,2,Novel Transcript
267673,MT,Bambu,exon,3147.0,3234.0,+,ENSG00000210082,BambuTx1845,2,2,Novel Transcript
267675,MT,Bambu,exon,1676.0,2242.0,+,ENSG00000210082,BambuTx1846,1,2,Novel Transcript
267676,MT,Bambu,exon,2350.0,3234.0,+,ENSG00000210082,BambuTx1846,2,2,Novel Transcript
267677,MT,Bambu,exon,1969.0,2607.0,+,ENSG00000210082,BambuTx1847,1,2,Novel Transcript
267678,MT,Bambu,exon,2777.0,3234.0,+,ENSG00000210082,BambuTx1847,2,2,Novel Transcript
267679,MT,Bambu,exon,1970.0,2463.0,+,ENSG00000210082,BambuTx1848,1,2,Novel Transcript
267680,MT,Bambu,exon,2621.0,3233.0,+,ENSG00000210082,BambuTx1848,2,2,Novel Transcript
267688,MT,Bambu,exon,3439.0,4181.0,-,ENSG00000210107,BambuTx1850,2,2,Novel Transcript
267689,MT,Bambu,exon,4380.0,4626.0,-,ENSG00000210107,BambuTx1850,1,2,Novel Transcript


In [13]:
## Row counts: novel transcripts, novel MT transcripts, novel genes, known transcripts
for exon_table in (bambu_new_transcripts, bambu_new_transcripts_mt, bambu_new_genes, bambu_known_transcripts):
    print(len(exon_table))

1651
10
546
274855


In [14]:
## Create unique exon boundary keys: "chr_start" and "chr_end" identify an exon start/end
## by chromosome, position and strand.
##
## The parts are joined with ":" so that keys cannot collide across chromosomes: without a
## separator, chromosome "1" at position "2500" and chromosome "12" at position "500" would
## both produce the same text. All four tables are keyed here, so the keys stay comparable.

for exon_table in (bambu_new_transcripts, bambu_new_transcripts_mt, bambu_new_genes, bambu_known_transcripts):
    for boundary in ("start", "end"):
        exon_table["chr_" + boundary] = exon_table["chr"] + ":" + exon_table[boundary] + ":" + exon_table["strand"]

In [15]:
## Keep only the exon boundaries that do not occur among the known transcripts

known_start_keys = bambu_known_transcripts["chr_start"]
known_end_keys = bambu_known_transcripts["chr_end"]

## Novel transcripts
is_known_start = bambu_new_transcripts["chr_start"].isin(known_start_keys)
is_known_end = bambu_new_transcripts["chr_end"].isin(known_end_keys)
bambu_new_transcripts_start = bambu_new_transcripts.loc[~is_known_start].copy()
bambu_new_transcripts_end = bambu_new_transcripts.loc[~is_known_end].copy()

## Novel MT transcripts
is_known_start = bambu_new_transcripts_mt["chr_start"].isin(known_start_keys)
is_known_end = bambu_new_transcripts_mt["chr_end"].isin(known_end_keys)
bambu_new_transcripts_mt_start = bambu_new_transcripts_mt.loc[~is_known_start].copy()
bambu_new_transcripts_mt_end = bambu_new_transcripts_mt.loc[~is_known_end].copy()

## Novel genes
is_known_start = bambu_new_genes["chr_start"].isin(known_start_keys)
is_known_end = bambu_new_genes["chr_end"].isin(known_end_keys)
bambu_new_genes_start = bambu_new_genes.loc[~is_known_start].copy()
bambu_new_genes_end = bambu_new_genes.loc[~is_known_end].copy()


In [16]:
## Row counts of the start/end boundary tables, before duplicates are dropped
for boundary_table in (
    bambu_new_transcripts_start,
    bambu_new_transcripts_end,
    bambu_new_transcripts_mt_start,
    bambu_new_transcripts_mt_end,
    bambu_new_genes_start,
    bambu_new_genes_end,
):
    print(len(boundary_table))

741
735
9
10
546
546


In [17]:
## Remove repeated boundaries: each start table keeps one row per "chr_start",
## each end table one row per "chr_end"

for start_table in (bambu_new_transcripts_start, bambu_new_transcripts_mt_start, bambu_new_genes_start):
    start_table.drop_duplicates(subset="chr_start", inplace=True)

for end_table in (bambu_new_transcripts_end, bambu_new_transcripts_mt_end, bambu_new_genes_end):
    end_table.drop_duplicates(subset="chr_end", inplace=True)

## The known transcripts get new de-duplicated tables; the full table is kept as is
bambu_known_transcripts_start = bambu_known_transcripts.drop_duplicates(subset="chr_start")
bambu_known_transcripts_end = bambu_known_transcripts.drop_duplicates(subset="chr_end")

In [18]:
## Row counts of every boundary table after duplicates were dropped
for boundary_table in (
    bambu_new_transcripts_start,
    bambu_new_transcripts_end,
    bambu_new_transcripts_mt_start,
    bambu_new_transcripts_mt_end,
    bambu_new_genes_start,
    bambu_new_genes_end,
    bambu_known_transcripts_start,
    bambu_known_transcripts_end,
):
    print(len(boundary_table))

694
695
9
8
527
527
178182
178002


In [19]:
## Restrict the known boundary tables to transcripts with more than one exon

start_is_multi_exon = bambu_known_transcripts_start["transcript_id"].isin(bambu_ref_multi_exon_transcripts)
bambu_known_transcripts_multi_exon_start = bambu_known_transcripts_start.loc[start_is_multi_exon].copy()

end_is_multi_exon = bambu_known_transcripts_end["transcript_id"].isin(bambu_ref_multi_exon_transcripts)
bambu_known_transcripts_multi_exon_end = bambu_known_transcripts_end.loc[end_is_multi_exon].copy()

In [20]:
## Row counts of the known boundary tables restricted to multi-exon transcripts

for boundary_table in (bambu_known_transcripts_multi_exon_start, bambu_known_transcripts_multi_exon_end):
    print(len(boundary_table))

176663
176488


In [21]:
## Write every boundary table to its own tab-separated file, without the row index

annotation_outputs = [
    ## New boundaries from novel transcripts in known genes
    (bambu_new_transcripts_start,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_START_boundaries_from_new_transcripts_in_known_genes.tsv"),
    (bambu_new_transcripts_end,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_END_boundaries_from_new_transcripts_in_known_genes.tsv"),

    ## New boundaries from novel mitochondrial transcripts
    (bambu_new_transcripts_mt_start,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_START_boundaries_from_new_transcripts_mitochondrial_genes.tsv"),
    (bambu_new_transcripts_mt_end,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_END_boundaries_from_new_transcripts_mitochondrial_genes.tsv"),

    ## New boundaries from transcripts in novel genes
    (bambu_new_genes_start,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_START_boundaries_from_new_transcripts_in_new_genes.tsv"),
    (bambu_new_genes_end,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exon_END_boundaries_from_new_transcripts_in_new_genes.tsv"),

    ## Boundaries from known multi-exon transcripts
    (bambu_known_transcripts_multi_exon_start,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/exon_START_boundaries_from_known_spliced_genes.tsv"),
    (bambu_known_transcripts_multi_exon_end,
     "../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/exon_END_boundaries_from_known_spliced_genes.tsv"),
]

for boundary_table, output_path in annotation_outputs:
    boundary_table.to_csv(output_path, sep="\t", index=False)