In [65]:
## Import libraries
import pandas as pd
import numpy as np
import csv


## Lift the pandas display limits: no cap on the number of rows shown and
## no truncation of wide cell contents
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [66]:
def _extract_attribute(attributes, key, closing='"'):
    '''
    function name: _extract_attribute

    purpose: pull the value of one `key "value"` attribute out of the aggregate attribute column

    input: attributes = pandas Series holding the raw attribute strings
           key = attribute name to look for (e.g. "transcript_id")
           closing = text that ends the value (defaults to the closing double quote)

    output: pandas Series aligned with `attributes`; rows that lack the attribute hold NaN,
            and so does the whole Series when no row has the attribute
    '''

    ## (?s) lets the value span newlines and \Z lets an unterminated value run to the
    ## end of the string, which is what splitting on the key and then on the closing
    ## text would give. Unlike split(..., expand=True)[1], this never raises KeyError
    ## when the attribute is absent from every row.
    pattern = f'(?s){key} "(.*?)(?:{closing}|\\Z)'

    return attributes.str.extract(pattern, expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

    input: df = dataframe containing the "raw" gtf/gff; it must have the columns "other"
                (aggregate attributes), "dot_1" and "dot_2"
           is_ref = True adds "gene_name" and "gene_biotype";
                    False adds "transcript_id" and "exon_number" instead
           is_transcript = only used when is_ref is True; adds "transcript_id" and "transcript_biotype"
           is_prot = only used when is_ref and is_transcript are True; adds "protein_id",
                     "ccds_id" and "exon_number"

    output: the same dataframe object (it is modified in place and returned) with "gene_id" plus the
            columns listed above, and without "other", "dot_1" and "dot_2". Attributes that are
            missing from a row are NaN.
    '''

    attributes = df["other"]

    ## Get gene ids: taken from the first attribute of the line (everything before the
    ## first '";'), keeping its trailing run of characters that are neither a space nor
    ## a double quote
    first_attribute = attributes.str.split('";', n=1).str[0]
    df["gene_id"] = first_attribute.str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names (value ends at the '";' that closes the attribute)
        df["gene_name"] = _extract_attribute(attributes, "gene_name", closing='";')

        ## Get gene biotype
        df["gene_biotype"] = _extract_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _extract_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_attribute(attributes, "transcript_biotype")

            ## If is prot get protein id, ccds id and exon number
            if is_prot:
                df["protein_id"] = _extract_attribute(attributes, "protein_id")
                df["ccds_id"] = _extract_attribute(attributes, "ccds_id")
                df["exon_number"] = _extract_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_attribute(attributes, "exon_number")

    ## Drop the raw attribute column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Represent every missing value (e.g. None) as NaN. np.nan is used because the
    ## np.NaN alias no longer exists in NumPy 2.0, and columns without missing values
    ## are skipped so that their dtype is never touched by a no-op assignment.
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [67]:
## Load the tab-separated full length transcript counts matrix written by the bambu discovery run
counts_matrix_path = "../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/fullLengthCounts_transcript.txt"
df = pd.read_csv(counts_matrix_path, delimiter="\t")

In [68]:
## Read the extended annotations GTF (tab-separated, no header row) and parse its
## attribute column with the non-reference layout (gene id, transcript id, exon number)
extended_gtf_path = "../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/extended_annotations.gtf"
extended_gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

ref = parse_df_columns(
    pd.read_csv(extended_gtf_path, sep="\t", header=None,
                names=extended_gtf_columns, low_memory=False),
    is_ref=False,
)

In [69]:
## Novel mitochondrial transcripts: annotation rows on "MT" whose transcript id starts with "Bambu".
## Keep each transcript id once, as a plain list.
novel_mito_mask = (ref["chr"] == "MT") & ref["transcript_id"].str.startswith("Bambu")
ref_mito_novel_ids = ref.loc[novel_mito_mask, "transcript_id"].drop_duplicates().to_list()

In [70]:
## Create counts matrix containing only the rows whose TXNAME is a novel mitochondrial transcript
novel_mito_rows = df["TXNAME"].isin(ref_mito_novel_ids)
df_novel_mito = df[novel_mito_rows].copy()

In [71]:
## Sum counts for each novel mito transcript: every column from the third one onwards is added up
## (presumably the first two columns are identifiers -- confirm against the counts file).
## An existing "total_counts" column is left out of the sum so that re-running this cell
## gives the same totals instead of adding the previous total to the new one.
novel_mito_counts = df_novel_mito.drop(columns=["total_counts"], errors="ignore").iloc[:, 2:]
df_novel_mito["total_counts"] = novel_mito_counts.sum(axis=1)

In [72]:
## Show the mean of the per-transcript total counts for the novel mito transcripts
df_novel_mito.loc[:, "total_counts"].mean()

865.3239581244652

In [73]:
## Same steps as for the novel transcripts, now for the known mito transcripts

## Get known mitochondrial transcript ids: rows on "MT" whose transcript id starts with "E"
known_mito_mask = (ref["chr"] == "MT") & ref["transcript_id"].str.startswith("E")
ref_mito_known_ids = ref.loc[known_mito_mask, "transcript_id"].drop_duplicates().to_list()


## Create counts matrix containing only the known mito transcripts
df_known_mito = df[df["TXNAME"].isin(ref_mito_known_ids)].copy()


## Sum counts for each known mito transcript (every column from the third one onwards)
df_known_mito["total_counts"] = df_known_mito.iloc[:, 2:].sum(axis=1)

In [74]:
## Show the mean of the per-transcript total counts for the known mito transcripts
df_known_mito.loc[:, "total_counts"].mean()

17535.046776396768

In [75]:
## Import the extended annotations again, this time unparsed, so that the raw
## attribute column ("other") is still available
raw_gtf_path = "../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/extended_annotations.gtf"
raw_gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]
orig_ref = pd.read_csv(raw_gtf_path, sep="\t", header=None, names=raw_gtf_columns, low_memory=False)


In [76]:
## Create GTF annotation that will work with ENSEMBL genome browser for newly discovered mitochondrial transcripts:
## keep the "MT" rows whose attribute column mentions "BambuTx" and rename their chromosome to "chrM"
novel_mito_records = (orig_ref["chr"] == "MT") & orig_ref["other"].str.contains("BambuTx")
mito_ref = orig_ref.loc[novel_mito_records].assign(chr="chrM")

## Save it as tab-separated text without header or index; quoting is disabled so the
## double quotes inside the attribute column are written as they are
mito_gtf_path = "../../../data/bernardo/processed/05.mitochondrial_novel_transcripts_probing/mito_novel_annotation_for_genome_browser.gtf"
mito_ref.to_csv(mito_gtf_path, sep="\t", header=False, index=False, quoting=csv.QUOTE_NONE)