In [8]:
## Import libraries
import pandas as pd
import numpy as np

In [9]:
def _extract_attribute(attributes, key):
    '''
    function name: _extract_attribute

    purpose: pull the value of one `key "value"` attribute out of a series of GTF attribute strings

    input: attributes = series of attribute strings, key = attribute name (e.g. "gene_id")

    output: series holding the text between the quotes that follow the first occurrence of `key "`;
            rows where the key is absent (or the attribute string is missing) are NaN
    '''

    ## The closing quote is deliberately not required, so an unterminated value
    ## still yields the remainder of the string instead of NaN.
    return attributes.str.extract(key + ' "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parse the last aggregate column ("other") of a GTF-style table into useful columns
             and drop the columns that are no longer needed ("other", "dot_1", "dot_2")

    input: df = dataframe containing the "raw" gtf; must have the columns "other", "dot_1" and "dot_2".
                Attributes are expected in the `key "value";` form.
                NOTE: df is modified in place and the same object is returned.
           is_ref = if True, extract gene_id, gene_name and gene_biotype;
                    if False, extract gene_id, transcript_id and exon_number
           is_transcript = (only used when is_ref is True) also extract transcript_id and transcript_biotype
           is_prot = (only used when is_ref and is_transcript are both True) also extract
                     protein_id, ccds_id and exon_number

    output: the same dataframe with the new columns ["gene_id", "transcript_id", etc...] appended.
            An attribute that is absent from a row (or from the whole table) gives NaN rather than an error.
    '''

    ## Decide which attributes to pull out of the "other" column
    if is_ref:
        keys = ["gene_id", "gene_name", "gene_biotype"]

        ## Transcript-level columns are only requested for transcript entries
        if is_transcript:
            keys += ["transcript_id", "transcript_biotype"]

            ## Protein-level columns are only honoured together with is_transcript
            if is_prot:
                keys += ["protein_id", "ccds_id", "exon_number"]
    else:
        keys = ["gene_id", "transcript_id", "exon_number"]

    ## Extract each attribute by name (regex based, so a key missing from every row is safe)
    attributes = df["other"]
    for key in keys:
        df[key] = _extract_attribute(attributes, key)

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every missing value (e.g. None) to NaN; columns without missing values are left untouched
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [10]:
## Load the full-length transcript counts matrix (tab-separated text file)
df = pd.read_csv(
    "../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/fullLengthCounts_transcript.txt",
    sep="\t",
)

In [11]:
## Read the extended annotations GTF (tab-separated, no header row, so column names are supplied here),
## then split its attribute column into gene_id / transcript_id / exon_number
ref = pd.read_csv(
    "../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/extended_annotations.gtf",
    sep="\t",
    header=None,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    low_memory=False,
)

ref = parse_df_columns(ref, is_ref=False)

In [21]:
## Transcript IDs of annotation rows on chromosome "MT" whose transcript_id starts with "Bambu"
ref_mito_novel_ids = ref.loc[
    ref["chr"].eq("MT") & ref["transcript_id"].str.startswith("Bambu"),
    "transcript_id",
].copy()

In [25]:
## Collapse to a plain list of unique transcript IDs.
## The value is wrapped in pd.Series so the cell can be re-run safely: after the first run
## the name holds a list, which has no drop_duplicates() method of its own.
ref_mito_novel_ids = pd.Series(ref_mito_novel_ids).drop_duplicates().to_list()

AttributeError: 'NoneType' object has no attribute 'to_list'

1810026    BambuTx279
1810053    BambuTx280
1810055    BambuTx322
1810063    BambuTx281
1810072    BambuTx283
1810074    BambuTx282
Name: transcript_id, dtype: object

In [3]:
## Placeholder value: an empty string for now.
## NOTE(review): presumably meant to hold the counts rows of the novel mitochondrial transcripts - confirm and fill in.
df_novel_mito = ""

Unnamed: 0,TXNAME,GENEID,PAM54902_1291_nanopore_mapped_filtered_sorted,PAM54335_356_nanopore_mapped_filtered_sorted,PAM54401_1271_nanopore_mapped_filtered_sorted,PAM54788_1304_nanopore_mapped_filtered_sorted
0,BambuTx1,ENSG00000078808,37.0,30.0,191.0,244.0
1,BambuTx2,ENSG00000130775,23.0,44.0,157.0,60.0
2,BambuTx3,ENSG00000198492,182.0,275.0,438.0,409.0
3,BambuTx4,BambuGene566,92.0,85.0,83.0,226.0
4,BambuTx5,BambuGene3448,113.0,12.0,26.0,61.0
