In [2]:
## Import libraries
import pandas as pd
import numpy as np

## Remove the row cap so pandas prints every row of a DataFrame/Series
pd.options.display.max_rows = None

In [3]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True):
    """Parse the aggregate attribute column ("other") of a GTF/GFF3 table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw annotation table. It must contain an "other" column holding the
        attribute string. With ``is_ref=True`` the columns "chr", "start",
        "end", "strand" and "type" are also required; with ``is_ref=False``
        the columns "dot_1" and "dot_2" are required (they are dropped).
    is_ref : bool, default True
        True  -> attributes are read as ``key=value;`` pairs and the result is
                 reduced to the coordinate columns plus "gene_id",
                 "transcript_id", "CHM_gene_id", "CHM_transcript_id",
                 "transcript_name", "gene_name", "start_codon" and
                 "stop_codon"; duplicate rows are removed.
        False -> attributes are read as ``key "value";`` pairs; "gene_id",
                 "transcript_id", "exon_number", "is_novel_transcript" and
                 "is_novel_gene" are appended and "other", "dot_1", "dot_2"
                 are dropped.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the frame passed in is not modified. An attribute
        that is absent from a row (or from the whole table) yields NaN.

    Raises
    ------
    KeyError
        If one of the required input columns listed above is missing.
    """

    attributes = df["other"]

    def extract(pattern):
        ## str.extract yields NaN for rows without a match, so an attribute that
        ## never occurs gives an all-NaN column instead of the KeyError raised by
        ## indexing column 1 of an expanded split.
        return attributes.str.extract(pattern, expand=False)

    if is_ref:

        ## Output column -> attribute key; the value runs up to the next ';'
        ref_keys = {
            "gene_id": "source_gene=",
            "transcript_id": "source_transcript=",
            "CHM_gene_id": "gene_id=",
            "CHM_transcript_id": "transcript_id=",
            "transcript_name": "source_transcript_name=",
            "gene_name": "source_gene_common_name=",
            "start_codon": "adj_start=",
            "stop_codon": "adj_stop=",
        }

        ## Work on a private copy of the relevant columns so the caller's frame is untouched
        parsed = df[["chr", "start", "end", "strand", "type"]].copy()

        for column, key in ref_keys.items():
            parsed[column] = extract(key + "([^;]*)")

        ## Drop duplicates (in place is safe here: `parsed` is private to this call)
        parsed.drop_duplicates(inplace=True)

    else:

        ## drop() returns a new frame, so the caller's frame keeps all its columns
        parsed = df.drop(columns=["other", "dot_1", "dot_2"])

        ## Look each attribute up by its key rather than by its position in the string
        parsed["gene_id"] = extract('gene_id "([^"]*)')
        parsed["transcript_id"] = extract('transcript_id "([^"]*)')
        parsed["exon_number"] = extract('exon_number "([^"]*)')

        ## Label novel transcripts / genes; rows with a missing id count as not novel
        parsed["is_novel_transcript"] = parsed["transcript_id"].str.startswith("tx.", na=False)
        parsed["is_novel_gene"] = parsed["gene_id"].str.startswith("gene.", na=False)

    ## Normalise every missing value (e.g. None) to NaN. np.nan is used because the
    ## np.NaN alias was removed in NumPy 2.0. Columns without missing values are
    ## skipped so their dtype is never touched.
    for col in parsed.columns:
        missing = parsed[col].isnull()
        if missing.any():
            parsed.loc[missing, col] = np.nan

    return parsed

In [7]:
## Read the tab-separated GTF without a header row and name its nine columns explicitly
bambu_annotation = pd.read_csv(
    "C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf",
    sep="\t",
    header=None,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    low_memory=False,
)

In [8]:
## Replace the raw table with its parsed form (non-reference attribute layout)
bambu_annotation = parse_df_columns(df=bambu_annotation, is_ref=False)

In [44]:
## Independent copy of the rows whose chromosome is "MT"
bambu_annotation_mt = bambu_annotation[bambu_annotation["chr"].eq("MT")].copy()

In [45]:
## Re-read the GTF in raw form (header-less, tab-separated), overwriting the parsed table
bambu_annotation = pd.read_csv(
    "C:/Users/bag22/Desktop/2022_ebbert_generated_ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf",
    sep="\t",
    header=None,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    low_memory=False,
)

In [46]:
## Keep the raw GTF rows whose coordinates match a parsed mitochondrial record.
## The key frame is de-duplicated first: an inner merge emits one row per matching
## key pair, so a key that occurs more than once on the right side would
## otherwise repeat the same raw GTF row in the result.
merge_keys = ["chr", "source", "type", "start", "end", "strand"]
bambu_annotation_final = bambu_annotation.merge(
    bambu_annotation_mt[merge_keys].drop_duplicates(),
    on=merge_keys,
    how="inner",
)

In [48]:
## Overwrite the chromosome label of every row with "chrM"
bambu_annotation_final = bambu_annotation_final.assign(chr="chrM")

In [49]:
import csv

## Write the table as tab-separated text: no header, no index, and no quoting of fields
bambu_annotation_final.to_csv(
    "~/Desktop/grch38_mt_only.gtf",
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [15]:
## Preview the first five rows
bambu_annotation.head(n=5)

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,is_novel_transcript,is_novel_gene
0,1,Bambu,transcript,11869.0,14409.0,+,ENSG00000223972,ENST00000456328,,False,False
1,1,Bambu,exon,11869.0,12227.0,+,ENSG00000223972,ENST00000456328,1.0,False,False
2,1,Bambu,transcript,12010.0,13670.0,+,ENSG00000223972,ENST00000450305,,False,False
3,1,Bambu,exon,12010.0,12057.0,+,ENSG00000223972,ENST00000450305,1.0,False,False
4,1,Bambu,exon,12179.0,12227.0,+,ENSG00000223972,ENST00000450305,2.0,False,False


In [30]:
## gene_id of every row on chromosome "MT", selected with a single .loc[rows, column] lookup
bambu_mt_genes = bambu_annotation.loc[bambu_annotation["chr"].eq("MT"), "gene_id"]

In [31]:
## Show the collected gene ids (one entry per annotation row, so ids repeat)
bambu_mt_genes

1758967    ENSG00000210049
1758968    ENSG00000210049
1758969    ENSG00000211459
1758970    ENSG00000211459
1758971    ENSG00000210077
1758972    ENSG00000210077
1758973    ENSG00000210082
1758974    ENSG00000210082
1758975    ENSG00000210107
1758976    ENSG00000210107
1758977    ENSG00000209082
1758978    ENSG00000209082
1758979    ENSG00000198888
1758980    ENSG00000198888
1758981    ENSG00000210100
1758982    ENSG00000210100
1758983    ENSG00000210107
1758984    ENSG00000210107
1758985    ENSG00000210107
1758986    ENSG00000210112
1758987    ENSG00000210112
1758988    ENSG00000198763
1758989    ENSG00000198763
1758990    ENSG00000210117
1758991    ENSG00000210117
1758992    ENSG00000210127
1758993    ENSG00000210127
1758994    ENSG00000210135
1758995    ENSG00000210135
1758996    ENSG00000210140
1758997    ENSG00000210140
1758998    ENSG00000210144
1758999    ENSG00000210144
1759000    ENSG00000198804
1759001    ENSG00000198804
1759002    ENSG00000210151
1759003    ENSG00000210151
1

In [33]:
## Load the tab-separated transcript counts table
grch_counts = pd.read_csv(
    "C:/Users/bag22/Desktop/2022_ebbert_generated_ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/counts_transcript.txt",
    delimiter="\t",
)

## Keep only the part of each column name that precedes "_nanopore"
new_col_names = [col.split("_nanopore")[0] for col in grch_counts.columns]
grch_counts.columns = new_col_names

In [40]:
## Row-wise sum of the four sample columns
grch_counts["total_counts"] = grch_counts.loc[
    :, ["sample_PAM54401", "sample_PAM54902", "sample_PAM54335", "sample_PAM54788"]
].sum(axis="columns")

In [42]:
## Count rows whose GENEID appears among the collected mitochondrial gene ids
grch_counts[grch_counts["GENEID"].isin(bambu_mt_genes)]

Unnamed: 0,TXNAME,GENEID,sample_PAM54401,sample_PAM54902,sample_PAM54335,sample_PAM54788,total_counts
754,tx.755,ENSG00000210107,837043.9,1193660.0,684640.0,936779.0,3652123.0
755,tx.756,ENSG00000210151,44022.99,41343.57,25807.72,56902.52,168076.8
18712,ENST00000361227,ENSG00000198840,52548.0,137371.0,98633.0,91806.0,380358.0
18757,ENST00000361335,ENSG00000212907,14245.0,41077.0,9262.0,99686.0,164270.0
18777,ENST00000361381,ENSG00000198886,453399.0,668068.0,343292.0,594741.0,2059500.0
18782,ENST00000361390,ENSG00000198888,663512.0,1336354.0,472136.0,979848.0,3451850.0
18809,ENST00000361453,ENSG00000198763,73929.0,129714.0,127961.0,188443.0,520047.0
18866,ENST00000361567,ENSG00000198786,127786.0,212337.0,117201.0,208098.0,665422.0
18890,ENST00000361624,ENSG00000198804,393720.0,572056.0,258067.0,470916.0,1694759.0
18916,ENST00000361681,ENSG00000198695,73569.0,97956.0,54856.0,96951.0,323332.0


In [9]:
## Read the GRCh38 GTF (header-less, tab-separated) and parse its attribute column in one step
bambu_annotation_grch = parse_df_columns(
    pd.read_csv(
        "C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf",
        sep="\t",
        header=None,
        names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
        low_memory=False,
    ),
    is_ref=False,
)

## Independent copy of the rows whose chromosome is "MT"
bambu_annotation_grch_mt = bambu_annotation_grch[bambu_annotation_grch["chr"].eq("MT")].copy()

In [10]:
## Read the CHM13 GTF (header-less, tab-separated) and parse its attribute column in one step
bambu_annotation_chm13 = parse_df_columns(
    pd.read_csv(
        "C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/CHM13_good_ONT/bambu_discovery/extended_annotations.gtf",
        sep="\t",
        header=None,
        names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
        low_memory=False,
    ),
    is_ref=False,
)

## Independent copy of the rows whose chromosome is "chrM"
bambu_annotation_chm13_mt = bambu_annotation_chm13[bambu_annotation_chm13["chr"].eq("chrM")].copy()

In [13]:
## Rows of the GRCh38 mitochondrial subset that belong to transcript "tx.755"
bambu_annotation_grch_mt[bambu_annotation_grch_mt["transcript_id"].eq("tx.755")]

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,is_novel_transcript,is_novel_gene
1758975,MT,Bambu,transcript,1689.0,5449.0,-,ENSG00000210107,tx.755,,True,False
1758976,MT,Bambu,exon,1689.0,3502.0,-,ENSG00000210107,tx.755,2.0,True,False
1758985,MT,Bambu,exon,4380.0,5449.0,-,ENSG00000210107,tx.755,1.0,True,False


In [14]:
## Rows of the CHM13 mitochondrial subset that belong to transcript "tx.783"
bambu_annotation_chm13_mt[bambu_annotation_chm13_mt["transcript_id"].eq("tx.783")]

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,is_novel_transcript,is_novel_gene
1623754,chrM,Bambu,transcript,1113.0,4872.0,-,ENSG00000210107.1,tx.783,,True,False
1623755,chrM,Bambu,exon,1113.0,2925.0,-,ENSG00000210107.1,tx.783,2.0,True,False
1623765,chrM,Bambu,exon,3803.0,4872.0,-,ENSG00000210107.1,tx.783,1.0,True,False
