In [1]:
## Import libraries
import pandas as pd
import numpy as np

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)

In [2]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containining "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True):

    if is_ref:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split("source_gene=", expand=True)[1].str.split(';', expand=True)[0]

        ## Get transcript ids
        df["transcript_id"] = df["other"].str.split("source_transcript=", expand=True)[1].str.split(';', expand=True)[0]

        ## Get CHM gene_ids
        df["CHM_gene_id"] = df["other"].str.split("gene_id=", expand=True)[1].str.split(';', expand=True)[0]

        ## Get transcript ids
        df["CHM_transcript_id"] = df["other"].str.split("transcript_id=", expand=True)[1].str.split(';', expand=True)[0]
        
        ## Get transcript names
        df["transcript_name"] = df["other"].str.split("source_transcript_name=", expand=True)[1].str.split(';', expand=True)[0]
        
        ## Get gene names
        df["gene_name"] = df["other"].str.split("source_gene_common_name=", expand=True)[1].str.split(';', expand=True)[0]
        
        ## Get start codon
        df["start_codon"] = df["other"].str.split("adj_start=", expand=True)[1].str.split(";", expand=True)[0]
        
        ## Get stop codon
        df["stop_codon"] = df["other"].str.split("adj_stop=", expand=True)[1].str.split(";", expand=True)[0]        

        ## Only keep relevant
        df = df[["chr", "start", "end", "strand", "type", "gene_id", "transcript_id", "CHM_gene_id",
                 "CHM_transcript_id", "transcript_name", "gene_name", "start_codon", "stop_codon"]].copy()

        ## Drop duplicates
        df.drop_duplicates(inplace=True)
        

    else:

        ## Get CHM gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)

        ## Get CHM transcript ids
        df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]

        ## Get exon number
        df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Label novel transcripts
        df.loc[df["transcript_id"].str.startswith("tx."), "is_novel_transcript"] = True
        df.loc[~df["transcript_id"].str.startswith("tx."), "is_novel_transcript"] = False

        ## Label novel genes
        df.loc[df["gene_id"].str.startswith("gene."), "is_novel_gene"] = True
        df.loc[~df["gene_id"].str.startswith("gene."), "is_novel_gene"] = False

        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.NaN
        

    return df

In [3]:
bambu_annotation = pd.read_csv("C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

In [4]:
bambu_annotation = parse_df_columns(bambu_annotation, is_ref=False)


KeyboardInterrupt



In [None]:
bambu_annotation_mt = bambu_annotation.loc[bambu_annotation["chr"]=="MT"].copy()

In [None]:
bambu_annotation = pd.read_csv("C:/Users/bag22/Desktop/2022_ebbert_generated_ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

In [None]:
bambu_annotation_final = bambu_annotation.merge(bambu_annotation_mt[["chr", "source", "type", "start", "end", "strand"]], on=["chr", "source", "type", "start", "end", "strand"], how="inner")

In [None]:
bambu_annotation_final["chr"] = "chrM"

In [None]:
import csv

bambu_annotation_final.to_csv("~/Desktop/grch38_mt_only.gtf", sep="\t", index=False, header=False, quoting=csv.QUOTE_NONE)

In [None]:
bambu_annotation.head()

In [None]:
bambu_mt_genes = bambu_annotation.loc[bambu_annotation["chr"]=="MT"]["gene_id"]

In [None]:
bambu_mt_genes

In [None]:
grch_counts = pd.read_csv("C:/Users/bag22/Desktop/2022_ebbert_generated_ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/counts_transcript.txt",
                         sep="\t")

new_col_names = []
for col in grch_counts.columns:
        new_col_names.append(col.split("_nanopore")[0])
        
grch_counts.columns = new_col_names

In [None]:
grch_counts["total_counts"] = grch_counts[["sample_PAM54401", "sample_PAM54902", "sample_PAM54335", "sample_PAM54788"]].sum(axis=1)

In [None]:
grch_counts.loc[grch_counts["GENEID"].isin(bambu_mt_genes)]

In [3]:
bambu_annotation_grch = pd.read_csv("C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

bambu_annotation_grch = parse_df_columns(bambu_annotation_grch, is_ref=False)

bambu_annotation_grch_mt = bambu_annotation_grch.loc[bambu_annotation_grch["chr"]=="MT"].copy()

In [4]:
bambu_annotation_chm13 = pd.read_csv("C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/CHM13_good_ONT/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

bambu_annotation_chm13 = parse_df_columns(bambu_annotation_chm13, is_ref=False)

bambu_annotation_chm13_mt = bambu_annotation_chm13.loc[bambu_annotation_chm13["chr"]=="chrM"].copy()

In [5]:
bambu_annotation_grch_mt.loc[bambu_annotation_grch_mt['transcript_id']=="tx.755"]

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,is_novel_transcript,is_novel_gene
1758975,MT,Bambu,transcript,1689.0,5449.0,-,ENSG00000210107,tx.755,,True,False
1758976,MT,Bambu,exon,1689.0,3502.0,-,ENSG00000210107,tx.755,2.0,True,False
1758985,MT,Bambu,exon,4380.0,5449.0,-,ENSG00000210107,tx.755,1.0,True,False


In [6]:
bambu_annotation_chm13_mt.loc[bambu_annotation_chm13_mt['transcript_id']=="tx.783"]

Unnamed: 0,chr,source,type,start,end,strand,gene_id,transcript_id,exon_number,is_novel_transcript,is_novel_gene
1623754,chrM,Bambu,transcript,1113.0,4872.0,-,ENSG00000210107.1,tx.783,,True,False
1623755,chrM,Bambu,exon,1113.0,2925.0,-,ENSG00000210107.1,tx.783,2.0,True,False
1623765,chrM,Bambu,exon,3803.0,4872.0,-,ENSG00000210107.1,tx.783,1.0,True,False


In [2]:
bambu_annotation_grch = pd.read_csv("C:/Users/bag22/Desktop/current_files/python_files/ebbert_lab/data/2022-07-14_cDNA_data_ebbert_lab_nextflow_pipeline_output_raw/ONT_only_data/GRCh38-106_good_ONT/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

In [9]:
grch_mito_novel = bambu_annotation_grch.loc[bambu_annotation_grch["other"].str.contains("tx.755")].copy()

In [10]:
grch_mito_novel

Unnamed: 0,chr,source,type,start,end,dot_1,strand,dot_2,other
1758975,MT,Bambu,transcript,1689,5449,.,-,.,"gene_id ""ENSG00000210107""; transcript_id ""tx.7..."
1758976,MT,Bambu,exon,1689,3502,.,-,.,"gene_id ""ENSG00000210107""; transcript_id ""tx.7..."
1758985,MT,Bambu,exon,4380,5449,.,-,.,"gene_id ""ENSG00000210107""; transcript_id ""tx.7..."


In [11]:
grch_mito_novel["chr"] = "chrM"

In [12]:
import csv

grch_mito_novel.to_csv("~/Desktop/grch_mito_novel_transcript.gtf", sep="\t", index=False, header=False,
                       quoting=csv.QUOTE_NONE)