In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv


## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
function name: parse_dff_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containining "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True, is_transcript=False, is_exon=False):

    dff = df.copy()
    
    
    if is_ref:

        ## Get gene ids
        dff["gene_id"] = dff["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)
        
        ## Get gene names
        dff["gene_name"] = dff["other"].str.split("gene_name \"", expand=True)[1].str.split('\";', expand=True)[0]
        
        ## Get get transcript biotype
        dff["gene_biotype"] = dff["other"].str.split('gene_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        if is_transcript:
            dff["transcript_id"] = dff["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            dff["transcript_biotype"] = dff["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        if is_exon:
            dff["transcript_id"] = dff["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            dff["transcript_biotype"] = dff["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
            dff["exon_number"] = dff["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

            
        ## Drop "other" column
        dff.drop(columns=["other", "dot_1", "dot_2"], inplace=True)
        

    else:

        ## Get gene ids
        dff["gene_id"] = dff["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)

        ## Get transcript ids
        dff["transcript_id"] = dff["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]

        ## Get exon number
        dff["exon_number"] = dff["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        dff.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    for col in dff.columns:
        dff.loc[dff[col].isnull(), col] = np.NaN
        

    return dff

In [3]:
## Import Data

## Open original reference
original_ref = pd.read_csv("../references/Homo_sapiens.GRCh38.107_ERCC.gtf", header=None, delimiter="\t", low_memory=False, 
                       names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"], comment="#")



## Bambu reference with novel and annotated transcripts
bambu_ref = pd.read_csv("../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])


In [17]:
orig_ref_transcripts = original_ref.loc[original_ref["type"]=="gene"].copy()
orig_ref_transcripts = parse_df_columns(orig_ref_transcripts, is_ref=True)



bambu_ref = parse_df_columns(bambu_ref, is_ref=False)
bambu_ref_transcripts = bambu_ref.loc[bambu_ref["type"]=="transcript"].copy()

KeyError: 'other'

In [None]:
keep = ["BambuTx1025", "BambuTx1138", "BambuTx1322", "BambuTx1324", "BambuTx151", "BambuTx1711", "BambuTx1845",
"BambuTx1879", "BambuTx2506", "BambuTx2804", "BambuTx2703", "BambuTx1847", "BambuTx1850", "BambuTx956",
"BambuTx2603", "BambuTx594",  "BambuTx2813", "BambuTx860", "BambuTx861", "BambuTx1474", "BambuTx1473",
"BambuTx978", "BambuTx977", "BambuTx458", "BambuTx1803", "BambuTx1879", "BambuTx1850", "BambuTx2710",
"BambuTx545", "BambuTx123", "BambuTx1323", "BambuTx1607", "BambuTx1602", "BambuTx2900"]

In [6]:
bambu_ref = bambu_ref.loc[bambu_ref["transcript_id"].isin(keep)].copy()

In [9]:
bambu_ref_final = bambu_ref.merge(orig_ref_transcripts[["gene_id", "gene_name"]], on="gene_id", how="left")

bambu_ref_final = bambu_ref_final[["gene_name", "gene_id", "transcript_id"]].copy()

bambu_ref_final['gene_name'].fillna(bambu_ref_final['gene_id'], inplace=True)

In [12]:
bambu_ref_final.drop_duplicates(inplace=True)

In [16]:
bambu_ref_final.shape

(32, 3)