In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv


## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containining "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True, is_transcript=False, is_exon=False):

    if is_ref:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)
        
        ## Get gene names
        df["gene_name"] = df["other"].str.split("gene_name \"", expand=True)[1].str.split('\";', expand=True)[0]
        
        ## Get get transcript biotype
        df["gene_biotype"] = df["other"].str.split('gene_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        if is_transcript:
            df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            df["transcript_biotype"] = df["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        if is_exon:
            df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            df["transcript_biotype"] = df["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
            df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

            
        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)
        

    else:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)

        ## Get transcript ids
        df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]

        ## Get exon number
        df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.NaN
        

    return df

In [3]:
## Bambu reference with novel and annotated transcripts
bambu_ref = pd.read_csv("../../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])


## Parse through bambu reference
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Only keep exons
bambu_ref = bambu_ref.loc[bambu_ref["type"] == "exon"].copy()

## Change start and end columns to string
bambu_ref["start"] = bambu_ref["start"].astype("str")
bambu_ref["end"] = bambu_ref["end"].astype("str")

In [4]:
## Create different dataframes, novel transcripts, novel genes, known genes 

bambu_ref.loc[bambu_ref["transcript_id"].str.startswith("BambuTx"), "Novel Status"] = "Novel Transcript"
bambu_ref.loc[bambu_ref["gene_id"].str.startswith("BambuGene"), "Novel Status"] = "Novel Gene"
bambu_ref.loc[bambu_ref["transcript_id"].str.startswith("ENS"), "Novel Status"] = "Known"

bambu_new_transcripts = bambu_ref.loc[bambu_ref["Novel Status"] == "Novel Transcript"].copy()
bambu_new_transcripts_mt = bambu_new_transcripts.loc[bambu_new_transcripts["chr"] == "MT"]
bambu_new_genes = bambu_ref.loc[bambu_ref["Novel Status"] == "Novel Gene"].copy()
bambu_known_transcripts = bambu_ref.loc[bambu_ref["Novel Status"] == "Known"].copy()

In [5]:
## Create unique junctions

bambu_new_transcripts["junctions"] = bambu_new_transcripts["chr"].copy() + bambu_new_transcripts["start"].copy() + bambu_new_transcripts["end"].copy() + bambu_new_transcripts["strand"].copy()


bambu_new_transcripts_mt["junctions"] = bambu_new_transcripts_mt["chr"].copy() + bambu_new_transcripts["start"].copy() + bambu_new_transcripts_mt["end"].copy() + bambu_new_transcripts_mt["strand"].copy()
    
    
bambu_new_genes["junctions"] = bambu_new_genes["chr"].copy() + bambu_new_genes["start"].copy() +  bambu_new_genes["end"].copy() + bambu_new_genes["strand"].copy()
    
    
bambu_known_transcripts["junctions"] = bambu_known_transcripts["chr"].copy() + bambu_known_transcripts["start"].copy() + bambu_known_transcripts["end"].copy() + bambu_known_transcripts["strand"].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bambu_new_transcripts_mt["junctions"] = bambu_new_transcripts_mt["chr"].copy() + bambu_new_transcripts["start"].copy() + bambu_new_transcripts_mt["end"].copy() + bambu_new_transcripts_mt["strand"].copy()


In [6]:
## Print shapes of the annotations

print(bambu_new_transcripts.shape)
print(bambu_new_transcripts_mt.shape)
print(bambu_new_genes.shape)
print(bambu_known_transcripts.shape)

(1203, 11)
(12, 11)
(155, 11)
(1624585, 11)


In [7]:
## Filter exons to only contain new exons in the annotations

bambu_new_transcripts = bambu_new_transcripts.loc[~bambu_new_transcripts["junctions"].isin(bambu_known_transcripts["junctions"])].copy()

bambu_new_transcripts_mt = bambu_new_transcripts_mt.loc[~bambu_new_transcripts_mt["junctions"].isin(bambu_known_transcripts["junctions"])].copy()

bambu_new_genes = bambu_new_genes.loc[~bambu_new_genes["junctions"].isin(bambu_known_transcripts["junctions"])].copy()

In [8]:
## Remove junctions columns

bambu_new_transcripts.drop(columns="junctions", inplace=True)

bambu_new_transcripts_mt.drop(columns="junctions", inplace=True)

bambu_new_genes.drop(columns="junctions", inplace=True)

In [9]:
## Print shapes of the annotations after removing known exons

print(bambu_new_transcripts.shape)
print(bambu_new_transcripts_mt.shape)
print(bambu_new_genes.shape)
print(bambu_known_transcripts.shape)

(454, 10)
(12, 10)
(155, 10)
(1624585, 11)


In [10]:
## Save annotations

bambu_new_transcripts.to_csv("../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exons_from_new_transcripts_in_known_genes.tsv",
                            sep="\t", index=False)

bambu_new_transcripts_mt.to_csv("../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exons_from_new_transcripts_mitochondrial_genes.tsv",
                            sep="\t", index=False)

bambu_new_genes.to_csv("../../../../data/bernardo/processed/99.other/create_annotations/meme_annotation/new_exons_from_new_transcripts_in_new_genes.tsv",
                            sep="\t", index=False)