In [13]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)

In [14]:
def parse_df_columns(df, is_ref=True):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column ("other") of the gtf/gff3 into useful columns and
    cleaning non-relevant columns

    input:
        df: dataframe containing "raw" gtf/gff3 rows. It needs an "other" column holding the
            attribute string. With is_ref=True it also needs "chr", "start", "end", "strand" and
            "type"; with is_ref=False it also needs "dot_1" and "dot_2".
        is_ref: True when attributes are written as `key=value;`, False when they are written
            as `key "value";`.

    output: a NEW dataframe (the input dataframe is not modified).
        is_ref=True  -> ["chr", "start", "end", "strand", "type", "gene_id", "transcript_id",
                         "CHM_gene_id", "CHM_transcript_id", "transcript_name", "gene_name",
                         "start_codon", "stop_codon"], exact duplicate rows removed.
        is_ref=False -> the input columns without "other", "dot_1" and "dot_2", followed by
                         "gene_id", "transcript_id", "exon_number", "is_novel_transcript" and
                         "is_novel_gene" (both boolean). The row index is kept as is.
        An attribute that is absent from a row (or from every row) yields NaN.
    '''

    attributes = df["other"]

    if is_ref:

        ## Attribute key feeding each output column (insertion order = output column order).
        ## As before, the FIRST occurrence of the key text in the attribute string is used.
        ref_keys = {
            "gene_id": "source_gene=",
            "transcript_id": "source_transcript=",
            "CHM_gene_id": "gene_id=",
            "CHM_transcript_id": "transcript_id=",
            "transcript_name": "source_transcript_name=",
            "gene_name": "source_gene_common_name=",
            "start_codon": "adj_start=",
            "stop_codon": "adj_stop=",
        }

        ## Only keep relevant
        parsed = df[["chr", "start", "end", "strand", "type"]].copy()

        ## The value is everything between the key and the next ';' (or the end of the string)
        for column, key in ref_keys.items():
            parsed[column] = attributes.str.extract(key + "([^;]*)", expand=False)

        ## Drop duplicates
        parsed = parsed.drop_duplicates()

    else:

        def quoted_attribute(name):
            ## Value of `name "value"`; \b keeps e.g. `gene_id` from matching inside a longer key
            return attributes.str.extract(r'\b' + name + r' "([^"]*)"', expand=False)

        ## Drop "other" column (drop() returns a new dataframe, the input stays intact)
        parsed = df.drop(columns=["other", "dot_1", "dot_2"])

        ## Get gene ids, transcript ids and exon number
        parsed["gene_id"] = quoted_attribute("gene_id")
        parsed["transcript_id"] = quoted_attribute("transcript_id")
        parsed["exon_number"] = quoted_attribute("exon_number")

        ## Label novel transcripts and novel genes; na=False keeps the flags boolean
        ## (rows without an id are labelled False)
        parsed["is_novel_transcript"] = parsed["transcript_id"].str.startswith("tx.", na=False)
        parsed["is_novel_gene"] = parsed["gene_id"].str.startswith("gene.", na=False)

    ## Represent every missing value (None, NaN, ...) as np.nan.
    ## Columns without missing values are skipped so their dtype cannot be touched.
    for col in parsed.columns:
        missing = parsed[col].isnull()
        if missing.any():
            parsed.loc[missing, col] = np.nan

    return parsed

In [15]:
def merge_annotations(ref_gtf, bambu_gtf):
    '''
    function name: merge_annotations

    purpose: Merge useful/relevant information from both annotations while removing repeated and irrelevant information

    input:
        ref_gtf: reference annotation with at least ["chr", "type", "start", "end", "strand",
                 "transcript_id", "gene_id", "gene_name", "transcript_name", "start_codon", "stop_codon"]
        bambu_gtf: annotation with at least ["chr", "source", "type", "start", "end", "strand",
                   "gene_id", "transcript_id", "exon_number"]

    output: One GTF annotation (new dataframe) with the columns ["chr", "type", "start", "end", "strand",
    "transcript_id", "gene_id", "gene_name", "exon_number", "transcript_name", "start_codon", "stop_codon",
    "is_novel_gene", "is_novel_transcript"]. Neither input is modified.
    '''

    ## Merge the two annotations.
    ## The reference subsets are reduced to unique records BEFORE merging: merging the full
    ## subset only multiplies rows that drop_duplicates() then has to remove again.
    names_ref_gtf = ref_gtf[["transcript_id", "gene_id", "gene_name", "transcript_name"]].drop_duplicates()
    merged_gtf = pd.merge(bambu_gtf, names_ref_gtf, on=['gene_id', 'transcript_id'], how='left').drop_duplicates()

    ## Label novel transcripts and novel genes (boolean columns; a missing id counts as not novel)
    merged_gtf["is_novel_transcript"] = merged_gtf["transcript_id"].str.startswith("tx.", na=False)
    merged_gtf["is_novel_gene"] = merged_gtf["gene_id"].str.startswith("gene.", na=False)

    ## Novel transcripts cannot be matched on transcript id, so their gene name is looked up by gene id alone
    novel = merged_gtf.loc[merged_gtf["is_novel_transcript"]]
    gene_names = ref_gtf[["gene_id", "gene_name"]].drop_duplicates()
    novel = pd.merge(novel, gene_names, on=['gene_id'], how='left').drop_duplicates()
    novel["gene_name"] = novel["gene_name_y"]
    novel = novel.drop(columns=["source", "gene_name_y", "gene_name_x"])

    ## Return novel transcripts to original annotation
    merged_final = pd.merge(merged_gtf, novel, on=['chr', 'type', 'start', 'end', 'strand', 'transcript_id',
                    'transcript_name', 'gene_id', 'is_novel_transcript', 'is_novel_gene', 'exon_number'], how="left")

    ## Get gene names for novel transcripts of known genes.
    ## The filled column is assigned back explicitly: an in-place fillna on a column pulled
    ## out of the dataframe is not guaranteed to change the dataframe itself.
    merged_final["gene_name"] = merged_final["gene_name_x"].fillna(merged_final["gene_name_y"])
    merged_final = merged_final.drop(columns=["gene_name_x", "gene_name_y"])

    ## Get start and stop codons for known transcripts and exons of protein coding genes
    ## (exact duplicate reference rows are collapsed so they cannot duplicate output rows)
    codon_keys = ["chr", "type", "start", "end", "strand", "transcript_id", "gene_id"]
    codons = ref_gtf[codon_keys + ["start_codon", "stop_codon"]].drop_duplicates()
    merged_final = pd.merge(merged_final, codons, on=codon_keys, how="left")

    merged_final = merged_final[["chr", "type", "start", "end", "strand", "transcript_id", "gene_id", "gene_name",
                    "exon_number", "transcript_name", "start_codon", "stop_codon", "is_novel_gene", "is_novel_transcript"]]

    return merged_final

In [16]:
def _tag_ambiguous_ids(gff, id_col, chm_col):
    '''
    Append "(<chm_col value>)" to every id in `id_col` that the transcript rows pair with more than
    one distinct `chm_col` value. Modifies `gff` in place.
    '''
    pairs = gff.loc[gff["type"] == "transcript", [id_col, chm_col]].drop_duplicates()
    ambiguous = pairs.loc[pairs[id_col].duplicated(), id_col].dropna().values.tolist()
    needs_tag = gff[id_col].isin(ambiguous)
    gff.loc[needs_tag, id_col] = gff[id_col] + "(" + gff[chm_col] + ")"


def make_gene_and_transcript_converter(gff):
    '''
    name: make_gene_and_transcript_converter

    input: The CHM13 CAT/Liftoff gff annotation version 2.0, already parsed into the columns
    ["gene_id", "transcript_id", "CHM_gene_id", "CHM_transcript_id", "gene_name", "transcript_name",
    "start", "end", "type", "start_codon", "stop_codon", "chr", "strand"]

    output: A new dataframe with ["gene_id", "transcript_id", "gene_name", "transcript_name", "start", "end",
    "type", "start_codon", "stop_codon", "chr", "strand"] formatted in the same way as the bambu reference,
    so that we can properly assign gene and transcript names. The input dataframe is not modified.

    purpose: Creating a list that allows us to assign transcript and gene names to the bambu annotation based
    on the transcript id and gene ID
    '''

    ## Work on a copy: the caller's dataframe keeps its ids and its gene_name column
    gff = gff.copy()

    ## Change name of duplicate Ensembl IDs to CHM IDs
    ## (transcripts whose source id is the placeholder "N/A" fall back to the CHM id first)
    gff.loc[gff["transcript_id"] == "N/A", "transcript_id"] = gff["CHM_transcript_id"]
    _tag_ambiguous_ids(gff, "transcript_id", "CHM_transcript_id")

    ## Change name of duplicate gene ids to CHM ids
    ## (genes whose source id is the placeholder "None" fall back to the CHM id first)
    gff.loc[gff["gene_id"] == "None", "gene_id"] = gff["CHM_gene_id"]
    _tag_ambiguous_ids(gff, "gene_id", "CHM_gene_id")

    ## Fix gene names for MSTRG Genes: build one gene_id -> gene_name record per gene from the
    ## transcript rows, ignoring MSTRG names and missing values.
    ## "MSTRG." is evaluated as a regular expression (the dot matches any character), as before;
    ## na=False keeps the mask boolean when a gene name is missing.
    gff_names = gff.loc[gff["type"] == "transcript", ["gene_id", "gene_name"]]
    is_mstrg = gff_names["gene_name"].str.contains("MSTRG.", na=False)
    gff_names = gff_names.loc[~is_mstrg].dropna().drop_duplicates(subset=["gene_id"])

    ## Replace the per-row gene names with the cleaned per-gene names
    gff = pd.merge(gff.drop(columns="gene_name"), gff_names, on="gene_id", how="left")
    gff = gff[["gene_id", "transcript_id", "gene_name", "transcript_name",
              "start", "end", "type", "start_codon", "stop_codon", "chr", "strand"]].copy()

    return gff

In [17]:
## Load the transcript count tables produced for each reference
chm13_counts = pd.read_csv(
    "../novel_gene_body_raw_data/chm13_bambu/counts_transcript.txt",
    delimiter="\t",
    low_memory=False,
    header=0,
)
grch_counts = pd.read_csv(
    "../novel_gene_body_raw_data/GRCh38_bambu/counts_transcript.txt",
    delimiter="\t",
    low_memory=False,
    header=0,
)

list_counts = [chm13_counts, grch_counts]

## Keep only the part of every column name that comes before "_nanopore"
for counts in list_counts:
    counts.columns = [name.split("_nanopore")[0] for name in counts.columns]

In [18]:
## Show the column names of the CHM13 counts table
chm13_counts.columns

Index(['TXNAME', 'GENEID', 'sample_421', 'sample_20484', 'sample_27527',
       'sample_356', 'sample_20814'],
      dtype='object')

In [19]:
## Compare the total counts of every sample between the two references.
## The per-sample totals are computed once and reused for every figure below.
sample_cols = ['sample_421', 'sample_20484', 'sample_27527', 'sample_356', 'sample_20814']

chm13_totals = chm13_counts[sample_cols].sum()
grch_totals = grch_counts[sample_cols].sum()

print("CHM13 counts per sample:\n")
print(chm13_totals)

print("\n GRCH38 counts per sample:\n")
print(grch_totals)

print("\nDifference in Counts per Sample:\n")
difference = chm13_totals - grch_totals
print(difference)

## The difference is expressed relative to the CHM13 totals
print("\nPercent difference in Counts per Sample:\n")
percent_difference = difference / chm13_totals
print(percent_difference * 100)


CHM13 counts per sample:

sample_421      4.647257e+07
sample_20484    4.207083e+07
sample_27527    5.270878e+07
sample_356      4.217850e+07
sample_20814    3.331663e+07
dtype: float64

 GRCH38 counts per sample:

sample_421      4.625321e+07
sample_20484    4.160675e+07
sample_27527    5.220217e+07
sample_356      4.187732e+07
sample_20814    3.312449e+07
dtype: float64

Difference in Counts per Sample:

sample_421      219357.822522
sample_20484    464089.000000
sample_27527    506608.315461
sample_356      301172.705238
sample_20814    192136.046704
dtype: float64

Percent difference in Counte per Sample:

sample_421      0.472016
sample_20484    1.103113
sample_27527    0.961146
sample_356      0.714043
sample_20814    0.576697
dtype: float64


In [20]:
## Find novel transcripts and novel genes in both annotations.
## The test result is assigned directly so the flag columns are real boolean columns.
## Filling True and False through two masked writes leaves an object column, on which `~`
## is applied to each Python bool individually (bitwise: ~True == -2).
## na=False labels a missing id as "not novel" instead of breaking the mask.
for counts in (chm13_counts, grch_counts):
    counts["is_novel_transcript"] = counts["TXNAME"].str.startswith("tx.", na=False)
    counts["is_novel_gene"] = counts["GENEID"].str.startswith("gene.", na=False)

In [21]:
## Create dataframes with either only novel genes or only novel transcripts that are not from novel genes
def _split_novel(counts):
    """Return (rows of novel genes, rows of novel transcripts that belong to known genes)."""
    ## Cast the flags to bool first: `~` is only a logical negation on a boolean column.
    ## On an object column it is applied to each Python bool individually (~True == -2).
    is_novel_gene = counts["is_novel_gene"].astype(bool)
    is_novel_transcript = counts["is_novel_transcript"].astype(bool)
    return counts.loc[is_novel_gene], counts.loc[is_novel_transcript & ~is_novel_gene]


chm13_novel_genes, chm13_novel_transcripts = _split_novel(chm13_counts)
grch_novel_genes, grch_novel_transcripts = _split_novel(grch_counts)

In [22]:
## Filter novel genes and transcripts: a row is kept only when it has more than 5 counts in every sample.
## NOTE(review): the comparison is strict (> 5), so exactly 5 counts in a sample removes the row -
## confirm this is the intended cut-off.
count_cols = ['sample_421', 'sample_20484', 'sample_27527', 'sample_356', 'sample_20814']

chm13_novel_genes = chm13_novel_genes.loc[(chm13_novel_genes[count_cols] > 5).all(axis=1)]
grch_novel_genes = grch_novel_genes.loc[(grch_novel_genes[count_cols] > 5).all(axis=1)]
chm13_novel_transcripts = chm13_novel_transcripts.loc[(chm13_novel_transcripts[count_cols] > 5).all(axis=1)]
grch_novel_transcripts = grch_novel_transcripts.loc[(grch_novel_transcripts[count_cols] > 5).all(axis=1)]

In [23]:
## Report how many rows are left in each filtered dataframe
for filtered, description in (
    (chm13_novel_genes, "novel genes in CHM13"),
    (grch_novel_genes, "novel genes in GRCh38"),
    (chm13_novel_transcripts, "novel transcripts of known genes in CHM13"),
    (grch_novel_transcripts, "novel transcripts of known genes in GRCh38"),
):
    print("We found", len(filtered), description)

We found 104 novel genes in CHM13
We found 105 novel genes in GRCh38
We found 238 novel transcripts of known genes in CHM13
We found 248 novel transcripts of known genes in GRCh38


In [24]:
## Open extended annotations and parse through them
## (the files have no header row, so the nine column names are supplied here)
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

chm13_ref = parse_df_columns(
    pd.read_csv("../novel_gene_body_raw_data/chm13_bambu/extended_annotations.gtf", header=None,
                delimiter="\t", low_memory=False, names=gtf_columns),
    is_ref=False,
)
grch_ref = parse_df_columns(
    pd.read_csv("../novel_gene_body_raw_data/GRCh38_bambu/extended_annotations.gtf", header=None,
                delimiter="\t", low_memory=False, names=gtf_columns),
    is_ref=False,
)

In [25]:
## Fix up extended annotations to merge with counts matrices
## Column labels are replaced by position, so the parsed annotation must have exactly these 11 columns in this order
extended_columns = ['chr', 'source', 'type', 'start', 'end', 'strand', 'GENEID',
                    'TXNAME', 'exon_number', 'is_novel_transcript', 'is_novel_gene']
location_columns = ["chr", "start", "end", "GENEID", "TXNAME"]


def _transcript_locations(annotation):
    """Relabel the annotation columns and keep the location and ids of its "transcript" rows."""
    relabelled = annotation.set_axis(extended_columns, axis=1)
    return relabelled.loc[relabelled["type"] == "transcript", location_columns]


chm13_ref = _transcript_locations(chm13_ref)
grch_ref = _transcript_locations(grch_ref)

In [26]:
## Create gene location extractor for novel genes in both dataframes:
## every filtered novel gene row picks up chr/start/end from the annotation (left merge)
id_columns = ["GENEID", "TXNAME"]

chm13_extractor = chm13_novel_genes[id_columns].merge(chm13_ref, how="left", on=id_columns)
grch_extractor = grch_novel_genes[id_columns].merge(grch_ref, how="left", on=id_columns)

In [27]:
## Create extraction column and drop others
def _to_regions(extractor):
    """Return GENEID, TXNAME and a "chr:start-end" region string for every row of `extractor`."""
    ## Positions are cast to int before becoming text so a float position is written
    ## as "100" and not "100.0". A missing position makes the cast fail on purpose.
    start = extractor["start"].astype("int").astype("str")
    end = extractor["end"].astype("int").astype("str")

    ## Only float-typed chromosome values are cast to int; names such as "chr1" or "X" are used as they are
    chrom = extractor["chr"]
    if pd.api.types.is_float_dtype(chrom):
        chrom = chrom.astype("int")
    chrom = chrom.astype("str")

    regions = extractor[["GENEID", "TXNAME"]].astype("str")
    regions["extraction"] = chrom + ":" + start + "-" + end
    return regions


chm13_extractor = _to_regions(chm13_extractor)
grch_extractor = _to_regions(grch_extractor)

In [28]:
## Save novel gene sequence extractors for samtools (tab separated, no header, no index)
for extractor, out_path in (
    (chm13_extractor, "chm13_novel_gene_extractor.txt"),
    (grch_extractor, "grch_novel_gene_extractor.txt"),
):
    extractor.to_csv(out_path, sep="\t", index=False, header=False)

In [None]:
## Make venn diagram for genes expressing novel isoforms.

## Keep only the part of each CHM13 gene id that comes before the first "."
## (presumably a version suffix - confirm). assign() builds a new dataframe
## instead of writing into the filtered one.
chm13_novel_transcripts = chm13_novel_transcripts.assign(
    GENEID=chm13_novel_transcripts["GENEID"].str.split(".", expand=True)[0]
)

chm13_set = chm13_novel_transcripts["GENEID"]
grch_set = grch_novel_transcripts["GENEID"]

venn2_unweighted(subsets = [set(chm13_set), set(grch_set)], set_labels = ('CHM13', 'GRCh38'),
                 set_colors=('#648FFF', '#FE6100'), alpha = 0.75);

## Trailing ";" keeps the Text(...) repr out of the cell output
plt.title("Genes with Novel Transcripts", size=16);

In [None]:
## Save novel transcripts in known genes for GRCh38 so that we can compare them to known transcripts in ENSEMBL tracks
## First step: read the GRCh38 extended annotation again and parse its attribute column
raw_gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

grch_ref = parse_df_columns(
    pd.read_csv("../novel_gene_body_raw_data/GRCh38_bambu/extended_annotations.gtf", header=None,
                delimiter="\t", low_memory=False, names=raw_gtf_columns),
    is_ref=False,
)

In [None]:
## Rows of novel transcripts that belong to known genes.
## The flags are cast to bool first: `~` is only a logical negation on a boolean column.
## On an object column it is applied to each Python bool individually (~True == -2).
grch_is_known_gene = ~grch_ref["is_novel_gene"].astype(bool)
grch_is_novel_transcript = grch_ref["is_novel_transcript"].astype(bool)

grch_novel_transcripts = grch_ref.loc[grch_is_known_gene & grch_is_novel_transcript].copy()

In [None]:
## Read the GRCh38 extended annotation once more, this time leaving it unparsed
unparsed_gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

grch_ref = pd.read_csv(
    "../novel_gene_body_raw_data/GRCh38_bambu/extended_annotations.gtf",
    header=None,
    delimiter="\t",
    low_memory=False,
    names=unparsed_gtf_columns,
)

In [None]:
## Pair the gene_id of every selected row with the annotation row that has the same index label
novel_transcript_annotation_grch = grch_novel_transcripts[["gene_id"]].merge(
    grch_ref, left_index=True, right_index=True
)

In [None]:
## Load the disease relevant genes; the first line of the file is treated as a header
## and replaced by the column names given here
disease_relevant_genes = pd.read_csv(
    "../../cDNA-comparison/article_analysis/annotations/disease_relevant_gene_names.csv",
    names=["gene_id", "gene_name"],
    header=0,
)

## Create list of novel transcripts in disease relevant genes
novel_relevant = disease_relevant_genes.merge(novel_transcript_annotation_grch, how='inner', on="gene_id")

In [None]:
## Display the merged table, which still carries the gene_id and gene_name columns
novel_relevant

In [None]:
## Remove the two gene columns used for the merge
novel_relevant = novel_relevant.drop(columns=["gene_id", "gene_name"])

In [None]:
## Display the table again to check the columns that are left
novel_relevant

In [48]:
## Write the table as a tab separated file without header and without index
novel_relevant.to_csv(
    "../novel_gene_body_processed_data/novel_med_relevant_transcript_annotation_GRCh38.gtf",
    sep="\t",
    header=False,
    index=False,
)