In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
from wordcloud import WordCloud
import seaborn as sns
import csv

## Lift the row limit and the column-width limit used when dataframes are displayed
for option_name in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)

In [2]:
def calculate_cpm(df, is_gene=False):
    """
    function name: calculate_cpm

    purpose: Add a counts-per-million (CPM) column for every counts column.

    Every column after the leading identifier column(s) is rounded to two
    decimals in place. For each of them a column is written whose name is the
    source name with "_counts" replaced by "_CPM" and whose values are the
    rounded counts divided by their column total, times 1,000,000, rounded to
    two decimals. A source name without "_counts" maps to itself, so in that
    case the CPM values replace the counts.

    input:
        df: counts dataframe (modified in place)
        is_gene: True when only the first column is an identifier (gene
                 counts); False when the first two columns are identifiers
                 (transcript counts)

    output: the same dataframe object, now also holding the CPM columns
    """

    ## Gene tables start their counts at column 1, transcript tables at column 2
    first_counts_position = 1 if is_gene else 2

    ## Freeze the list before adding columns so the new CPM columns are not revisited
    sample_columns = list(df.columns[first_counts_position:])

    for sample in sample_columns:

        rounded_counts = round(df[sample], 2)
        df[sample] = rounded_counts

        column_total = rounded_counts.sum()
        df[sample.replace("_counts", "_CPM")] = round((rounded_counts / column_total) * 1000000, 2)

    return df

In [3]:
def fix_column_names(df, is_gene=False):
    """
    function name: fix_column_names

    purpose: Give the raw counts table short, consistent column names.

    The identifier columns are renamed by position ("gene_id" for gene counts;
    "transcript_id", "gene_id" for transcript counts). Every remaining column
    keeps the part of its name before "_mapped" and gets the suffix "_counts".

    input:
        df: raw counts dataframe for either genes or transcripts
        is_gene: True for a gene counts table (one identifier column),
                 False for a transcript counts table (two identifier columns)

    output:
        dataframe with the new column names. For transcript tables this is the
        input object, renamed in place. For gene tables it is a new dataframe
        with a fresh 0..n-1 index and the input is left untouched.
    """

    if is_gene:
        ## The first column is taken to hold the gene ids -- TODO confirm against the raw file.
        ## It is renamed by position instead of being copied, dropped and moved back to the
        ## front: that round trip lost the ids whenever the column was already called "gene_id".
        df = df.reset_index(drop=True)
        id_names = ["gene_id"]

    else:
        id_names = ["transcript_id", "gene_id"]

    ## Everything after the identifier columns is a per-sample counts column
    sample_names = [name.split("_mapped")[0] + "_counts" for name in df.columns[len(id_names):]]

    ## Rename columns
    df.columns = id_names + sample_names

    return df

In [4]:
def _gtf_attribute(attributes, key):
    """Return, per row, the text between the quote after `key ` and the next quote (NaN if absent)."""
    return attributes.str.extract(key + ' "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True):
    """
    function name: parse_df_columns

    purpose: Parse the aggregate attribute column ("other") of a GTF table
             into separate identifier columns.

    input:
        df: dataframe holding the raw GTF records, with the attribute text in
            a column named "other" (modified in place)
        is_ref: True adds "gene_id", "transcript_id" and "biotype" (read from
                the gene_biotype attribute) and then drops duplicated rows;
                False adds "gene_id" (the value of the first attribute in the
                string) and "transcript_id"

    output: the same dataframe object with the parsed columns added; rows
            lacking an attribute get NaN in the corresponding column
    """

    if is_ref:

        ## Get gene ids, transcript ids and gene biotype
        df["gene_id"] = _gtf_attribute(df["other"], "gene_id")
        df["transcript_id"] = _gtf_attribute(df["other"], "transcript_id")
        df["biotype"] = _gtf_attribute(df["other"], "gene_biotype")

        ## Drop duplicates
        df.drop_duplicates(inplace=True)

    else:

        ## The gene id is read as the value of the FIRST attribute: take the text before the
        ## first '";' and keep its trailing run of characters that are neither space nor quote
        first_attribute = df["other"].str.split('";').str[0]
        df["gene_id"] = first_attribute.str.extract(r'([^ "]*$)', expand=False)

        ## Get transcript ids
        df["transcript_id"] = _gtf_attribute(df["other"], "transcript_id")

    ## Represent every missing value as NaN; columns without missing values are not touched
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [5]:
## Load the transcript counts matrix and standardize its column names

grch38_counts = fix_column_names(
    pd.read_csv(
        "../../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/counts_transcript.txt",
        sep="\t", header=0, low_memory=False),
    is_gene=False)

In [7]:
## Sum the columns at positions 2-5 into a per-row total
## (assumes exactly four per-sample counts columns follow the two id columns -- TODO confirm)
grch38_counts["total_counts"] = grch38_counts.iloc[:, 2:6].sum(axis=1)

In [8]:
## Add the CPM columns, then keep only the columns whose name does not contain "counts"
grch38_counts = calculate_cpm(grch38_counts, is_gene=False)

is_counts_column = grch38_counts.columns.str.contains('counts')
grch38_counts = grch38_counts.loc[:, ~is_counts_column]

In [9]:
## The CPM of the summed counts is what the rest of the notebook calls avg_CPM
grch38_counts = grch38_counts.rename(columns={"total_CPM": "avg_CPM"})

In [10]:
## Flag rows whose transcript id starts with "BambuTx".
## Assigning the comparison result directly yields a real boolean column; filling the column
## through two masked .loc assignments goes through a partly filled intermediate column.
grch38_counts["is_novel_transcript"] = grch38_counts["transcript_id"].str.startswith("BambuTx", na=False)

## Flag rows whose gene id starts with "BambuGene"
grch38_counts["is_novel_gene"] = grch38_counts["gene_id"].str.startswith("BambuGene", na=False)

In [11]:
## Split out (a) rows belonging to novel genes and (b) novel transcripts of genes that are not novel.
## The flags are cast to bool first so that `~` is a logical negation whatever dtype the
## flag columns are stored with.
from_novel_gene = grch38_counts["is_novel_gene"].astype(bool)
novel_transcript = grch38_counts["is_novel_transcript"].astype(bool)

grch38_novel_genes = grch38_counts.loc[from_novel_gene].copy()
grch38_novel_transcripts = grch38_counts.loc[novel_transcript & ~from_novel_gene].copy()

In [12]:
## Report the number of rows in each novel table
## NOTE(review): rows come from the transcript counts matrix, so the first figure presumably
## counts transcripts assigned to novel genes rather than distinct genes -- confirm wording
n_novel_gene_rows = len(grch38_novel_genes)
n_novel_transcript_rows = len(grch38_novel_transcripts)

print("We found", n_novel_gene_rows, "novel genes in GRCh38")
print("We found", n_novel_transcript_rows, "novel transcripts in GRCh38")

We found 68 novel genes in GRCh38
We found 256 novel transcripts in GRCh38


In [13]:
## Keep the novel rows whose avg_CPM is above 1
genes_above_avg_threshold = grch38_novel_genes["avg_CPM"].gt(1)
grch38_novel_genes_filtered_avg = grch38_novel_genes.loc[genes_above_avg_threshold].copy()

transcripts_above_avg_threshold = grch38_novel_transcripts["avg_CPM"].gt(1)
grch38_novel_transcripts_filtered_avg = grch38_novel_transcripts.loc[transcripts_above_avg_threshold].copy()

In [14]:
## Report how many rows of each novel table pass the avg_CPM filter
n_genes_avg = len(grch38_novel_genes_filtered_avg)
n_transcripts_avg = len(grch38_novel_transcripts_filtered_avg)

print("We found", n_genes_avg, "novel genes in GRCh38 filtered avg")
print("We found", n_transcripts_avg, "novel transcripts in GRCh38 filtered avg")

We found 42 novel genes in GRCh38 filtered avg
We found 235 novel transcripts in GRCh38 filtered avg


In [15]:
## Keep novel rows whose value is above 1 in all five columns at positions 2-6
## (presumably the four per-sample CPM columns plus avg_CPM -- TODO confirm the column layout)
cpm_cols = grch38_novel_genes.columns[2:7].tolist()

## .copy() keeps these subsets independent of their parent frames, like the avg-filtered subsets
grch38_novel_genes_filtered_each = grch38_novel_genes[
    (grch38_novel_genes[cpm_cols] > 1).sum(axis=1) >= 5].copy()
grch38_novel_transcripts_filtered_each = grch38_novel_transcripts[
    (grch38_novel_transcripts[cpm_cols] > 1).sum(axis=1) >= 5].copy()

In [16]:
## Report how many rows of each novel table pass the per-column filter
n_genes_each = len(grch38_novel_genes_filtered_each)
n_transcripts_each = len(grch38_novel_transcripts_filtered_each)

print("We found", n_genes_each, "novel genes in GRCh38 filtered each")
print("We found", n_transcripts_each, "novel transcripts in GRCh38 filtered each")

We found 23 novel genes in GRCh38 filtered each
We found 161 novel transcripts in GRCh38 filtered each


In [17]:
## Annotated transcripts: rows flagged neither as a novel transcript nor as part of a novel gene
not_novel_transcript = grch38_counts["is_novel_transcript"].eq(False)
not_from_novel_gene = grch38_counts["is_novel_gene"].eq(False)

grch38_annotated_transcript = grch38_counts.loc[not_novel_transcript & not_from_novel_gene].copy()

In [18]:
## Report the number of annotated transcript rows
n_annotated = len(grch38_annotated_transcript)
print("We found", n_annotated, "annotated transcripts in GRCh38")

We found 251213 annotated transcripts in GRCh38


In [19]:
## Apply the same two filters to the annotated transcripts:
##   avg  -> avg_CPM above 1
##   each -> value above 1 in all five cpm_cols columns
## Both subsets are copied so neither stays tied to the parent frame.
grch38_annotated_transcript_filtered_avg = grch38_annotated_transcript[
    (grch38_annotated_transcript["avg_CPM"] > 1)].copy()
grch38_annotated_transcript_filtered_each = grch38_annotated_transcript[
    (grch38_annotated_transcript[cpm_cols] > 1).sum(axis=1) >= 5].copy()

In [20]:
## Report how many annotated transcript rows pass each filter
n_annotated_avg = len(grch38_annotated_transcript_filtered_avg)
n_annotated_each = len(grch38_annotated_transcript_filtered_each)

print("We found", n_annotated_avg, "annotated transcripts in GRCh38 filtered average")
print("We found", n_annotated_each, "annotated transcripts in GRCh38 filtered each")

We found 27743 annotated transcripts in GRCh38 filtered average
We found 18339 annotated transcripts in GRCh38 filtered each


In [21]:
## Load the original reference GTF as a nine-column table, skipping "#" comment lines
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

orig_ref = pd.read_csv(
    "../../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t", header=None, names=gtf_columns, comment="#", low_memory=False)

In [22]:
## Keep only transcript and exon records, then parse ids and biotype out of the attribute column
is_transcript_or_exon = orig_ref["type"].isin(["transcript", "exon"])
orig_ref = parse_df_columns(orig_ref.loc[is_transcript_or_exon].copy(), is_ref=True)

In [23]:
## Collect the transcript ids of each subgroup
## NOTE(review): "biotype" is parsed from the gene_biotype attribute, so this selects transcripts
## of protein-coding genes, not transcripts whose own biotype is protein_coding -- confirm intent
is_protein_coding = orig_ref["biotype"] == "protein_coding"

protein_coding_names = orig_ref.loc[is_protein_coding, "transcript_id"]
expressed_avg_names = grch38_annotated_transcript_filtered_avg["transcript_id"]
expressed_each_names = grch38_annotated_transcript_filtered_each["transcript_id"]

In [24]:
## Keep the expressed transcript ids that also appear among the protein coding ids
avg_is_protein_coding = expressed_avg_names.isin(protein_coding_names)
protein_coding_expressed_avg_ids = expressed_avg_names[avg_is_protein_coding].copy()

each_is_protein_coding = expressed_each_names.isin(protein_coding_names)
protein_coding_expressed_each_ids = expressed_each_names[each_is_protein_coding].copy()

In [25]:
## Reduce each id series to its distinct values, in order of first appearance
protein_coding_expressed_avg_ids = pd.unique(protein_coding_expressed_avg_ids)
protein_coding_expressed_each_ids = pd.unique(protein_coding_expressed_each_ids)

In [26]:
## Report how many distinct protein coding transcript ids pass each filter
n_protein_coding_avg = len(protein_coding_expressed_avg_ids)
n_protein_coding_each = len(protein_coding_expressed_each_ids)

print("We found", n_protein_coding_avg, "annotated protein coding transcripts in GRCh38 filtered average")
print("We found", n_protein_coding_each, "annotated protein coding transcripts in GRCh38 filtered each")

We found 25272 annotated protein coding transcripts in GRCh38 filtered average
We found 17134 annotated protein coding transcripts in GRCh38 filtered each


In [27]:
## Build the GTF subsets: reference rows of the retained transcripts, without the parsed helper columns
parsed_columns = ["gene_id", "transcript_id", "biotype"]

in_avg_set = orig_ref["transcript_id"].isin(protein_coding_expressed_avg_ids)
ref_protein_coding_avg = orig_ref.loc[in_avg_set].drop(columns=parsed_columns)

in_each_set = orig_ref["transcript_id"].isin(protein_coding_expressed_each_ids)
ref_protein_coding_each = orig_ref.loc[in_each_set].drop(columns=parsed_columns)

In [28]:
## Write both protein coding annotation subsets as tab-separated text with quoting disabled
gtf_write_options = dict(header=None, index=None, sep="\t", quoting=csv.QUOTE_NONE)

ref_protein_coding_avg.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_annotated_protein_coding_transcripts/annotated_protein_coding_AVG_cpm_greater_than_one.gtf"),
    **gtf_write_options)

ref_protein_coding_each.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_annotated_protein_coding_transcripts/annotated_protein_coding_EACH_cpm_greater_than_one.gtf"),
    **gtf_write_options)

In [29]:
## Build the CPM tables of the retained transcripts, without the novelty flag columns
flag_columns = ["is_novel_transcript", "is_novel_gene"]

cpm_in_avg_set = grch38_counts["transcript_id"].isin(protein_coding_expressed_avg_ids)
protein_coding_filtered_avg_counts = grch38_counts.loc[cpm_in_avg_set].drop(columns=flag_columns)

cpm_in_each_set = grch38_counts["transcript_id"].isin(protein_coding_expressed_each_ids)
protein_coding_filtered_each_counts = grch38_counts.loc[cpm_in_each_set].drop(columns=flag_columns)

In [30]:
## Write both protein coding CPM tables as tab-separated text
cpm_write_options = dict(index=None, sep="\t")

protein_coding_filtered_avg_counts.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_annotated_protein_coding_transcripts/annotated_protein_coding_AVG_cpm_greater_than_one.cpm"),
    **cpm_write_options)

protein_coding_filtered_each_counts.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_annotated_protein_coding_transcripts/annotated_protein_coding_EACH_cpm_greater_than_one.cpm"),
    **cpm_write_options)

In [31]:
## Load the Bambu extended annotation GTF as a nine-column table, skipping "#" comment lines
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]

bambu_ref = pd.read_csv(
    "../../../../data/bernardo/raw/uky_aged_stringent/bambu_discovery/extended_annotations.gtf",
    sep="\t", header=None, names=gtf_columns, comment="#", low_memory=False)

In [32]:
## Parse gene_id and transcript_id out of the attribute column of the Bambu annotation
bambu_ref = parse_df_columns(df=bambu_ref, is_ref=False)

In [33]:
## Store both coordinate columns of the Bambu annotation as 64-bit integers
for coordinate in ("start", "end"):
    bambu_ref[coordinate] = bambu_ref[coordinate].astype("int64")

In [34]:
## Stack the novel-gene rows on top of the novel-transcript rows, once per filtering level
novel_filtered_each = pd.concat(
    objs=[grch38_novel_genes_filtered_each, grch38_novel_transcripts_filtered_each])

novel_filtered_avg = pd.concat(
    objs=[grch38_novel_genes_filtered_avg, grch38_novel_transcripts_filtered_avg])

novel_all = pd.concat(
    objs=[grch38_novel_genes, grch38_novel_transcripts])

In [35]:
## Remove the novelty flag columns from the three stacked tables
novelty_flags = ["is_novel_gene", "is_novel_transcript"]

novel_filtered_each = novel_filtered_each.drop(columns=novelty_flags)
novel_filtered_avg = novel_filtered_avg.drop(columns=novelty_flags)
novel_all = novel_all.drop(columns=novelty_flags)

In [36]:
## Keep the annotation rows whose transcript id appears in each novel table
in_each_table = bambu_ref["transcript_id"].isin(novel_filtered_each["transcript_id"])
ref_novel_filtered_each = bambu_ref[in_each_table].copy()

in_avg_table = bambu_ref["transcript_id"].isin(novel_filtered_avg["transcript_id"])
ref_novel_filtered_avg = bambu_ref[in_avg_table].copy()

in_unfiltered_table = bambu_ref["transcript_id"].isin(novel_all["transcript_id"])
ref_novel_all = bambu_ref[in_unfiltered_table].copy()

In [37]:
## Remove the parsed id columns so only the original annotation columns remain
parsed_id_columns = ["gene_id", "transcript_id"]

ref_novel_filtered_each = ref_novel_filtered_each.drop(columns=parsed_id_columns)
ref_novel_filtered_avg = ref_novel_filtered_avg.drop(columns=parsed_id_columns)
ref_novel_all = ref_novel_all.drop(columns=parsed_id_columns)

In [38]:
## Write the three novel annotation subsets as tab-separated text with quoting disabled
gtf_write_options = dict(header=None, index=None, sep="\t", quoting=csv.QUOTE_NONE)

ref_novel_filtered_each.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_novel_genes_and_transcripts/new_rna_EACH_cpm_greater_than_one.gtf"),
    **gtf_write_options)

ref_novel_filtered_avg.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_novel_genes_and_transcripts/new_rna_AVG_cpm_greater_than_one.gtf"),
    **gtf_write_options)

## NOTE(review): this file name contains "UNFILTEREDrm" while the matching .cpm file uses
## "UNFILTERED_" -- looks like a typo, but left as is because the path is part of the output; confirm
ref_novel_all.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "unfiltered_novel_genes_and_transcripts/new_rna_UNFILTEREDrm_cpm_greater_than_one.gtf"),
    **gtf_write_options)

In [39]:
## Write the three novel CPM tables as tab-separated text
cpm_write_options = dict(index=None, sep="\t")

novel_filtered_each.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_novel_genes_and_transcripts/new_rna_EACH_cpm_greater_than_one.cpm"),
    **cpm_write_options)

novel_filtered_avg.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "filtered_novel_genes_and_transcripts/new_rna_AVG_cpm_greater_than_one.cpm"),
    **cpm_write_options)

novel_all.to_csv(
    ("../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
     "unfiltered_novel_genes_and_transcripts/new_rna_UNFILTERED_cpm_greater_than_one.cpm"),
    **cpm_write_options)

In [40]:
## Remove the novelty flag columns from the full CPM matrix before saving it
grch38_counts = grch38_counts.drop(columns=["is_novel_gene", "is_novel_transcript"])

In [41]:
## Write the full, unfiltered CPM matrix as tab-separated text
unfiltered_cpm_path = (
    "../../../../data/bernardo/processed/99.other/create_annotations/annotations_and_quant_for_mark_and_maddy/"
    "original_data/unfiltered_all_transcripts.cpm")

grch38_counts.to_csv(unfiltered_cpm_path, sep="\t", index=None)