# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import csv
import os
import glob


## Display all rows of pandas dataframes and do not limit the displayed column width
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    """
    Return a copy of a raw counts table with short, consistent column names.

    Every count column is renamed to the part of its name that precedes
    "_mapped", followed by "_counts".

    Parameters:
        df: raw counts dataframe. With is_gene=True every incoming column is
            treated as a count column and the index is moved into a leading
            "gene_id" column. Otherwise the first two columns are renamed to
            "transcript_id" and "gene_id" and the rest are count columns.
        is_gene: whether df holds gene-level counts.

    Returns:
        A new dataframe with the renamed columns; df itself is not modified.
    """
    renamed = df.copy()

    if is_gene:
        ## Every incoming column holds counts; the gene ids live in the index
        id_names = ["gene_id"]
        sample_columns = list(renamed.columns)

        ## Append the index as a column, rotate it to the front, then reset the index
        renamed["gene_id"] = renamed.index
        renamed = renamed[[renamed.columns[-1], *renamed.columns[:-1]]]
        renamed = renamed.reset_index(drop=True)
    else:
        ## The first two columns are the identifiers, everything after is counts
        id_names = ["transcript_id", "gene_id"]
        sample_columns = list(renamed.columns[2:])

    ## Build the full list of new names and apply it in one step
    renamed.columns = id_names + [name.split("_mapped")[0] + "_counts" for name in sample_columns]

    return renamed

In [3]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def _gtf_attribute(attributes, prefix, terminator='"'):
    """Return, per row, the text that follows the first `prefix` up to the next `terminator` (NaN where `prefix` is absent)."""
    pieces = attributes.str.split(prefix, expand=True)

    ## When no row contains the prefix the split yields a single column,
    ## so indexing column 1 would raise KeyError; return an all-NaN column instead
    if pieces.shape[1] < 2:
        return pd.Series(np.nan, index=attributes.index, dtype=object)

    return pieces[1].str.split(terminator, expand=True)[0]


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False, delete_other=True):
    """
    Parse the aggregate "other" attribute column of a GTF table into separate columns.

    The dataframe is modified in place and also returned.

    Parameters:
        df: dataframe with an "other" column holding attributes written as
            key "value"; pairs, plus "dot_1" and "dot_2" columns when they
            are to be dropped.
        is_ref: if True, add "gene_id", "gene_name" and "gene_biotype";
            if False, add "gene_id", "transcript_id" and "exon_number".
        is_transcript: with is_ref=True, also add "transcript_id" and
            "transcript_biotype".
        is_prot: with is_ref=True and is_transcript=True, also add
            "protein_id", "ccds_id" and "exon_number".
        delete_other: with is_ref=False, drop "other", "dot_1" and "dot_2"
            after parsing. NOTE: with is_ref=True these columns are always
            dropped, regardless of this flag.

    Returns:
        The same dataframe object with the parsed columns added. Attributes
        missing from a row (or from every row) are NaN.
    """
    attributes = df["other"]

    ## Get gene ids: the value of the first attribute of each row
    ## (everything after the last space or quote of the text before the first '";')
    df["gene_id"] = attributes.str.split('";', n=1).str[0].str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names
        df["gene_name"] = _gtf_attribute(attributes, 'gene_name "', terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _gtf_attribute(attributes, 'gene_biotype "')

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _gtf_attribute(attributes, 'transcript_id "')
            df["transcript_biotype"] = _gtf_attribute(attributes, 'transcript_biotype "')

            ## If is prot get protein id, CCDS id and exon number
            if is_prot:
                df["protein_id"] = _gtf_attribute(attributes, 'protein_id "')
                df["ccds_id"] = _gtf_attribute(attributes, 'ccds_id "')
                df["exon_number"] = _gtf_attribute(attributes, 'exon_number "')

        ## Drop "other" and the placeholder columns (always, in this branch)
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    else:

        ## Get transcript ids
        df["transcript_id"] = _gtf_attribute(attributes, 'transcript_id "')

        ## Get exon number
        df["exon_number"] = _gtf_attribute(attributes, 'exon_number "')

        ## Drop "other" and the placeholder columns only when requested
        if delete_other:
            df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalize every missing value (e.g. None left by the string splits) to NaN.
    ## np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [4]:
'''
function name: calculate_cpm

purpose: Calculate CPM for each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):
    """
    Return a copy of a counts table with one CPM column added per count column.

    Each count column is first rounded to 2 decimals. Its CPM column (same
    name with "_counts" replaced by "_CPM") is the rounded count divided by
    the column total, multiplied by 1,000,000 and rounded to 2 decimals.

    Parameters:
        df: counts dataframe whose count columns start after the first column
            (is_gene=True) or after the first two columns (is_gene=False).
        is_gene: whether df holds gene-level counts.

    Returns:
        A new dataframe with rounded counts and the added CPM columns; df
        itself is not modified.
    """
    result = df.copy()

    ## Gene tables carry one leading id column, transcript tables carry two
    first_count_position = 1 if is_gene else 2

    ## Snapshot the count column names before CPM columns start being appended
    for name in list(result.columns[first_count_position:]):

        rounded_counts = result[name].round(2)
        result[name] = rounded_counts
        result[name.replace("_counts", "_CPM")] = (rounded_counts / rounded_counts.sum() * 1000000).round(2)

    return result

# - Create high-confidence annotations

In [5]:
## Read the tab-separated transcript-level counts table and normalize its column names
df = fix_column_names(
    pd.read_csv(
        "../../../data/raw/nextflow_pipeline_output/bambu_discovery/counts_transcript.txt",
        sep="\t",
        header=0,
        low_memory=False,
    ),
    is_gene=False,
)

In [6]:
## Add a CPM column for every sample
df = calculate_cpm(df, is_gene=False)

## Keep only the columns whose name does not match "counts"
df = df.drop(columns=df.filter(regex='counts').columns).copy()

In [7]:
## Row-wise median over the columns whose name matches "<digit>_CPM"
df["median_CPM"] = df.filter(regex='[0-9]_CPM').median(axis=1)

In [8]:
## Keep only the transcripts whose id starts with "Bambu"
df_new = df[df["transcript_id"].str.startswith("Bambu")].copy()

In [9]:
## High-confidence new transcripts: median CPM strictly greater than 1
df_new_high_confidence = df_new[df_new["median_CPM"].gt(1)].copy()

In [10]:
## Import and parse through extended annotations
ref = pd.read_csv("../../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Drop rows whose chromosome name starts with "ERCC-".
## Take an explicit copy: parse_df_columns adds and drops columns in place, and doing
## that on a filtered selection of another dataframe triggers SettingWithCopyWarning.
ref = ref.loc[~ref["chr"].str.startswith("ERCC-")].copy()

## Split the attribute column into gene_id / transcript_id / exon_number
ref = parse_df_columns(ref, is_ref=False)

## Keep only the transcript-level rows
ref_transcripts = ref.loc[ref["type"] == "transcript"].copy()

In [11]:
## Load the original reference annotation
original_ref = pd.read_csv(
    "../../../references/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    comment="#",
    low_memory=False,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
)

## Discard rows whose chromosome name starts with "ERCC-"
original_ref = original_ref[~original_ref["chr"].str.startswith("ERCC-")]

## Parse the gene rows of the reference to get gene names and ids
orig_ref = parse_df_columns(original_ref[original_ref["type"] == "gene"].copy(), is_ref=True)

## Lookup table of gene name, gene id and chromosome
gene_names = orig_ref[["gene_name", "gene_id", "chr"]].copy()

In [12]:
## Attach the chromosome of each high-confidence transcript
## (inner join: transcripts absent from ref_transcripts are dropped)

df_new_high_confidence = pd.merge(df_new_high_confidence, ref_transcripts[["transcript_id", "chr"]],
                                  on="transcript_id", how="inner")

In [13]:
## Ids of the mitochondrial transcripts that are kept as high-confidence
list_mito_high_confidence = ["BambuTx1845", "BambuTx1846", "BambuTx1847", "BambuTx1848", "BambuTx1850"]

## Set aside the rows of those transcripts
df_mito_high_confidence = df_new_high_confidence[
    df_new_high_confidence["transcript_id"].isin(list_mito_high_confidence)
].copy()

In [14]:
## Drop every transcript located on the "MT" chromosome
df_new_high_confidence = df_new_high_confidence[df_new_high_confidence["chr"].ne("MT")].copy()

In [15]:
## Append the mitochondrial transcripts that were set aside as high-confidence
df_new_high_confidence = pd.concat([df_new_high_confidence, df_mito_high_confidence], axis=0)

In [16]:
## Remove every column whose name contains "_CPM"
df_new_high_confidence = df_new_high_confidence.drop(df_new_high_confidence.filter(like='_CPM').columns, axis=1)

In [17]:
## Look up gene_name by gene id and chromosome (left join keeps transcripts without a match)
df_new_high_confidence = pd.merge(df_new_high_confidence, gene_names, how="left", on=["gene_id", "chr"])

In [18]:
## Keep the four output columns, in their final order
df_new_high_confidence = df_new_high_confidence.loc[:, ["chr", "gene_name", "gene_id", "transcript_id"]].copy()

In [19]:
## Fill NAs in gene_name with gene_id
## Assign the result back instead of calling fillna(inplace=True) on the selected column:
## that chained in-place call warns on recent pandas and leaves the dataframe unchanged
## when Copy-on-Write is enabled.
df_new_high_confidence['gene_name'] = df_new_high_confidence['gene_name'].fillna(df_new_high_confidence['gene_id'])

In [20]:
## Write the high-confidence transcript table as a tab-separated file without the index
df_new_high_confidence.to_csv("../../../references/high_confidence_transcripts.tsv", sep="\t", index=False)

In [21]:
## Write the gene name lookup table as a tab-separated file without the index
gene_names.to_csv("../../../references/gene_names.tsv", sep="\t", index=False)