# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import csv


## Lift the pandas display limits: no cap on the number of rows shown or on the width of a cell's text
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
'''
name: relative_transcript_abundance

purpose: calculate relative transcript abundance

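input: a dataframe with a gene_id column identifying each transcript's gene of origin and a total_CPM column with
the CPM of each transcript.

output: a merged dataframe in which total_CPM is renamed to total_CPM_transcript and two columns are added:
total_CPM_gene (the summed CPM of the gene) and relative_abundance_percent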
'''



def relative_transcript_abundance(df):
    """Express each transcript's CPM as a percentage of its gene's summed CPM.

    df: dataframe with a "gene_id" column and a "total_CPM" column (one row per transcript).

    Returns a new dataframe (inner merge of df with the per-gene totals) in which
    "total_CPM" is replaced by "total_CPM_transcript" and the columns
    "total_CPM_gene" and "relative_abundance_percent" are added.
    """

    ## Sum the CPM of all rows sharing a gene_id and label the result as the gene level total
    gene_totals = (
        df[["gene_id", "total_CPM"]]
        .groupby("gene_id")
        .sum()
        .rename(columns={"total_CPM": "total_CPM_gene"})
    )

    ## Attach the gene level total to every transcript row of that gene
    combined = df.merge(gene_totals, how="inner", on="gene_id")

    ## Calculate relative percent abundance for each transcript within its gene
    combined["relative_abundance_percent"] = (combined["total_CPM"] / combined["total_CPM_gene"]) * 100

    ## Keep the transcript level CPM under an explicit name and discard the generic one
    combined["total_CPM_transcript"] = combined["total_CPM"]

    return combined.drop(columns="total_CPM")

In [3]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    """Rename the columns of a raw counts table to short, consistent names.

    Every count column is renamed to the part of its name that precedes "_mapped",
    followed by "_counts".

    df: raw counts dataframe.
        - gene counts (is_gene=True): the index holds the gene ids and every column is a count column.
        - transcript counts (is_gene=False): the first two columns are renamed to
          "transcript_id" and "gene_id" (in that order); the remaining columns are count columns.
    is_gene: True when df is a gene level counts table.

    Returns the renamed dataframe. For gene counts this is a new dataframe with "gene_id" as
    first column and a fresh 0..n-1 index; the dataframe passed in is left untouched. For
    transcript counts the columns of the dataframe passed in are renamed in place and that same
    object is returned.
    """

    ## Check if this is a gene counts object
    if is_gene:

        count_columns = df.columns.tolist()
        id_columns = ["gene_id"]

        ## Work on a copy: adding the gene_id column to the caller's dataframe would leave a stray
        ## column behind and make a second call on the same raw dataframe fail
        df = df.copy()

        ## gene_id comes in as index for gene counts data, make it into the first column instead
        df["gene_id"] = df.index
        df = df[["gene_id"] + count_columns].reset_index(drop=True)

    ## If it is a transcript dataset
    else:
        count_columns = df.columns[2:].tolist()
        id_columns = ["transcript_id", "gene_id"]

    ## Build the full list of new names: identifier columns first, then the shortened count names
    new_names = id_columns + [col.split("_mapped")[0] + "_counts" for col in count_columns]

    ## Rename columns
    df.columns = new_names

    return df

In [4]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def _gtf_attribute(attributes, key, terminator='"'):
    """Pull the value that follows `key "` out of every attribute string.

    attributes: Series holding the raw attribute text of each record.
    key: attribute name to look for, e.g. "gene_name".
    terminator: text that ends the value; everything before its first occurrence is kept.

    Returns a Series aligned with attributes; records without the attribute get a missing value.
    """
    ## Only the first occurrence of the key matters, so a single split (n=1) is enough
    after_key = attributes.str.split(key + ' "', n=1, expand=True)

    ## When no record carries the attribute the split produces a single column, so there is no
    ## column 1 to read from; report every value as missing instead of raising a KeyError
    if 1 not in after_key.columns:
        return pd.Series(np.nan, index=attributes.index, dtype=object)

    return after_key[1].str.split(terminator, n=1, expand=True)[0]


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False, delete_other=True):
    """Split the aggregate "other" column of a gtf/gff dataframe into separate columns.

    df: dataframe with an "other" column holding the attribute text and, whenever columns are
        dropped, "dot_1" and "dot_2" columns. It is modified in place.
    is_ref: True adds gene_id, gene_name and gene_biotype and always drops "other", "dot_1"
        and "dot_2". False adds gene_id, transcript_id and exon_number.
    is_transcript: only used when is_ref is True; also adds transcript_id and transcript_biotype.
    is_prot: only used when is_ref and is_transcript are both True; also adds protein_id,
        ccds_id and exon_number.
    delete_other: only used when is_ref is False; drop "other", "dot_1" and "dot_2" when True.

    Returns the same dataframe object. An attribute that is absent from a record (or from every
    record) is stored as NaN.
    """
    attributes = df["other"]

    ## Get gene ids: the trailing run of characters without spaces or quotes that precedes the
    ## first '";', i.e. the value of the first attribute in the string
    df["gene_id"] = attributes.str.split('";', n=1, expand=True)[0].str.extract(r'([^ "]*$)', expand=False)

    if is_ref:

        ## Get gene names
        df["gene_name"] = _gtf_attribute(attributes, "gene_name", terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _gtf_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _gtf_attribute(attributes, "transcript_biotype")

            ## If is prot get protein_id, ccds_id and exon_number
            if is_prot:
                df["protein_id"] = _gtf_attribute(attributes, "protein_id")
                df["ccds_id"] = _gtf_attribute(attributes, "ccds_id")
                df["exon_number"] = _gtf_attribute(attributes, "exon_number")

        ## Drop "other" column (delete_other is not consulted on this path)
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    else:

        ## Get transcript ids
        df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _gtf_attribute(attributes, "exon_number")

        ## Drop "other" column
        if delete_other:
            df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Represent every missing value as NaN (np.nan: the np.NaN alias no longer exists in NumPy 2.0)
    for col in df.columns:
        missing = df[col].isnull()

        ## Columns without missing values have nothing to assign, so they are left alone
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [5]:
'''
function name: calculate_cpm

purpose: Calculate CPM for each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):
    """Add a CPM column for every count column of a counts table.

    df: counts dataframe; it is modified in place. The leading identifier columns are skipped:
        one column for gene counts, two columns for transcript counts.
    is_gene: True when df is a gene level counts table.

    Returns the same dataframe with the count columns rounded to 2 decimals and, for every count
    column, a column named by replacing "_counts" with "_CPM" that holds
    count / column total * 1,000,000 rounded to 2 decimals.
    """

    ## Everything after the identifier column(s) is treated as a count column
    first_count_position = 1 if is_gene else 2
    sample_columns = df.columns[first_count_position:].tolist()

    ## Loop through counts columns to calculate CPM and add to the dataframe
    for counts_name in sample_columns:

        ## Round the counts first; the CPM is computed from the rounded values
        rounded_counts = df[counts_name].round(2)
        df[counts_name] = rounded_counts

        fraction_of_total = rounded_counts / rounded_counts.sum()
        df[counts_name.replace("_counts", "_CPM")] = (fraction_of_total * 1000000).round(2)

    return df

In [6]:
## Open original reference: tab-separated, no header row, text after "#" treated as a comment,
## columns named by hand
original_ref = pd.read_csv(
    "../references/Homo_sapiens.GRCh38.113.gtf",
    sep="\t",
    header=None,
    comment="#",
    low_memory=False,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
)

## Keep a copy of the "gene" records only and parse it to get gene names and ids
orig_ref = parse_df_columns(original_ref[original_ref["type"] == "gene"].copy(), is_ref=True)

In [7]:
## Open extended reference to get new gene chromosomes (same layout as the original reference)
extended_ref = pd.read_csv(
    "../references/extended_annotations.gtf",
    sep="\t",
    header=None,
    comment="#",
    low_memory=False,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
)

## Parse the attribute column (extended_ref itself is modified by the call), then keep the
## gene_id and chr of every record whose gene_id contains "Bambu"
ext_ref = parse_df_columns(extended_ref, is_ref=False)
ext_ref = ext_ref.loc[ext_ref["gene_id"].str.contains("Bambu"), ["gene_id", "chr"]]

In [8]:
## Outer-merge the reference genes with the "Bambu" genes so that gene ids from either table
## end up in one gene_id / gene_name / chr lookup table
reference_key = pd.merge(
    orig_ref[["gene_id", "gene_name", "chr"]],
    ext_ref,
    how="outer",
    on=["gene_id", "chr"],
)

In [9]:
## Read the columns at positions 2, 3 and 4 (0-based) of the ours-vs-Glinos tracking file
glinos_comp = pd.read_csv(
    "../references/Our_SC_vs_glinos_annotation.tracking",
    delimiter="\t",
    header=None,
    usecols=[2, 3, 4],
    names=["glinos", "category_glinos", "ours"],
)

## Keep only the rows whose "ours" entry contains "Bambu"
glinos_comp_our_new_only = glinos_comp[glinos_comp["ours"].str.contains("Bambu")]

In [10]:
## Read the columns at positions 2, 3 and 4 (0-based) of the ours-vs-Leung tracking file
leung_comp = pd.read_csv(
    "../references/Our_SC_vs_leung_annotation.tracking",
    delimiter="\t",
    header=None,
    usecols=[2, 3, 4],
    names=["leung", "category_leung", "ours"],
)

## Keep only the rows whose "ours" entry contains "Bambu"
leung_comp_our_new_only = leung_comp[leung_comp["ours"].str.contains("Bambu")]

In [11]:
## Read the columns at positions 2, 3 and 4 (0-based) of the ours-vs-Heberle tracking file
heberle_comp = pd.read_csv(
    "../references/Our_SC_vs_heberle_annotation.tracking",
    delimiter="\t",
    header=None,
    usecols=[2, 3, 4],
    names=["heberle", "category_heberle", "ours"],
)

## Keep only the rows whose "ours" entry contains "Bambu"
heberle_comp_our_new_only = heberle_comp[heberle_comp["ours"].str.contains("Bambu")]

In [12]:
## Combine the three comparisons on our transcript entry; outer merges keep entries that are
## missing from one of the tables (inner was used before)
df = (
    heberle_comp_our_new_only
    .merge(glinos_comp_our_new_only, how="outer", on="ours")
    .merge(leung_comp_our_new_only, how="outer", on="ours")
)

In [13]:
## Flag rows whose Heberle category is "="; any other value, including a missing category, is False.
## Assigning the comparison directly gives a plain boolean column in one step instead of filling a
## brand-new column through two complementary masked assignments.
df["is_in_heberle"] = df["category_heberle"] == "="

In [14]:
## Flag rows whose Glinos category is "="; any other value, including a missing category, is False.
## Assigning the comparison directly gives a plain boolean column in one step instead of filling a
## brand-new column through two complementary masked assignments.
df["is_in_glinos"] = df["category_glinos"] == "="

In [15]:
## Flag rows whose Leung category is "="; any other value, including a missing category, is False.
## Assigning the comparison directly gives a plain boolean column in one step instead of filling a
## brand-new column through two complementary masked assignments.
df["is_in_leung"] = df["category_leung"] == "="

In [16]:
## True only when the row is flagged in Heberle, Glinos and Leung at the same time. The combined
## condition is assigned directly, so the column is boolean from the start rather than being
## filled by two complementary masked assignments.
df["is_in_all"] = (
    (df["is_in_heberle"] == True)
    & (df["is_in_glinos"] == True)
    & (df["is_in_leung"] == True)
)

In [17]:
## True when the row is flagged in at least one of Heberle, Glinos or Leung. The combined
## condition is assigned directly, so the column is boolean from the start rather than being
## filled by two complementary masked assignments.
df["is_in_any"] = (
    (df["is_in_heberle"] == True)
    | (df["is_in_glinos"] == True)
    | (df["is_in_leung"] == True)
)

In [18]:
## Take the text after the first ":" of the "ours" entry and split it on "|":
## the first field is stored as gene_id, the second as transcript_id
ours_fields = df["ours"].str.split(":", expand=True)[1].str.split("|", expand=True)
df["gene_id"] = ours_fields[0]
df["transcript_id"] = ours_fields[1]

In [19]:
## Add the gene_name and chr of each gene_id; the lookup table is de-duplicated first and the
## left merge keeps every row of df even when its gene_id is not in the lookup table
df = pd.merge(df, reference_key.drop_duplicates(), how="left", on="gene_id")

In [20]:
## Split the rows on whether the gene_id contains "BambuGene"
has_bambu_gene_id = df["gene_id"].str.contains("BambuGene")

df_new_from_new = df[has_bambu_gene_id].copy()

df_new_from_known = df[~has_bambu_gene_id].copy()

In [21]:
## Rows on chromosome "MT" go to the mito table; every other row (including rows without a chr)
## goes to the nuclear table
on_mt_chromosome = df_new_from_known["chr"] == "MT"
df_new_from_known_nuclear = df_new_from_known[~on_mt_chromosome].copy()
df_new_from_known_mito = df_new_from_known[on_mt_chromosome].copy()

In [22]:
## Print the number of rows in df_new_from_known_nuclear, then the number of rows matching each flag value
print("Number of new from known genes nuclear genes:", len(df_new_from_known_nuclear))
for summary_label, flag_column, flag_value in [
    ("Not exclusive to ours:", "is_in_any", True),
    ("Exclusive to ours:", "is_in_any", False),
    ("Found in Leung:", "is_in_leung", True),
    ("Found in Glinos:", "is_in_glinos", True),
    ("Found in Heberle:", "is_in_heberle", True),
    ("Found in All:", "is_in_all", True),
]:
    flag_matches = df_new_from_known_nuclear[flag_column] == flag_value
    print(summary_label, df_new_from_known_nuclear.loc[flag_matches].shape[0])

Number of new from known genes nuclear genes: 54
Not exclusive to ours: 10
Exclusive to ours: 44
Found in Leung: 5
Found in Glinos: 3
Found in Heberle: 9
Found in All: 2


In [23]:
## Print the number of rows in df_new_from_known_mito, then the number of rows matching each flag value
print("Number of new isoforms from known mito genes:", len(df_new_from_known_mito))
for summary_label, flag_column, flag_value in [
    ("Not exclusive to ours:", "is_in_any", True),
    ("Exclusive to ours:", "is_in_any", False),
    ("Found in Leung:", "is_in_leung", True),
    ("Found in Glinos:", "is_in_glinos", True),
    ("Found in Heberle:", "is_in_heberle", True),
    ("Found in All:", "is_in_all", True),
]:
    flag_matches = df_new_from_known_mito[flag_column] == flag_value
    print(summary_label, df_new_from_known_mito.loc[flag_matches].shape[0])

Number of new isoforms from known mito genes: 0
Not exclusive to ours: 0
Exclusive to ours: 0
Found in Leung: 0
Found in Glinos: 0
Found in Heberle: 0
Found in All: 0


In [24]:
## Print the number of rows in df_new_from_new, then the number of rows matching each flag value
## NOTE(review): the first label says "nuclear", but no chr filter is applied to df_new_from_new in this cell
print("Number of new from new genes nuclear genes:", len(df_new_from_new))
for summary_label, flag_column, flag_value in [
    ("Not exclusive to ours:", "is_in_any", True),
    ("Exclusive to ours:", "is_in_any", False),
    ("Found in Leung:", "is_in_leung", True),
    ("Found in Glinos:", "is_in_glinos", True),
    ("Found in Heberle:", "is_in_heberle", True),
    ("Found in All:", "is_in_all", True),
]:
    flag_matches = df_new_from_new[flag_column] == flag_value
    print(summary_label, df_new_from_new.loc[flag_matches].shape[0])

Number of new from new genes nuclear genes: 62
Not exclusive to ours: 0
Exclusive to ours: 62
Found in Leung: 0
Found in Glinos: 0
Found in Heberle: 0
Found in All: 0


In [25]:
## Show the column names of the comparison table
df.columns

Index(['heberle', 'category_heberle', 'ours', 'glinos', 'category_glinos',
       'leung', 'category_leung', 'is_in_heberle', 'is_in_glinos',
       'is_in_leung', 'is_in_all', 'is_in_any', 'gene_id', 'transcript_id',
       'gene_name', 'chr'],
      dtype='object')

In [26]:
## Copy the identifier columns, the per-annotation entries and the overlap flags, in output order
## (the category_* columns and the raw "ours" entry are left out)
df_output = df.loc[:, [
    "gene_id",
    "gene_name",
    "transcript_id",
    "chr",
    "heberle",
    "glinos",
    "leung",
    "is_in_heberle",
    "is_in_glinos",
    "is_in_leung",
    "is_in_all",
    "is_in_any",
]].copy()

In [27]:
## Write the comparison table as a tab-separated file without the index column
df_output.to_csv(
    "../data/processed/compare_annotations/ours_vs_heberle_vs_glinos_vs_leung_comparison.tsv",
    index=False,
    sep="\t",
)