# Table of Contents

## - Import libraries and define functions + Initial setup

## - Pre-process files

## - Make table

# Import libraries and define functions + Initial setup

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import csv
import os
import glob


## Lift pandas' display limits: no cap on the rows shown nor on the width of a cell
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [2]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    """
    Return a copy of a raw counts table with short, consistent column names.

    Every count column is renamed to the part of its name that precedes
    "_mapped", followed by "_counts".

    input:
        df: raw counts dataframe. For a transcript table the first two columns
            are treated as the identifier columns; for a gene table the index
            is treated as the gene id and every column as a count column.
        is_gene: True for a gene counts table, False (default) for transcripts.

    output: a new dataframe with the renamed columns; the input is not modified.
    """
    renamed = df.copy()

    if is_gene:
        ## Every incoming column is a count column; the ids sit in the index
        sample_columns = list(renamed.columns)
        leading_names = ["gene_id"]

        ## Append the index as a "gene_id" column, then rotate the last column to the front
        renamed["gene_id"] = renamed.index
        current_order = list(renamed.columns)
        renamed = renamed[current_order[-1:] + current_order[:-1]]
        renamed = renamed.reset_index(drop=True)
    else:
        ## The first two columns hold the ids, everything after them is counts
        sample_columns = list(renamed.columns[2:])
        leading_names = ["transcript_id", "gene_id"]

    ## Strip everything from "_mapped" onwards and tag each column as counts
    renamed.columns = leading_names + [
        name.split("_mapped")[0] + "_counts" for name in sample_columns
    ]

    return renamed

In [3]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def _gtf_attribute(attributes, key, terminator='"'):
    """Return the value following `key "` in each attribute string, cut at `terminator`; NaN where absent."""
    after_key = attributes.str.split(key + ' "', expand=True)

    ## If no row carries the attribute the split yields a single column, so there is nothing to read
    if 1 not in after_key.columns:
        return pd.Series(np.nan, index=attributes.index, dtype=object)

    return after_key[1].str.split(terminator, expand=True)[0]


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False, delete_other=True):
    """
    Parse the aggregate "other" column of a gtf/gff dataframe into separate columns.

    The dataframe is modified IN PLACE (columns are added and dropped) and is also returned.

    input:
        df: dataframe holding a "raw" gtf/gff with the columns "other", "dot_1" and "dot_2".
        is_ref: if True, extract "gene_id", "gene_name" and "gene_biotype" and always drop
            "other", "dot_1" and "dot_2". If False, extract "gene_id", "transcript_id" and
            "exon_number" instead.
        is_transcript: only used when is_ref is True; also extract "transcript_id" and
            "transcript_biotype".
        is_prot: only used when is_ref and is_transcript are True; also extract "protein_id",
            "ccds_id" and "exon_number".
        delete_other: only used when is_ref is False; drop "other", "dot_1" and "dot_2".

    output: the same dataframe with the parsed columns. An attribute that is missing from a row
        (or from every row) is reported as NaN.
    """
    attributes = df["other"]

    ## Get gene ids: the last token (no spaces or quotes) of the text before the first '";',
    ## which assumes gene_id is the first attribute of every row
    df["gene_id"] = attributes.str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names (cut at '";' so that a quote inside the name does not truncate it)
        df["gene_name"] = _gtf_attribute(attributes, "gene_name", terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _gtf_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _gtf_attribute(attributes, "transcript_biotype")

            ## If is prot get protein_id, ccds_id and exon_number
            if is_prot:
                df["protein_id"] = _gtf_attribute(attributes, "protein_id")
                df["ccds_id"] = _gtf_attribute(attributes, "ccds_id")
                df["exon_number"] = _gtf_attribute(attributes, "exon_number")

        ## Drop "other" and the placeholder columns (delete_other is not consulted in this branch)
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    else:

        ## Get transcript ids
        df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _gtf_attribute(attributes, "exon_number")

        ## Drop "other" and the placeholder columns
        if delete_other:
            df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every missing value (e.g. None) to NaN.
    ## np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [4]:
"""
This function loads all salmon quant.sf files with the given filename from all subdirectories
of the provided parent directory and returns a merged counts and a merged TPM matrix.

Parameters:
filename (str): The name of the salmon file to load
parent_directory (str): The path to the parent directory containing the folders with salmon quant.sf files.

Returns:
tuple: A merged counts dataframe and a merged TPM dataframe.
"""
    
def load_salmon_data_and_merge(filename, parent_directory):
    """
    Load the salmon quantification file of every sample and merge them by transcript.

    One file called `filename` is looked up in each direct subdirectory of
    `parent_directory`. The sample name is the subdirectory name up to (not including)
    "_Aligned". Files are processed in sorted path order so the column order is reproducible.

    Parameters:
    filename (str): The name of the salmon file to load (e.g. "quant.sf").
    parent_directory (str): Path of the directory whose subfolders contain the salmon files.

    Returns:
    tuple: (counts, tpm) dataframes, each with a "transcript_id" column followed by one
        "<sample>_counts" / "<sample>_TPM" column per sample. Only transcripts present
        in every file are kept (inner merge).

    Raises:
    FileNotFoundError: if no subdirectory of `parent_directory` contains `filename`.
    """
    # Find the quantification file of every sample folder
    file_pattern = os.path.join(parent_directory, '*', filename)
    quant_files = sorted(glob.glob(file_pattern))

    if not quant_files:
        raise FileNotFoundError(f"No files matching '{file_pattern}' were found")

    merged_counts = None
    merged_tpm = None

    for quant_file in quant_files:

        ## Create column names from the name of the folder that holds the file
        sample_folder = os.path.basename(os.path.dirname(quant_file))
        sample_name = sample_folder.split("_Aligned")[0]
        tpm_name = sample_name + "_TPM"
        counts_name = sample_name + "_counts"

        ## Read the file once: column 0 is the transcript id, 3 the TPM and 4 the read count
        quant = pd.read_csv(quant_file, sep='\t', usecols=[0, 3, 4],
                            names=["transcript_id", tpm_name, counts_name],
                            low_memory=False, header=0)
        sample_counts = quant[["transcript_id", counts_name]]
        sample_tpm = quant[["transcript_id", tpm_name]]

        ## The first sample starts the merged tables, every other sample is joined onto them
        if merged_counts is None:
            merged_counts = sample_counts.copy()
            merged_tpm = sample_tpm.copy()
        else:
            merged_counts = merged_counts.merge(sample_counts, on="transcript_id", how="inner")
            merged_tpm = merged_tpm.merge(sample_tpm, on="transcript_id", how="inner")

    return merged_counts, merged_tpm

In [5]:
## Colour palette (hex codes) in the ggplot2 style, used for plotting
ggplot2_colors = [
    "#F8766D",
    "#CD9600",
    "#7CAE00",
    "#00BE67",
    "#00BFC4",
    "#00A9FF",
    "#C77CFF",
    "#FF61CC",
]

# Pre-process files

In [6]:
## ROSMAP data: folder holding one salmon output directory per sample, and the file to read in each
parent_dir = "../../../data/raw/1st_rebuttal_data/ROSMAP_illumina_DorsoLateralPreFrontalCortex_UNIQUE/salmon_alignment_mode/"
name = "quant.sf"

## Merge all samples into one counts table and one TPM table
df_counts_unique_rosmap, df_tpm_unique = load_salmon_data_and_merge(
    filename=name,
    parent_directory=parent_dir,
)

In [7]:
## Our aged-brain data: folder holding one salmon output directory per sample, and the file to read in each
parent_dir = "../../../data/raw/1st_rebuttal_data/CSHL_illumina_uky_aged_brain_with_our_extended_annotation_UNIQUE/salmon_alignment_mode/"
name = "quant.sf"

## Merge all samples into one counts table and one TPM table
## NOTE: df_tpm_unique is rebound here, so it now holds the TPM table of this dataset
df_counts_unique_ours, df_tpm_unique = load_salmon_data_and_merge(
    filename=name,
    parent_directory=parent_dir,
)

In [8]:
## Load the GTEx unique counts per transcript
df_counts_unique_gtex = pd.read_csv(
    "../../../data/raw/1st_rebuttal_data/GTEX_with_our_extended_annotation_quant/bambu_quant/uniqueCounts_transcript.txt",
    sep="\t",
)

## Keep the two identifier columns plus the BA9 brain samples only.
## BA9 sample "GTEX-T5JC-0011-R10A-SM-2TT23.FAK91589" is left out because it had only 46331 reads.
df_counts_unique_gtex = df_counts_unique_gtex.loc[:, [
    "TXNAME",
    "GENEID",
    "GTEX-1192X-0011-R10a-SM-4RXXZ.FAK49046_mapped_filtered_sorted",
    "GTEX-13X6J-0011-R10b-SM-5CEKT.FAK44896_mapped_filtered_sorted",
    "GTEX-14BIL-0011-R10a-SM-5EQV4.FAK49209_mapped_filtered_sorted",
    "GTEX-QDT8-0011-R10A-SM-2FKJB.FAK49182_mapped_filtered_sorted",
    "GTEX-15DCD-0011-R10b-SM-5S51M.FAK42101_mapped_filtered_sorted",
]]

## Shorten the column names (transcript table, so the first two columns are the ids)
df_counts_unique_gtex = fix_column_names(df_counts_unique_gtex, is_gene=False)

In [9]:
## For each dataset, add up per transcript every column whose name contains 'count'
df_counts_unique_rosmap["total_unique_counts_ROSMAP_short-reads"] = (
    df_counts_unique_rosmap.filter(regex='count').sum(axis=1)
)
df_counts_unique_ours["total_unique_counts_OURS_short-reads"] = (
    df_counts_unique_ours.filter(regex='count').sum(axis=1)
)
df_counts_unique_gtex["total_unique_counts_GTEx_long-reads"] = (
    df_counts_unique_gtex.filter(regex='count').sum(axis=1)
)

In [10]:
## Reduce each table to the transcript id and its total; the per-sample columns are dropped
df_counts_unique_rosmap = df_counts_unique_rosmap.loc[
    :, ["transcript_id", "total_unique_counts_ROSMAP_short-reads"]
].copy()
df_counts_unique_ours = df_counts_unique_ours.loc[
    :, ["transcript_id", "total_unique_counts_OURS_short-reads"]
].copy()
df_counts_unique_gtex = df_counts_unique_gtex.loc[
    :, ["transcript_id", "total_unique_counts_GTEx_long-reads"]
].copy()

In [11]:
## Join the three tables of totals, keeping only transcripts present in all of them
df = (
    df_counts_unique_rosmap
    .merge(df_counts_unique_ours, on="transcript_id", how="inner")
    .merge(df_counts_unique_gtex, on="transcript_id", how="inner")
)

# Make table

In [12]:
## Only keep new transcripts, i.e. those whose id starts with "Bambu"
df = df[df["transcript_id"].str.startswith("Bambu")].copy()

In [13]:
## Import names of high-confidence (hf) new transcripts
hf_transcripts = pd.read_csv("../../../references/high_confidence_transcripts.tsv", sep="\t")

## Create high-confidence transcript flag: True when the transcript id is in the hf list.
## The mask is assigned directly so the column has a real boolean dtype,
## rather than the object column that two masked assignments build up.
condition = df["transcript_id"].isin(hf_transcripts["transcript_id"])

df["is_high-confidence"] = condition

In [14]:
## Import the extended annotation (headerless, tab-separated GTF; lines starting with "#" are comments).
## "chr" is read as text so the string filter below works whatever the chromosome names are.
ref = pd.read_csv("../../../data/raw/nextflow_pipeline_output/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
                        dtype={"chr": str})

## Take away ERCCs.
## The filtered rows are copied because parse_df_columns edits the dataframe it receives in place;
## doing that on a slice of the original raises pandas' SettingWithCopyWarning.
ref = ref.loc[~ref["chr"].str.startswith("ERCC-")].copy()

## Parse the attribute column into gene_id, transcript_id and exon_number
ref = parse_df_columns(ref, is_ref=False)

## Only keep transcripts
ref_transcripts = ref.loc[ref["type"] == "transcript"].copy()

In [15]:
## Attach the chromosome and gene_id of each transcript from the annotation
## (inner join: transcripts missing from the annotation are dropped)
df = pd.merge(
    df,
    ref_transcripts[["gene_id", "transcript_id", "chr"]],
    on="transcript_id",
    how="inner",
)

In [16]:
## Add gene names and fill gene_name with gene_id for genes with no name

gene_names = pd.read_csv("../../../references/gene_names.tsv", sep="\t")

df = df.merge(gene_names, on=["gene_id", "chr"], how="left")

## Assign the result back: an in-place fillna on df['gene_name'] works on a temporary
## object and leaves df unchanged when pandas Copy-on-Write is active.
df['gene_name'] = df['gene_name'].fillna(df['gene_id'])

In [17]:
## Create the discovery_category column describing where each new transcript comes from

## Default: the gene id does not start with "Bambu"
df["discovery_category"] = "New from known"

## The gene id itself starts with "Bambu"
df.loc[df["gene_id"].str.startswith("Bambu"), "discovery_category"] = "New from new"

## Anything on the "MT" chromosome is labelled as mito, whatever its gene id
df.loc[df["chr"] == "MT", "discovery_category"] = "New from mito"

In [18]:
## Final column layout of the table: identifiers first, then the flags, then the three totals

new_col_order = [
    'chr',
    'gene_id',
    'gene_name',
    'transcript_id',
    'discovery_category',
    'is_high-confidence',
    'total_unique_counts_ROSMAP_short-reads',
    'total_unique_counts_OURS_short-reads',
    'total_unique_counts_GTEx_long-reads',
]


df = df.loc[:, new_col_order].copy()

In [19]:
## Write the final table as a tab-separated file, without the dataframe index

df.to_csv(
    path_or_buf="../../../data/processed/1st_rebuttal/new_transcript_unique_support/new_transcripts_unique_support.tsv",
    sep="\t",
    index=False,
)