# Library Import and Functions

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud


## Show every row and the full content of every cell when displaying dataframes
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
def relative_transcript_abundance(df):
    '''
    name: relative_transcript_abundance

    purpose: calculate, for every count column, the percentage of its gene's total
    counts that each transcript accounts for

    input: a dataframe with a gene_id column identifying the transcript gene of origin and
    one or more count columns whose names match the regex '[0-9]_count'
    (for example "sample_1_counts").

    output: a new dataframe (the input is not modified) in which every count column
    "<prefix>_count..." gains two columns: "<prefix>_total_gene_counts" (sum of that
    column over all rows sharing the gene_id) and "<prefix>_relative_abundance"
    (count / gene total * 100). Rows are inner-joined to the per-gene totals on gene_id.
    When no count column is found, an unchanged copy of the input is returned.
    '''

    count_columns = df.filter(regex='[0-9]_count').columns

    ## Nothing to compute: return a copy instead of failing on an unassigned result
    if len(count_columns) == 0:
        return df.copy()

    merged_df = df

    for col in count_columns:

        ## Both derived column names use the same prefix (text before the first "_count")
        prefix = col.split("_count")[0]
        col_gene_name = prefix + "_total_gene_counts"
        col_relative_abundance = prefix + "_relative_abundance"

        ## Per-gene totals for this column, indexed by gene_id
        gene_totals = df[["gene_id", col]].groupby("gene_id").sum()
        gene_totals = gene_totals.rename(columns={col: col_gene_name})

        merged_df = pd.merge(merged_df, gene_totals, how='inner', on="gene_id")

        ## 0 / 0 (a gene without any counts in this column) yields NaN
        merged_df[col_relative_abundance] = (merged_df[col] / merged_df[col_gene_name]) * 100

    return merged_df

In [3]:
def fix_column_names(df, is_gene=False):
    '''
    function name: fix_column_names

    purpose: Fixing the column names, making them smaller, informative, and consistent

    input: The raw counts dataframe for either genes or transcripts.
    With is_gene=True every column is treated as a count column and the index is
    taken as the gene id. With is_gene=False the first two columns are renamed to
    "transcript_id" and "gene_id" (in that order) and the rest are count columns.

    output: A new dataframe (the input is left untouched) with improved column names:
    each count column becomes the text before "_mapped" plus "_counts". For gene
    counts the gene id is moved from the index into a leading "gene_id" column and
    the index is reset.
    '''

    ## Work on a copy so the caller's dataframe is never modified
    df = df.copy()

    ## Check if this is a gene counts object
    if is_gene:

        ## Every incoming column is a count column
        count_columns = df.columns.tolist()
        list_new_names = ["gene_id"]

        ## gene_id comes in as index for gene counts data, make it into the first column instead
        df.insert(0, "gene_id", df.index)
        df = df.reset_index(drop=True)

    ## If it is a transcript dataset
    else:
        ## The first two columns hold the ids, everything after them is a count column
        count_columns = df.columns[2:].tolist()
        list_new_names = ["transcript_id", "gene_id"]

    ## Shorten every count column name and append it after the id column name(s)
    list_new_names += [col.split("_mapped")[0] + "_counts" for col in count_columns]

    ## Rename columns
    df.columns = list_new_names

    return df

In [4]:
def _gtf_attribute(attributes, key):
    ## Quoted value following `key` in each attribute string; NaN where the key is absent
    return attributes.str.extract(key + ' "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

    input: dataframe containing "raw" gtf/gff with at least the columns "other"
    (the attribute string), "dot_1" and "dot_2".
    is_ref=True adds gene_id, gene_name and gene_biotype; is_transcript=True also adds
    transcript_id and transcript_biotype; is_prot=True (only read when is_transcript is
    True) also adds protein_id, ccds_id and exon_number.
    is_ref=False adds gene_id, transcript_id and exon_number.

    output: the same dataframe (modified in place and returned) with the new columns
    added and "other", "dot_1", "dot_2" dropped. An attribute missing from a row, or
    from the whole file, gives NaN.
    '''

    attributes = df["other"]

    ## Get gene ids: last token of the first attribute (text before the first '";')
    df["gene_id"] = attributes.str.split('";', n=1, expand=True)[0].str.extract('([^ "]*$)', expand=False)

    if is_ref:

        ## Get gene names
        df["gene_name"] = _gtf_attribute(attributes, "gene_name")

        ## Get gene biotype
        df["gene_biotype"] = _gtf_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _gtf_attribute(attributes, "transcript_biotype")

            ## If is prot get protein_id, ccds_id and exon number
            if is_prot:
                df["protein_id"] = _gtf_attribute(attributes, "protein_id")
                df["ccds_id"] = _gtf_attribute(attributes, "ccds_id")
                df["exon_number"] = _gtf_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _gtf_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _gtf_attribute(attributes, "exon_number")

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Represent every missing value as NaN (np.nan; the np.NaN alias was removed in NumPy 2.0)
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [5]:
def calculate_cpm(df, is_gene=False):
    '''
    function name: calculate_cpm

    purpose: Calculate CPM (counts per million) for each sample given

    input: Counts dataset. Count columns are everything after the first column for
    gene counts (is_gene=True) or after the first two columns for transcript counts.

    output: The same dataset (modified in place and returned) with the count columns
    rounded to 2 decimals and one CPM column added per count column, named by
    replacing "_counts" with "_CPM".
    '''

    ## Number of leading id columns that are not counts
    n_id_columns = 1 if is_gene else 2
    sample_columns = list(df.columns[n_id_columns:])

    for sample in sample_columns:

        ## Round the counts first; the column total below is taken over the rounded values
        df[sample] = df[sample].round(2)
        column_total = df[sample].sum()

        ## Share of the column total, scaled to one million
        df[sample.replace("_counts", "_CPM")] = ((df[sample] / column_total) * 1000000).round(2)

    return df

In [6]:
## Hex color codes used as the ggplot2-style palette (eight entries)
ggplot2_colors = [
    "#F8766D",
    "#CD9600",
    "#7CAE00",
    "#00BE67",
    "#00BFC4",
    "#00A9FF",
    "#C77CFF",
    "#FF61CC",
]

In [7]:
## Load the extended annotation GTF (no header row, "#" lines skipped) and parse its attribute column
ref = pd.read_csv(
    "../../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf",
    header=None,
    delimiter="\t",
    low_memory=False,
    comment="#",
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
)
ref = parse_df_columns(ref, is_ref=False)

## Separate the exon records from the transcript records
ref_exons = ref[ref["type"] == "exon"].copy()
ref_transcripts = ref[ref["type"] == "transcript"].copy()

In [8]:
## Load the transcript counts matrix and standardize its column names
df = fix_column_names(
    pd.read_csv(
        "../../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/counts_transcript.txt",
        delimiter="\t",
        low_memory=False,
        header=0,
    ),
    is_gene=False,
)

In [9]:
## Sum all count columns into a per-transcript total
df["total_counts"] = df.filter(regex='count').sum(axis=1)

In [10]:
## Add one CPM column per count column (total_counts becomes total_CPM)
df = calculate_cpm(df, is_gene=False)

## Keep only the columns whose name does not contain "counts"
df = df.drop(columns=df.filter(regex='counts').columns).copy()

In [11]:
## Remove every row whose gene_id starts with "ERCC"
is_ercc = df["gene_id"].str.startswith("ERCC")
df = df[~is_ercc].copy()

In [12]:
## Row-wise median over the CPM columns whose name has a digit right before "_CPM"
## (this leaves total_CPM out of the median)
df["median_CPM"] = df.filter(regex='[0-9]_CPM').median(axis=1)

In [13]:
## Attach the chromosome of each transcript; the inner join keeps only rows found in the annotation
df = pd.merge(
    df,
    ref_transcripts[["gene_id", "transcript_id", "chr"]],
    how="inner",
    on=["gene_id", "transcript_id"],
)

In [14]:
## Split the novel entries: transcripts of novel genes ("BambuGene" gene ids) versus
## novel transcripts ("BambuTx" transcript ids) whose gene is not novel
from_novel_gene = df["gene_id"].str.startswith("BambuGene")
is_novel_transcript = df["transcript_id"].str.startswith("BambuTx")

df_novel_genes = df[from_novel_gene].copy()
df_novel_transcripts = df[is_novel_transcript & ~from_novel_gene].copy()

In [15]:
## Inspect the novel transcripts located on the "MT" chromosome
df_novel_transcripts[df_novel_transcripts["chr"] == "MT"]

Unnamed: 0,transcript_id,gene_id,sample_579_PAG75663_CPM,sample_1131_PAM44580_CPM,sample_1218_PAM43779_CPM,sample_1304_PAM44487_CPM,sample_1271_PAM44815_CPM,sample_5356_PAM42933_CPM,sample_1163_PAM44604_CPM,sample_5295_PAG77944_CPM,sample_5292_PAG75292_CPM,sample_1092_PAM41667_CPM,sample_1186_PAM43869_CPM,sample_1291_PAG71816_CPM,total_CPM,median_CPM,chr
1844,BambuTx1845,ENSG00000210082,37.63,1.07,18.76,1.65,43.42,1.31,54.74,101.16,39.21,7.53,14.25,41.55,35.89,28.195,MT
1845,BambuTx1846,ENSG00000210082,1801.93,17.28,1642.26,387.79,596.72,4262.28,302.93,1620.93,643.46,888.01,1314.72,409.95,1123.8,765.735,MT
1846,BambuTx1847,ENSG00000210082,277.92,11.42,122.55,60.76,115.28,79.0,179.48,428.53,151.97,74.96,106.54,155.12,168.58,118.915,MT
1847,BambuTx1848,ENSG00000210082,44.93,5.21,109.71,29.68,137.96,47.33,294.01,640.02,149.23,136.19,104.86,361.06,191.5,122.95,MT
1848,BambuTx1849,ENSG00000198888,131.17,0.54,8.8,0.14,15.05,3.61,9.01,155.17,132.24,19.39,1.04,89.97,58.65,12.03,MT
1849,BambuTx1850,ENSG00000210107,6157.55,6963.79,10379.43,3722.92,6433.1,5986.88,5788.84,14931.68,9284.79,4467.18,3511.5,16709.57,8362.56,6295.325,MT
1850,BambuTx1851,ENSG00000198763,6.77,0.84,1810.41,0.33,1.2,128.7,2.1,1112.37,1.6,0.17,0.59,1610.08,468.78,1.85,MT
1851,BambuTx1852,ENSG00000210151,3.09,0.49,3.2,0.11,1.66,0.0,1.59,9.25,1.95,0.65,1991.29,6.06,143.37,1.805,MT
1852,BambuTx1853,ENSG00000210151,230.57,0.0,200.11,0.0,0.74,0.65,5.82,115.45,102.97,163.78,1188.73,182.22,190.14,109.21,MT
1853,BambuTx1854,ENSG00000198804,229.91,1.51,39.39,14.68,8.52,708.68,2.24,51.7,11.64,210.31,1.26,2.14,89.41,13.16,MT


In [16]:
## Exclude everything on the mitochondrial chromosome ("MT") from both novel sets
df_novel_genes = df_novel_genes[df_novel_genes["chr"] != "MT"].copy()
df_novel_transcripts = df_novel_transcripts[df_novel_transcripts["chr"] != "MT"].copy()

In [17]:
## Keep only the novel genes and novel transcripts whose median CPM is above 1
genes_pass_median = df_novel_genes["median_CPM"] > 1
transcripts_pass_median = df_novel_transcripts["median_CPM"] > 1

df_novel_genes_median = df_novel_genes.loc[genes_pass_median].copy()
df_novel_transcripts_median = df_novel_transcripts.loc[transcripts_pass_median].copy()

In [18]:
## Inspect a single filtered novel transcript by its id
df_novel_transcripts_median[df_novel_transcripts_median["transcript_id"] == "BambuTx1324"]

Unnamed: 0,transcript_id,gene_id,sample_579_PAG75663_CPM,sample_1131_PAM44580_CPM,sample_1218_PAM43779_CPM,sample_1304_PAM44487_CPM,sample_1271_PAM44815_CPM,sample_5356_PAM42933_CPM,sample_1163_PAM44604_CPM,sample_5295_PAG77944_CPM,sample_5292_PAG75292_CPM,sample_1092_PAM41667_CPM,sample_1186_PAM43869_CPM,sample_1291_PAG71816_CPM,total_CPM,median_CPM,chr
1323,BambuTx1324,ENSG00000185049,8.35,12.95,2.77,1.02,6.68,6.58,0.7,4.12,8.06,6.93,0.29,8.93,5.55,6.63,4


In [19]:
## Load the original reference GTF (no header row, "#" lines skipped)
original_ref = pd.read_csv(
    "../../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    header=None,
    delimiter="\t",
    low_memory=False,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    comment="#",
)

## Discard the records whose chromosome name starts with "ERCC-"
original_ref = original_ref[~original_ref["chr"].str.startswith("ERCC-")]

## Keep the gene records and parse gene ids, names and biotypes out of their attributes
orig_ref = parse_df_columns(original_ref[original_ref["type"] == "gene"].copy(), is_ref=True)


In [20]:
## Inspect the filtered novel genes ordered by ascending median CPM
df_novel_genes_median.sort_values("median_CPM", ascending=True)

Unnamed: 0,transcript_id,gene_id,sample_579_PAG75663_CPM,sample_1131_PAM44580_CPM,sample_1218_PAM43779_CPM,sample_1304_PAM44487_CPM,sample_1271_PAM44815_CPM,sample_5356_PAM42933_CPM,sample_1163_PAM44604_CPM,sample_5295_PAG77944_CPM,sample_5292_PAG75292_CPM,sample_1092_PAM41667_CPM,sample_1186_PAM43869_CPM,sample_1291_PAG71816_CPM,total_CPM,median_CPM,chr
1884,BambuTx1885,BambuGene285212,0.39,2.67,0.12,1.25,0.42,1.95,2.62,1.94,0.73,0.86,1.16,0.65,1.06,1.01,X
1881,BambuTx1882,BambuGene285108,1.18,0.69,1.31,0.11,1.8,1.3,0.68,0.85,1.5,0.57,0.0,1.22,1.03,1.015,X
1354,BambuTx1355,BambuGene194755,0.6,1.88,0.82,3.86,1.8,3.49,1.05,0.23,0.7,0.98,1.16,0.32,1.08,1.015,4
1422,BambuTx1423,BambuGene208278,0.21,2.77,0.38,0.8,2.26,4.47,1.66,0.43,1.24,1.3,0.68,0.75,1.17,1.02,5
2520,BambuTx2521,BambuGene72712,0.76,4.35,0.67,3.41,0.5,4.31,1.37,0.89,0.25,0.49,2.22,1.15,1.22,1.02,13
3231,BambuTx3232,BambuGene234860,0.03,5.64,0.61,3.86,3.57,5.93,1.44,2.41,0.0,0.24,0.0,0.0,1.36,1.025,6
2002,BambuTx2003,BambuGene34716,0.0,2.97,0.84,4.32,1.98,1.71,0.51,1.36,0.48,0.29,0.72,1.22,1.01,1.03,10
2757,BambuTx2758,BambuGene54883,0.03,3.36,1.08,1.59,0.0,1.46,2.1,0.89,0.38,0.98,1.5,0.0,0.89,1.03,12
1326,BambuTx1327,BambuGene193862,1.26,5.34,0.41,5.79,0.35,3.49,0.19,0.12,0.99,1.22,0.1,1.08,1.1,1.035,4
1708,BambuTx1709,BambuGene255580,0.31,0.89,2.13,6.47,0.04,11.78,0.48,1.2,0.1,2.2,0.53,3.27,1.71,1.045,8


In [56]:
## New-for-known transcript ids to be used in PCR validation
new_for_known_pcr_val = [
    "BambuTx1879",
    "BambuTx1324",
    "BambuTx1322",
    "BambuTx1138",
    "BambuTx1532",
    "BambuTx2900",
]


In [57]:
## New-for-new transcript ids to be used in PCR validation
## (one id per line with trailing commas, so a missing comma cannot silently
## concatenate two adjacent string literals into a single bogus id)
new_for_new_pcr_val = [
    "BambuTx151",
    "BambuTx1891",
    "BambuTx1711",
    "BambuTx1025",
    "BambuTx2804",
    "BambuTx2506",
    "BambuTx2710",
]

In [58]:
## Mito and ERCC transcript ids to be used in PCR validation
mito_and_ercc_for_pcr_val = [
    "BambuTx1944",
    "BambuTx1845",
]

In [59]:
## Create list of all targets for PCR validation
## (one id per line with trailing commas, so a missing comma cannot silently
## concatenate two adjacent string literals into a single bogus id)
targets_list = [
    ## new for known
    "BambuTx1879",
    "BambuTx1324",
    "BambuTx1322",
    "BambuTx1138",
    "BambuTx1532",
    "BambuTx2900",
    ## new for new
    "BambuTx151",
    "BambuTx1891",
    "BambuTx1711",
    "BambuTx1025",
    "BambuTx2804",
    "BambuTx2506",
    "BambuTx2710",
    ## mito and ERCC
    "BambuTx1944",
    "BambuTx1845",
]

In [60]:
## Read every line of the transcriptome FASTA into memory; the with-block closes
## the input file as soon as the lines have been read
with open("../../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/transcriptome/transcriptome.fa", 'r') as fasta_in:
    Lines = fasta_in.readlines()

In [61]:
## Split the transcriptome FASTA lines into two output FASTA files: records whose id
## is in targets_list go to target_transcriptome.fa, all others to reference_transcriptome.fa

## Set gives constant-time membership checks for every header line
target_ids = set(targets_list)

## Lines seen before the first header are written to the non-target file
is_target = False

## The with-block closes both output files even if writing fails part-way
with open("../../../../data/bernardo/processed/99.other/pcr_primers_for_validation/target_transcriptome.fa", 'w') as fasta_targets, \
        open("../../../../data/bernardo/processed/99.other/pcr_primers_for_validation/reference_transcriptome.fa", 'w') as fasta_non_targets:

    for line in Lines:

        ## A header line decides where it and the following sequence lines are written
        if line.startswith(">"):

            ## The record id is the first whitespace-delimited token after ">", so a
            ## header carrying a description or a trailing "\r" still matches its id
            header_fields = line[1:].split()
            is_target = bool(header_fields) and header_fields[0] in target_ids

        if is_target:
            fasta_targets.write(line)
        else:
            fasta_non_targets.write(line)