# Create Explore Results

# Library Import and Functions

In [12]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from scipy import stats


## Remove the row limit and the column-width limit when displaying pandas dataframes
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [13]:
def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parse the aggregate attribute column ("other") of a gtf/gff3 dataframe into
             separate columns and drop the columns that are no longer needed

    input: df            - dataframe containing a "raw" gtf/gff, it must have the columns
                           "other", "dot_1" and "dot_2"
           is_ref        - True: extract gene_id, gene_name and gene_biotype
                           False: extract gene_id, transcript_id and exon_number
           is_transcript - only used when is_ref is True, additionally extract
                           transcript_id and transcript_biotype
           is_prot       - only used when is_ref and is_transcript are True, additionally
                           extract protein_id, ccds_id and exon_number

    output: the same dataframe object (it is modified in place) with the new columns added
            and "other", "dot_1", "dot_2" removed. An attribute that is absent from a row
            is left as NaN
    '''

    attributes = df["other"]

    def extract_attribute(key):
        ## Capture the text between the quotes of: key "value"
        ## str.extract returns NaN for rows without the attribute and still works when no
        ## row at all carries it (or the dataframe is empty), where indexing the result of
        ## str.split(..., expand=True) raises a KeyError
        return attributes.str.extract(key + ' "([^"]*)', expand=False)

    ## Get gene ids
    df["gene_id"] = extract_attribute("gene_id")

    if is_ref:

        ## Get gene names
        df["gene_name"] = extract_attribute("gene_name")

        ## Get gene biotype
        df["gene_biotype"] = extract_attribute("gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = extract_attribute("transcript_id")
            df["transcript_biotype"] = extract_attribute("transcript_biotype")

            ## If is prot get protein_id, ccds_id and exon_number
            if is_prot:
                df["protein_id"] = extract_attribute("protein_id")
                df["ccds_id"] = extract_attribute("ccds_id")
                df["exon_number"] = extract_attribute("exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = extract_attribute("transcript_id")

        ## Get exon number
        df["exon_number"] = extract_attribute("exon_number")

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Make every missing value (e.g. None) a NaN
    ## np.nan is used because the np.NaN alias was removed in NumPy 2.0
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

# Import data

In [14]:
## Load both results tables; the index read from each file becomes a "transcript_id" column
res = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

res_med = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_med_relevant.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

In [15]:
## Open original reference (lines starting with "#" are skipped as comments)
original_ref = pd.read_csv(
    "../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    delimiter="\t",
    header=None,
    comment="#",
    low_memory=False,
)


## Keep only the transcript entries and parse them to get gene/transcript names and ids
is_transcript_row = original_ref["type"] == "transcript"
orig_ref = parse_df_columns(
    original_ref.loc[is_transcript_row].copy(),
    is_ref=True,
    is_transcript=True,
)

In [16]:
## Import and parse through extended annotations
bambu_ref = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Remove entries whose "chr" starts with "ERCC-"
## na=False keeps rows with a missing "chr" instead of breaking the "~" negation
## .copy() is needed because parse_df_columns modifies the dataframe in place, and doing
## that on a slice of the original dataframe triggers a SettingWithCopyWarning
bambu_ref = bambu_ref.loc[~bambu_ref["chr"].str.startswith("ERCC-", na=False)].copy()

## Parse before filtering on "type", so that all entry types are still present when the
## attributes are extracted
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Only keep the transcript entries
bambu_ref = bambu_ref.loc[bambu_ref["type"] == "transcript"].copy()

In [17]:
## Gene id of every transcript id in the extended annotation
bambu_ref_names = bambu_ref.loc[:, ["gene_id", "transcript_id"]].copy()

## Unique gene id / gene name pairs of the original reference
orig_ref_names = orig_ref.loc[:, ["gene_id", "gene_name"]].drop_duplicates().copy()

In [18]:
## Add the gene id (through the transcript id) and then the gene name (through the gene id)
## Left joins keep every row of the results, also when no match is found
res_med = (
    res_med
    .merge(bambu_ref_names, on="transcript_id", how="left")
    .merge(orig_ref_names, on=["gene_id"], how="left")
)

res = (
    res
    .merge(bambu_ref_names, on="transcript_id", how="left")
    .merge(orig_ref_names, on=["gene_id"], how="left")
)

In [19]:
## Thresholds used to keep a transcript: adjusted p-value below PADJ_CUTOFF and
## absolute log2 fold change above LFC_CUTOFF
PADJ_CUTOFF = 0.05
LFC_CUTOFF = 1

## Drop rows without an adjusted p-value, then apply both thresholds
res_med_filtered = res_med.loc[res_med["padj"].notna()].copy()
res_med_filtered = res_med_filtered.loc[
    (res_med_filtered["padj"] < PADJ_CUTOFF)
    & (res_med_filtered["log2FoldChange"].abs() > LFC_CUTOFF)
].copy()


## Both conditions are computed on res_filtered itself (the padj condition used to come
## from the unfiltered res, which relied on implicit index alignment)
res_filtered = res.loc[res["padj"].notna()].copy()
res_filtered = res_filtered.loc[
    (res_filtered["padj"] < PADJ_CUTOFF)
    & (res_filtered["log2FoldChange"].abs() > LFC_CUTOFF)
].copy()

In [20]:
## Display the filtered med_relevant results (all rows are shown because display.max_rows is None)
res_med_filtered

Unnamed: 0,transcript_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
63,ENST00000162749,93.406468,-3.886599,1.042495,-3.728169,0.0001928757,0.029844,ENSG00000067182,TNFRSF1A
295,ENST00000261499,111.695543,-1.096631,0.300183,-3.653201,0.000258991,0.037211,ENSG00000108641,B9D1
688,ENST00000307063,203.430343,1.188705,0.277145,4.28911,1.793907e-05,0.006561,ENSG00000170234,PWWP2A
788,ENST00000316562,119.034568,3.531255,0.849668,4.15604,3.238116e-05,0.008685,ENSG00000125779,PANK2
843,ENST00000322776,3037.330639,1.416436,0.37809,3.746297,0.0001794639,0.029312,ENSG00000167792,NDUFV1
1212,ENST00000354232,410.176448,4.237782,1.00927,4.19886,2.682625e-05,0.007926,ENSG00000105379,ETFB
1475,ENST00000367380,672.113953,5.654584,1.484561,3.808926,0.000139572,0.025888,ENSG00000120265,PCMT1
1714,ENST00000373203,109.341926,-4.710604,1.193605,-3.946535,7.929022e-05,0.016789,ENSG00000106991,ENG
1749,ENST00000374550,1725.613549,-7.869932,1.754981,-4.484342,7.313944e-06,0.003269,ENSG00000142676,RPL11
1841,ENST00000377199,187.000771,2.284971,0.508752,4.491329,7.078024e-06,0.003269,ENSG00000204713,TRIM27


In [21]:
## Display the filtered results (all rows are shown because display.max_rows is None)
res_filtered

Unnamed: 0,transcript_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
376,BambuTx2703,167.38913,3.560441,0.846821,4.204479,2.616839e-05,0.01395364,BambuGene290099,
458,ENST00000162749,94.230063,-3.929212,1.025958,-3.829796,0.0001282493,0.03024604,ENSG00000067182,TNFRSF1A
606,ENST00000219479,199.556785,-2.730466,0.723881,-3.771981,0.0001619564,0.03364836,ENSG00000103202,NME4
818,ENST00000238081,82.495404,-2.645551,0.635239,-4.164656,3.118221e-05,0.01395364,ENSG00000134308,YWHAQ
1066,ENST00000253410,326.607619,-2.266557,0.44699,-5.07071,3.963339e-07,0.000691682,ENSG00000131097,HIGD1B
1229,ENST00000259477,1349.335037,1.854543,0.486818,3.80952,0.000139237,0.03075957,ENSG00000136950,ARPC5L
1297,ENST00000261499,111.847347,-1.087178,0.285618,-3.806404,0.0001410019,0.03075957,ENSG00000108641,B9D1
1399,ENST00000262746,1216.082784,3.09718,0.740288,4.183752,2.86737e-05,0.01395364,ENSG00000117450,PRDX1
2662,ENST00000307063,206.321595,1.2139,0.289974,4.186238,2.836162e-05,0.01395364,ENSG00000170234,PWWP2A
2995,ENST00000315480,97.301166,3.069609,0.811733,3.78155,0.0001558549,0.03277085,ENSG00000136878,USP20


In [23]:
## Write both filtered tables to csv files without the dataframe index
for filtered_table, output_path in (
    (res_med_filtered, "../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results.csv"),
    (res_filtered, "../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results.csv"),
):
    filtered_table.to_csv(output_path, index=False)