# Create Explore Results

# Library Import and Functions

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from scipy import stats


## Lift the pandas display limits: show every row and never truncate cell contents
for display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
def _extract_attribute(attributes, key, terminator='"'):
    '''
    function name: _extract_attribute

    purpose: pull the value of one `key "value"` attribute out of raw gtf/gff attribute strings

    input: series of raw attribute strings, the attribute key, and the string that ends the value

    output: series with the attribute value for each row (missing where the attribute is absent)
    '''

    ## Column 1 holds the text that follows the first occurrence of `key "`
    pieces = attributes.str.split(key + ' "', expand=True)

    ## If no row carries the attribute there is no column 1: return an all-missing
    ## column instead of failing with KeyError
    if pieces.shape[1] < 2:
        return pd.Series(np.nan, index=attributes.index, dtype=object)

    ## Keep only the text in front of the terminator
    return pieces[1].str.split(terminator, expand=True)[0]


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

    input: dataframe containing "raw" gtf/gff with (at least) the columns "other", "dot_1" and "dot_2"
           is_ref: if True also extract "gene_name" and "gene_biotype"; if False extract
                   "gene_id", "transcript_id" and "exon_number" only
           is_transcript: (only used when is_ref is True) also extract "transcript_id" and "transcript_biotype"
           is_prot: (only used when is_ref and is_transcript are True) also extract "protein_id",
                    "ccds_id" and "exon_number"

    output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]

    notes: the dataframe is modified in place and the same object is returned, so pass a copy
           if the raw frame is still needed. Attributes that are absent from a row (or from
           every row) end up as NaN.
    '''

    attributes = df["other"]

    ## Get gene ids: take the first attribute (everything before the first '";') and keep
    ## its trailing run of characters that are neither spaces nor quotes
    df["gene_id"] = attributes.str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=False)

    if is_ref:

        ## Get gene names (value runs up to the closing '";')
        df["gene_name"] = _extract_attribute(attributes, "gene_name", terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _extract_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _extract_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_attribute(attributes, "transcript_biotype")

            ## If is prot get protein id, ccds id and exon number
            if is_prot:
                df["protein_id"] = _extract_attribute(attributes, "protein_id")
                df["ccds_id"] = _extract_attribute(attributes, "ccds_id")
                df["exon_number"] = _extract_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_attribute(attributes, "exon_number")

    ## Drop the raw attribute column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalize every missing value (e.g. None left behind by the string splits) to NaN.
    ## np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.nan

    return df

# Import data

In [3]:
## Load the DESeq2 results for all transcripts and move the index into a "transcript_id" column
res = (
    pd.read_csv("../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results.tsv", sep="\t")
    .reset_index(drop=False, names="transcript_id")
)

## Same for the "med_relevant" results table
res_med = (
    pd.read_csv("../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_med_relevant.tsv", sep="\t")
    .reset_index(drop=False, names="transcript_id")
)

In [4]:
## Open original reference (headerless, tab-separated gtf; lines starting with "#" are skipped)
gtf_columns = ["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"]
original_ref = pd.read_csv(
    "../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    names=gtf_columns,
    comment="#",
    low_memory=False,
)


## Keep the transcript entries only and parse them to get gene names and ids
## (a copy is passed because parse_df_columns modifies its input in place)
is_transcript_row = original_ref["type"] == "transcript"
orig_ref = parse_df_columns(original_ref.loc[is_transcript_row].copy(), is_ref=True, is_transcript=True)

In [5]:
## Import and parse through extended annotations
bambu_ref = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Drop the entries whose chromosome name starts with "ERCC-".
## The explicit copy matters: parse_df_columns adds and drops columns in place, and doing
## that on a bare .loc slice triggers pandas' SettingWithCopyWarning.
bambu_ref = bambu_ref.loc[~bambu_ref["chr"].str.startswith("ERCC-")].copy()

## Parse every entry first and only then keep the transcript entries
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

bambu_ref = bambu_ref.loc[bambu_ref["type"] == "transcript"].copy()

In [6]:
## Lookup table: transcript id -> gene id (from the bambu annotation)
bambu_ref_names = bambu_ref.loc[:, ["gene_id", "transcript_id"]].copy()

## Lookup table: gene id -> gene name (from the original reference), one row per distinct pair
orig_ref_names = orig_ref.loc[:, ["gene_id", "gene_name"]].drop_duplicates().copy()

In [7]:
## Annotate both results tables: first attach the gene id through the transcript id,
## then attach the gene name through the gene id (left joins keep every results row)
res_med = (
    res_med
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on=["gene_id"])
)

res = (
    res
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on=["gene_id"])
)

In [8]:
## Keep transcripts that have an adjusted p-value, with padj < 0.05 and |log2FoldChange| > 1
res_med_filtered = res_med.loc[~res_med["padj"].isna()].copy()
res_med_filtered = res_med_filtered.loc[((res_med_filtered["padj"] < 0.05) & (res_med_filtered["log2FoldChange"].abs() > 1))].copy()


## Same filter for the full results table. Both halves of the mask are built from
## res_filtered itself (the padj half previously came from the unfiltered `res`, which
## only selected the right rows through implicit index alignment).
res_filtered = res.loc[~res["padj"].isna()].copy()
res_filtered = res_filtered.loc[((res_filtered["padj"] < 0.05) & (res_filtered["log2FoldChange"].abs() > 1))].copy()

In [9]:
## Display the filtered "med_relevant" results table
res_med_filtered

Unnamed: 0,transcript_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
73,BambuTx2920,234.509098,-2.776865,0.726436,-3.822585,0.0001320598,0.02895,ENSG00000198899,MT-ATP6
91,ENST00000162749,93.389737,-3.882313,1.042564,-3.723813,0.0001962362,0.034217,ENSG00000067182,TNFRSF1A
323,ENST00000261499,111.74124,-1.090523,0.30034,-3.630959,0.0002823703,0.040876,ENSG00000108641,B9D1
716,ENST00000307063,203.897328,1.195732,0.277719,4.305555,1.665676e-05,0.007036,ENSG00000170234,PWWP2A
816,ENST00000316562,119.402371,3.539391,0.849386,4.167,3.086345e-05,0.009456,ENSG00000125779,PANK2
871,ENST00000322776,3045.227806,1.423284,0.378885,3.75651,0.0001722997,0.032068,ENSG00000167792,NDUFV1
1240,ENST00000354232,412.184772,4.242227,1.010506,4.198123,2.69136e-05,0.008842,ENSG00000105379,ETFB
1469,ENST00000361789,146261.374426,-1.275634,0.296788,-4.298126,1.72248e-05,0.007036,ENSG00000198727,MT-CYB
1513,ENST00000367380,675.130747,5.665098,1.484705,3.81564,0.0001358304,0.02895,ENSG00000120265,PCMT1
1752,ENST00000373203,109.145647,-4.703581,1.193287,-3.941701,8.090566e-05,0.01983,ENSG00000106991,ENG


In [10]:
## Display the filtered results table for all transcripts
res_filtered

Unnamed: 0,transcript_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_name
403,BambuTx2703,167.530879,3.562361,0.84685,4.206602,2.592389e-05,0.01295655,BambuGene290099,
407,BambuTx2920,235.094218,-2.800427,0.713858,-3.922948,8.747214e-05,0.02400035,ENSG00000198899,MT-ATP6
490,ENST00000162749,94.215795,-3.926734,1.026298,-3.826115,0.0001301817,0.02905135,ENSG00000067182,TNFRSF1A
638,ENST00000219479,199.439517,-2.727422,0.723818,-3.768105,0.0001644918,0.03226586,ENSG00000103202,NME4
850,ENST00000238081,82.445946,-2.642296,0.635129,-4.160253,3.178958e-05,0.01330155,ENSG00000134308,YWHAQ
1098,ENST00000253410,326.370498,-2.26311,0.446577,-5.067683,4.026883e-07,0.0006739795,ENSG00000131097,HIGD1B
1261,ENST00000259477,1350.346156,1.856262,0.487021,3.81146,0.0001381486,0.02910336,ENSG00000136950,ARPC5L
1329,ENST00000261499,111.856904,-1.084985,0.285901,-3.794971,0.0001476606,0.03013897,ENSG00000108641,B9D1
1431,ENST00000262746,1217.076737,3.099541,0.74024,4.187212,2.824024e-05,0.01295655,ENSG00000117450,PRDX1
2694,ENST00000307063,206.460027,1.216317,0.290124,4.19241,2.76007e-05,0.01295655,ENSG00000170234,PWWP2A


In [11]:
## Write both filtered tables to disk without the index (pandas' default comma separator is used)
for filtered_table, output_path in (
    (res_med_filtered, "../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results"),
    (res_filtered, "../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results"),
):
    filtered_table.to_csv(output_path, index=False)