# Create Explore Results

# Library Import and Functions

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from scipy import stats


## Lift the pandas display limits: no cap on printed rows, no truncation of cell contents
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
def _extract_attribute(attributes, key):
    '''
    function name: _extract_attribute

    purpose: pull the quoted value that follows `key "` out of every GTF attribute string

    input: pandas Series of attribute strings, attribute key (plain word, e.g. "gene_id")

    output: pandas Series aligned with the input; rows lacking the attribute hold NaN
    '''

    ## One capture group + expand=False gives a Series; unlike split(...)[1] this
    ## does not fail when the attribute is missing from every row
    return attributes.str.extract(rf'\b{key} "([^"]*)', expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):
    '''
    function name: parse_df_columns

    purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

    input: dataframe containing "raw" gtf/gff with at least the columns "other", "dot_1" and "dot_2"
           is_ref: True -> add "gene_id", "gene_name" and "gene_biotype"
                   False -> add "gene_id", "transcript_id" and "exon_number"
           is_transcript: (only used when is_ref is True) also add "transcript_id" and "transcript_biotype"
           is_prot: (only used when is_ref and is_transcript are True) also add "protein_id", "ccds_id" and "exon_number"

    output: the same dataframe object (modified in place) with the parsed columns added and
            "other", "dot_1", "dot_2" dropped; missing values are NaN
    '''

    attributes = df["other"]

    ## Get gene ids
    df["gene_id"] = _extract_attribute(attributes, "gene_id")

    if is_ref:

        ## Get gene names
        df["gene_name"] = _extract_attribute(attributes, "gene_name")

        ## Get gene biotype
        df["gene_biotype"] = _extract_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _extract_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_attribute(attributes, "transcript_biotype")

            ## If is prot get protein id, ccds id and exon number
            if is_prot:
                df["protein_id"] = _extract_attribute(attributes, "protein_id")
                df["ccds_id"] = _extract_attribute(attributes, "ccds_id")
                df["exon_number"] = _extract_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_attribute(attributes, "exon_number")

    ## Drop the raw attribute column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every missing value (e.g. None) to NaN; np.nan is used because
    ## the np.NaN alias no longer exists in NumPy 2.0
    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.nan

    return df

# Import data and process AD vs CT data

In [3]:
## Transcript-level results; the index read from the file becomes the "transcript_id" column
res = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_AD_vs_CT.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

## Transcript-level results from the "med_relevant" file, indexed the same way
res_med = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_med_relevant_AD_vs_CT.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

## Gene-level results; the index read from the file becomes the "gene_id" column
res_gene = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/genes_AD_vs_CT_results.tsv",
    sep="\t",
).reset_index(drop=False, names="gene_id")

In [4]:
## Load the original reference annotation (tab-separated, no header row, "#" lines skipped)
original_ref = pd.read_csv(
    "../../../references/bernardo/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    comment="#",
    low_memory=False,
)


## Keep only the transcript records and expand their attributes into gene/transcript columns
orig_ref = parse_df_columns(
    original_ref[original_ref["type"] == "transcript"].copy(),
    is_ref=True,
    is_transcript=True,
)

In [5]:
## Import the extended annotations (tab-separated, no header row, "#" lines skipped)
bambu_ref = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Remove records whose sequence name starts with "ERCC-". Take an explicit copy:
## parse_df_columns adds and drops columns in place, and doing that on a filtered
## slice triggers pandas' SettingWithCopyWarning
bambu_ref = bambu_ref.loc[~bambu_ref["chr"].str.startswith("ERCC-")].copy()

## Parse the attribute column into gene_id / transcript_id / exon_number
## NOTE(review): parsing happens before the transcript filter below, presumably so that
## attributes carried only by non-transcript records (e.g. exon_number) are present - confirm
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Keep only the transcript records
bambu_ref = bambu_ref.loc[bambu_ref["type"] == "transcript"].copy()

In [6]:
## Transcript id -> gene id lookup taken from the extended annotations
bambu_ref_names = bambu_ref.loc[:, ["gene_id", "transcript_id"]].copy()

## Unique gene id / gene name pairs taken from the original reference
orig_ref_names = orig_ref.loc[:, ["gene_id", "gene_name"]].drop_duplicates().copy()

In [7]:
## Transcript-level tables: look up the gene id by transcript id, then the gene name by gene id
## (left joins, so no result row is dropped when a lookup has no match)
res_med = (
    res_med
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on="gene_id")
)

res = (
    res
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on="gene_id")
)


## Gene-level table already has gene ids, so only the gene name is added
res_gene = res_gene.merge(orig_ref_names, how="left", on="gene_id")

In [8]:
## Keep rows with a defined padj, padj < 0.05 and |log2FoldChange| > 1
res_med_filtered = (
    res_med
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)


res_filtered = (
    res
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)


res_gene_filtered = (
    res_gene
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)

In [9]:
## Save the filtered transcript-level tables as CSV without the dataframe index
res_med_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results_AD_vs_CT.csv",
    index=False,
)
res_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results_AD_vs_CT.csv",
    index=False,
)

## NOTE(review): the gene-level table is written WITH the dataframe index (the pandas default),
## unlike the two transcript-level tables above - confirm whether index=False was intended
res_gene_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/genes_filtered_results_AD_vs_CT.csv",
    index=True,
)

# Import data and process Male vs Female data

In [10]:
## Transcript-level results; the index read from the file becomes the "transcript_id" column
res = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_M_vs_F.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

## Transcript-level results from the "med_relevant" file, indexed the same way
res_med = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/multiple_transcripts_results_med_relevant_M_vs_F.tsv",
    sep="\t",
).reset_index(drop=False, names="transcript_id")

## Gene-level results; the index read from the file becomes the "gene_id" column
res_gene = pd.read_csv(
    "../../../data/bernardo/processed/04.deseq2/genes_M_vs_F_results.tsv",
    sep="\t",
).reset_index(drop=False, names="gene_id")

In [11]:
## Transcript-level tables: look up the gene id by transcript id, then the gene name by gene id
## (left joins, so no result row is dropped when a lookup has no match)
res_med = (
    res_med
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on="gene_id")
)

res = (
    res
    .merge(bambu_ref_names, how="left", on="transcript_id")
    .merge(orig_ref_names, how="left", on="gene_id")
)


## Gene-level table already has gene ids, so only the gene name is added
res_gene = res_gene.merge(orig_ref_names, how="left", on="gene_id")

In [12]:
## Keep rows with a defined padj, padj < 0.05 and |log2FoldChange| > 1
res_med_filtered = (
    res_med
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)


res_filtered = (
    res
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)


res_gene_filtered = (
    res_gene
    .dropna(subset=["padj"])
    .loc[lambda d: (d["padj"] < 0.05) & (d["log2FoldChange"].abs() > 1)]
    .copy()
)

In [13]:
## Save the filtered transcript-level tables as CSV without the dataframe index
res_med_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results_M_vs_F.csv",
    index=False,
)
res_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results_M_vs_F.csv",
    index=False,
)

## NOTE(review): the gene-level table is written WITH the dataframe index (the pandas default),
## unlike the two transcript-level tables above - confirm whether index=False was intended
res_gene_filtered.to_csv(
    path_or_buf="../../../data/bernardo/processed/04.deseq2/genes_filtered_results_M_vs_F.csv",
    index=True,
)