# Create DESEQ2 annotations


##  - Create sample condition dataframe.

## - Gene level counts matrix.

## - Transcript level counts matrix.


## - Create two additional transcript count matrices

###     -- Transcripts from genes with 2+ transcripts expressed at median CPM > 1


###     -- Transcripts from med-relevant genes with 2+ transcripts expressed at median CPM > 1



###     -- Gene level counts for all AD related genes

###     -- Transcript level counts for all AD related genes



###     -- Full length counts for all novel mitochondrial transcripts

###      --- Unique counts for all novel mitochondrial transcripts

# Library Import and Functions

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from scipy import stats


## Display all rows of pandas dataframes and never truncate the contents of a column
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
function name: get_real_transcript_length

purpose: create dataframe containing exonic transcript length and transcript_id

input: "raw" bambu gtf dataframe. It must still contain the "other", "dot_1" and "dot_2"
columns (they are consumed by parse_df_columns) as well as "type", "start" and "end".
The dataframe passed in is modified in place by parse_df_columns.

output: dataframe with two columns, transcript_id and real_transcript_length
'''

def get_real_transcript_length(annotation):

    ## Parse the attribute column so that every row carries a transcript_id
    annotation = parse_df_columns(annotation, is_ref=False)

    ## GTF coordinates are 1-based and inclusive on both ends, so a feature covers
    ## (end - start + 1) bases. Without the +1 every exon is counted one base short.
    annotation["real_transcript_length"] = abs(annotation["end"] - annotation["start"]) + 1

    ## Only exon rows contribute to the exonic length of a transcript
    exon_annotation = annotation.loc[annotation["type"] == "exon"][["transcript_id", "real_transcript_length"]].copy()

    ## Add up the exon lengths of each transcript
    df_transcript_length = exon_annotation.groupby("transcript_id").sum()

    ## Turn transcript_id back into a regular column
    df_transcript_length.reset_index(inplace=True)

    return df_transcript_length

In [3]:
'''
name: relative_transcript_abundance

purpose: express the CPM of each transcript as a percentage of the summed CPM of its gene

input: a dataframe with a gene_id column (gene of origin of each transcript) and a
total_CPM column (CPM of each transcript)

output: a new dataframe in which total_CPM is replaced by three trailing columns:
total_CPM_gene, relative_abundance_percent and total_CPM_transcript
'''

def relative_transcript_abundance(df):

    ## Summed CPM of all transcripts of each gene (indexed by gene_id)
    gene_totals = df.groupby("gene_id")[["total_CPM"]].sum()
    gene_totals = gene_totals.rename(columns={"total_CPM": "total_CPM_gene"})

    ## Attach the gene level total to every transcript row
    with_totals = pd.merge(df, gene_totals, how='inner', on="gene_id")

    ## Share of the gene CPM contributed by each transcript, in percent
    share = with_totals["total_CPM"] / with_totals["total_CPM_gene"]
    with_totals["relative_abundance_percent"] = share * 100

    ## Move total_CPM to the last position under a transcript specific name
    with_totals["total_CPM_transcript"] = with_totals.pop("total_CPM")

    return with_totals

In [4]:
'''
function name: fix_column_names

purpose: give the id columns fixed names and shorten every sample column name to
the text that precedes "_mapped"

input: The raw counts dataframe for either genes (first column is the gene id) or
transcripts (first two columns are the transcript id and the gene id)

output: Same dataframe (renamed in place) with improved column names
'''

def fix_column_names(df, is_gene=False):

    ## Gene counts start with one id column, transcript counts start with two
    id_names = ["gene_id"] if is_gene else ["transcript_id", "gene_id"]

    ## Every remaining column is a sample: keep only what comes before "_mapped"
    sample_names = [name.split("_mapped")[0] for name in df.columns[len(id_names):]]

    ## Rename columns on the dataframe that was passed in
    df.columns = id_names + sample_names

    return df

In [5]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containing "raw" gtf/gff with at least the "other", "dot_1" and "dot_2" columns.
       is_ref=True  -> gene_id, gene_name and gene_biotype are parsed;
                       transcript_id and transcript_biotype are added when is_transcript=True;
                       protein_id, ccds_id and exon_number are added when is_transcript=True AND is_prot=True.
       is_ref=False -> gene_id, transcript_id and exon_number are parsed.

output: the same dataframe (modified in place) with the parsed columns added and the
        "other", "dot_1" and "dot_2" columns dropped. An attribute that is missing from a row is NaN.
'''

def _get_attribute(attributes, key, terminator='"'):

    ## Column 1 of the split holds whatever follows `key "` in each row
    after_key = attributes.str.split(key + ' "', expand=True)

    ## When no row carries the attribute there is no column 1: return an all-NaN
    ## column instead of failing with a KeyError
    if after_key.shape[1] < 2:
        return pd.Series(np.nan, index=attributes.index, dtype=object)

    ## Keep the value up to (not including) the terminator
    return after_key[1].str.split(terminator, expand=True)[0]


def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):

    attributes = df["other"]

    ## Get gene ids: last token (no spaces or quotes) of the first attribute.
    ## Only the text before the first '";' is needed, hence n=1.
    df["gene_id"] = attributes.str.split('";', n=1, expand=True)[0].str.extract(r'([^ "]*$)', expand=False)

    if is_ref:

        ## Get gene names
        df["gene_name"] = _get_attribute(attributes, "gene_name", terminator='";')

        ## Get gene biotype
        df["gene_biotype"] = _get_attribute(attributes, "gene_biotype")

        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = _get_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _get_attribute(attributes, "transcript_biotype")

            ## If is prot get protein_id, ccds_id and exon number
            if is_prot:
                df["protein_id"] = _get_attribute(attributes, "protein_id")
                df["ccds_id"] = _get_attribute(attributes, "ccds_id")
                df["exon_number"] = _get_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _get_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _get_attribute(attributes, "exon_number")

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Use NaN for every kind of missing value (np.nan: the np.NaN alias no longer
    ## exists in NumPy 2.0)
    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.nan

    return df

In [6]:
'''
function name: calculate_cpm

purpose: Calculate CPM (counts per million) for each sample given

input: Counts dataset whose leading column(s) are ids: one id column for gene counts
(is_gene=True), two id columns for transcript counts (is_gene=False)

output: Same dataset (modified in place) with the counts rounded to two decimals and
one additional "<sample>_CPM" column per sample
'''

def calculate_cpm(df, is_gene=False):

    ## Gene counts have one leading id column, transcript counts have two
    n_id_columns = 1 if is_gene else 2

    ## Fix the list of sample columns before any CPM column is appended
    sample_columns = list(df.columns[n_id_columns:])

    for sample in sample_columns:

        ## Counts are rounded to two decimals and written back to the dataframe
        rounded_counts = df[sample].round(2)
        df[sample] = rounded_counts

        ## CPM = count / total count of the sample * 1,000,000, rounded to two decimals
        df[sample + "_CPM"] = ((rounded_counts / rounded_counts.sum()) * 1000000).round(2)

    return df

# Import data

In [7]:
## Import transcript level bambu counts matrix (tab separated, first row is the header)
df_transcript = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/counts_transcript.txt", 
                           delimiter="\t", low_memory=False, header=0)

## Import gene level bambu counts matrix (same format as above)
df_gene =  pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/counts_gene.txt", 
                           delimiter="\t", low_memory=False, header=0)

In [8]:
## Fix column names. Copies are renamed so that the raw df_transcript and df_gene
## stay untouched (they are copied again by later cells).
df_transcript_output = fix_column_names(df_transcript.copy(), is_gene=False)

df_gene_output = fix_column_names(df_gene.copy(), is_gene=True)

In [9]:
## Display the gene level counts row of ENSG00000203710
df_gene_output.loc[df_gene_output["gene_id"] == "ENSG00000203710"]

In [10]:
## Sum of the transcript level counts of ENSG00000203710 in one sample
## (presumably compared by eye with the gene level count of the same sample)
df_transcript_output.loc[df_transcript_output["gene_id"] == "ENSG00000203710"]["sample_5356_PAM42933"].sum()

In [11]:
## Gene level count of ENSG00000186868 in one sample
df_gene_output.loc[df_gene_output["gene_id"] == "ENSG00000186868"]["sample_5356_PAM42933"].sum()

In [12]:
## Sum of the transcript level counts of ENSG00000186868 in the same sample
## (presumably compared by eye with the gene level count of that gene)
df_transcript_output.loc[df_transcript_output["gene_id"] == "ENSG00000186868"]["sample_5356_PAM42933"].sum()

In [None]:
## Import and parse through extended annotations (headerless gtf, "#" lines are comments)
bambu_ref = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/extended_annotations.gtf", header=None, delimiter="\t",
                        low_memory=False, comment="#", names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"])

## Remove entries whose chromosome name starts with "ERCC-" (presumably spike-in controls).
## .copy() gives an independent dataframe: parse_df_columns adds and drops columns in place,
## which on a plain .loc slice triggers pandas' SettingWithCopyWarning.
bambu_ref = bambu_ref.loc[~bambu_ref["chr"].str.startswith("ERCC-")].copy()

## Split the attribute column into gene_id, transcript_id and exon_number
bambu_ref = parse_df_columns(bambu_ref, is_ref=False)

## Keep one row per transcript
bambu_ref = bambu_ref.loc[bambu_ref["type"] == "transcript"].copy()

## Transcript ids and gene ids located on the MT chromosome
mito_tid = bambu_ref.loc[bambu_ref["chr"] == "MT"]["transcript_id"].copy()
mito_gid = bambu_ref.loc[bambu_ref["chr"] == "MT"]["gene_id"].copy()

#  - Create sample condition dataframe.

In [None]:
## Make experimental design table and save it

## Sample ids are the count columns (everything after transcript_id and gene_id)
sample_ids = df_transcript_output.columns[2:].to_list()

## Condition and sex are hard-coded and matched to sample_ids purely by position
## NOTE(review): confirm that both lists follow the column order of the counts matrix
sample_conditions = ["AD", "CT", "AD", "CT", "CT", "CT", "CT", "AD", "AD", "CT", "AD", "AD"]

sample_sex = ["Male", "Female", "Female", "Male", "Male", "Male", "Female", "Female", "Male", 
             "Female", "Female", "Male"]


d_condition = {"sample_id": sample_ids, "condition": sample_conditions, "sex": sample_sex}

df_experimental_design = pd.DataFrame(data=d_condition)

df_experimental_design.to_csv("../../../data/bernardo/processed/04.deseq2/experimental_design.tsv", sep="\t", 
                             index=False)

# - Gene level counts matrix.

In [None]:
## Save unfiltered gene dataset

## Round the counts to whole numbers and store them as integers
## (presumably because DESeq2 expects integer counts)
df_gene_output[df_gene_output.columns[1:]] = round(df_gene_output[df_gene_output.columns[1:]],0).astype(int).copy()

## Remove genes whose id starts with "ERCC" and genes located on the MT chromosome
df_gene_output = df_gene_output.loc[~df_gene_output["gene_id"].str.startswith("ERCC")].copy()
df_gene_output = df_gene_output.loc[~df_gene_output["gene_id"].isin(mito_gid)].copy()

df_gene_output.to_csv("../../../data/bernardo/processed/04.deseq2/gene_counts_unfiltered.tsv", sep="\t", 
                             index=False)



In [None]:
## Save unfiltered transcript dataset

## Round the counts to whole numbers and store them as integers
df_transcript_output[df_transcript_output.columns[2:]] = round(df_transcript_output[df_transcript_output.columns[2:]],0).astype(int).copy()

## Remove transcripts whose id starts with "DQ" (presumably the ERCC spike-in transcripts)
## and transcripts located on the MT chromosome
df_transcript_output = df_transcript_output.loc[~df_transcript_output["transcript_id"].str.startswith("DQ")]
df_transcript_output = df_transcript_output.loc[~df_transcript_output["transcript_id"].isin(mito_tid)].copy()

## Only transcript_id and the sample columns are written out
df_transcript_output.drop(columns="gene_id", inplace=True)

df_transcript_output.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_counts_unfiltered.tsv", sep="\t", 
                             index=False)

# - Create two additional transcript count matrices

##     -- Transcripts from genes with 2+ transcripts expressed at median CPM > 1


##     -- Transcripts from med-relevant genes with 2+ transcripts expressed at median CPM > 1

In [None]:
## Calculate median CPM for transcripts

## Start again from the raw counts (unrounded, nothing filtered out)
df_transcript_cpm = fix_column_names(df_transcript.copy(), is_gene=False)
df_transcript_cpm = calculate_cpm(df_transcript_cpm, is_gene=False)

## Row-wise median over every column whose name contains "CPM"
df_transcript_cpm["median_CPM"] = df_transcript_cpm[df_transcript_cpm.filter(regex="CPM").columns].median(axis=1)

In [None]:
## Only keep transcripts with a median CPM > 1

## The per-sample columns are no longer needed
df_transcript_cpm = df_transcript_cpm[["gene_id", "transcript_id", "median_CPM"]].copy()

df_transcript_cpm = df_transcript_cpm.loc[df_transcript_cpm["median_CPM"] > 1].copy()

In [None]:
## Find transcript ids of transcripts from genes with 2+ transcripts at median CPM > 1

## keep=False flags every row whose gene_id occurs more than once among the kept transcripts
df_transcript_multiple = df_transcript_cpm.loc[df_transcript_cpm.duplicated(subset=["gene_id"], keep=False)].copy()

multiple_transcript_ids = df_transcript_multiple["transcript_id"].copy()

In [None]:
## Do the same as above but restricting to medically relevant genes

## Import disease relevant genes (tab separated table with a gene_id column)
disease_relevant_genes = pd.read_csv("../../../references/bernardo/medically_relevant_genes.tsv", sep="\t")


## Keep the multi-transcript rows whose gene is in the medically relevant list
df_transcript_multiple_med_relevant = df_transcript_multiple.loc[df_transcript_multiple["gene_id"].isin(
                                        disease_relevant_genes["gene_id"])].copy()


multiple_transcript_ids_med_relevant = df_transcript_multiple_med_relevant["transcript_id"].copy()

In [None]:
## Filter transcript output df by the transcript ids defined above and write it to a file
## (df_transcript_output holds the integer counts prepared when the unfiltered matrix was saved)


## Transcripts from genes with 2+ transcripts at median CPM > 1
df_transcript_output_multiple = df_transcript_output.loc[df_transcript_output["transcript_id"].isin(
                                    multiple_transcript_ids)].copy()


## Same, restricted to medically relevant genes
df_transcript_output_multiple_med_relevant = df_transcript_output.loc[df_transcript_output["transcript_id"].isin(
                                    multiple_transcript_ids_med_relevant)].copy()



df_transcript_output_multiple.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_counts_multiple_median_cpm_1.tsv",
                                     sep="\t", index=False)


df_transcript_output_multiple_med_relevant.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_counts_multiple_med_relevant_median_cpm_1.tsv",
                            sep="\t", index=False)

# Gene & Transcript level counts for all AD related genes

In [None]:
## Get AD names: table of AD GWAS genes (later cells use its gene_id and gene_name columns)
ad_names = pd.read_csv("../../../references/bernardo/AD_gwas_genes.tsv", sep="\t")

In [None]:
## Fix column names on a fresh copy of the raw transcript counts
df_transcript_ad = fix_column_names(df_transcript.copy(), is_gene=False)

In [None]:
## Fix transcripts: round the counts to whole numbers and store them as integers
df_transcript_ad[df_transcript_ad.columns[2:]] = round(df_transcript_ad[df_transcript_ad.columns[2:]],0).astype(int).copy()

## Remove transcripts whose id starts with "DQ" and transcripts located on the MT chromosome
df_transcript_ad = df_transcript_ad.loc[~df_transcript_ad["transcript_id"].str.startswith("DQ")]
df_transcript_ad = df_transcript_ad.loc[~df_transcript_ad["transcript_id"].isin(mito_tid)].copy()


## Inner merge: only transcripts of genes listed in ad_names remain, then sort by gene name
df_transcript_ad = df_transcript_ad.merge(ad_names, on="gene_id", how="inner")
df_transcript_ad.sort_values(by="gene_name", inplace=True)

In [None]:
## Create transcript id to gene name converter (one row per AD gene transcript)
transcript_converter = df_transcript_ad[["gene_name", "gene_id", "transcript_id"]].copy()

In [None]:
## Add chromosome to transcript converter

## One row per distinct (gene_id, chr) pair found in the annotation
bambu_ref_chr = bambu_ref[["gene_id", "chr"]].copy().drop_duplicates()

## Inner merge: a gene_id that is absent from bambu_ref_chr is dropped from the converter
transcript_converter = transcript_converter.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

In [None]:
## Put counts in format for DESEQ2 and save it

## Only transcript_id and the sample columns remain in the counts table
df_transcript_ad.drop(columns=["gene_id", "gene_name"], inplace=True)

df_transcript_ad.to_csv("../../../data/bernardo/processed/04.deseq2/AD_transcript_counts.tsv", sep="\t", index=False)
transcript_converter.to_csv("../../../data/bernardo/processed/04.deseq2/AD_transcript_converter.tsv", sep="\t", index=False)

In [None]:
## Fix column names on a fresh copy of the raw gene counts
df_gene_ad = fix_column_names(df_gene.copy(), is_gene=True)

In [None]:
## Fix genes: round the counts to whole numbers and store them as integers
df_gene_ad[df_gene_ad.columns[1:]] = round(df_gene_ad[df_gene_ad.columns[1:]],0).astype(int).copy()

## Remove genes whose id starts with "ERCC" and genes located on the MT chromosome
df_gene_ad = df_gene_ad.loc[~df_gene_ad["gene_id"].str.startswith("ERCC")]
df_gene_ad = df_gene_ad.loc[~df_gene_ad["gene_id"].isin(mito_gid)].copy()


## Inner merge: only genes listed in ad_names remain, then sort by gene name
df_gene_ad = df_gene_ad.merge(ad_names, on="gene_id", how="inner")
df_gene_ad.sort_values(by="gene_name", inplace=True)

In [None]:
## Create gene id to gene name converter
gene_converter = df_gene_ad[["gene_name", "gene_id"]].copy()

In [None]:
## Add chromosome to gene converter (inner merge: a gene_id absent from bambu_ref_chr is dropped)
gene_converter = gene_converter.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

In [None]:
## Only gene_id and the sample columns remain in the counts table
df_gene_ad.drop(columns="gene_name", inplace=True)

## Save the AD gene counts and the gene converter
df_gene_ad.to_csv("../../../data/bernardo/processed/04.deseq2/AD_gene_counts.tsv", sep="\t", index=False)
gene_converter.to_csv("../../../data/bernardo/processed/04.deseq2/AD_gene_converter.tsv", sep="\t", index=False)

# Full or unique length counts for all novel mitochondrial transcripts

In [None]:
## Import bambu full length counts per transcript
df_full_length = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/fullLengthCounts_transcript.txt", sep="\t")

## Import bambu unique counts per transcript
df_unique = pd.read_csv("../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/uniqueCounts_transcript.txt", sep="\t")

In [None]:
## Fix column names (no copy is taken here, so both dataframes are renamed in place)
df_full_length = fix_column_names(df_full_length, is_gene=False)

df_unique = fix_column_names(df_unique, is_gene=False)

In [None]:
## Keep transcripts located on the MT chromosome
df_full_length_mito = df_full_length.loc[df_full_length["transcript_id"].isin(mito_tid)].copy()
df_unique_mito = df_unique.loc[df_unique["transcript_id"].isin(mito_tid)].copy()

## Of those, keep the ones whose id starts with "Bambu" (presumably the newly discovered transcripts)
df_full_length_mito = df_full_length_mito.loc[df_full_length_mito["transcript_id"].str.startswith("Bambu")].copy()
df_unique_mito = df_unique_mito.loc[df_unique_mito["transcript_id"].str.startswith("Bambu")].copy()


## Round the counts to whole numbers and store them as integers
df_full_length_mito[df_full_length_mito.columns[2:]] = round(df_full_length_mito[df_full_length_mito.columns[2:]],0).astype(int).copy()
df_unique_mito[df_unique_mito.columns[2:]] = round(df_unique_mito[df_unique_mito.columns[2:]],0).astype(int).copy()


In [None]:
## Only transcript_id and the sample columns are written out
df_full_length_mito.drop(columns="gene_id", inplace=True)

df_unique_mito.drop(columns="gene_id", inplace=True)

In [None]:
## Save full length and unique counts of the mitochondrial "Bambu" transcripts
df_full_length_mito.to_csv("../../../data/bernardo/processed/04.deseq2/new_mito_full_length.tsv", sep="\t", index=False)
df_unique_mito.to_csv("../../../data/bernardo/processed/04.deseq2/new_mito_unique.tsv", sep="\t", index=False)

# Make converter for differentially expressed genes and transcripts

In [None]:
## Open results files
## For every comparison there are three tables: genes, transcripts from genes with multiple
## transcripts, and the same restricted to medically relevant genes. The "_two" and "_four"
## variants are read from the "two_cell_types" and "four_cell_types" files.

## AD vs CT
res_gene_AD = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_results_AD_vs_CT.csv")
res_trans_AD = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results_AD_vs_CT.csv")
res_trans_med_AD = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results_AD_vs_CT.csv")

res_gene_AD_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_two_cell_types_results_AD_vs_CT.csv")
## This used to load the med_relevant file a second time; it now reads the all-transcripts
## table, in line with every other res_trans_* variable
res_trans_AD_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_two_cell_types_results_AD_vs_CT.csv")
res_trans_med_AD_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_two_cell_types_results_AD_vs_CT.csv")

res_gene_AD_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_four_cell_types_results_AD_vs_CT.csv")
res_trans_AD_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_four_cell_types_results_AD_vs_CT.csv")
res_trans_med_AD_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_four_cell_types_results_AD_vs_CT.csv")



## Male vs Female
res_gene_SEX = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_results_M_vs_F.csv")
res_trans_SEX = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_results_M_vs_F.csv")
res_trans_med_SEX = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_results_M_vs_F.csv")

res_gene_SEX_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_two_cell_types_results_M_vs_F.csv")
res_trans_SEX_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_two_cell_types_results_M_vs_F.csv")
res_trans_med_SEX_two = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_two_cell_types_results_M_vs_F.csv")

res_gene_SEX_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/genes_filtered_four_cell_types_results_M_vs_F.csv")
res_trans_SEX_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_multiple_transcripts_four_cell_types_results_M_vs_F.csv")
res_trans_med_SEX_four = pd.read_csv("../../../data/bernardo/processed/04.deseq2/filtered_med_relevant_multiple_transcripts_four_cell_types_results_M_vs_F.csv")

In [None]:
## Fill NAs in gene_name column with the gene_id column
## The assignment aligns on the row index, so each missing gene_name receives the
## gene_id of its own row; rows that already have a gene_name are left untouched.

## AD vs CT
res_gene_AD.loc[res_gene_AD["gene_name"].isna(), "gene_name"] = res_gene_AD["gene_id"].copy()
res_trans_AD.loc[res_trans_AD["gene_name"].isna(), "gene_name"] = res_trans_AD["gene_id"].copy()
res_trans_med_AD.loc[res_trans_med_AD["gene_name"].isna(), "gene_name"] = res_trans_med_AD["gene_id"].copy()

res_gene_AD_two.loc[res_gene_AD_two["gene_name"].isna(), "gene_name"] = res_gene_AD_two["gene_id"].copy()
res_trans_AD_two.loc[res_trans_AD_two["gene_name"].isna(), "gene_name"] = res_trans_AD_two["gene_id"].copy()
res_trans_med_AD_two.loc[res_trans_med_AD_two["gene_name"].isna(), "gene_name"] = res_trans_med_AD_two["gene_id"].copy()

res_gene_AD_four.loc[res_gene_AD_four["gene_name"].isna(), "gene_name"] = res_gene_AD_four["gene_id"].copy()
res_trans_AD_four.loc[res_trans_AD_four["gene_name"].isna(), "gene_name"] = res_trans_AD_four["gene_id"].copy()
res_trans_med_AD_four.loc[res_trans_med_AD_four["gene_name"].isna(), "gene_name"] = res_trans_med_AD_four["gene_id"].copy()



## Male vs Female
res_gene_SEX.loc[res_gene_SEX["gene_name"].isna(), "gene_name"] = res_gene_SEX["gene_id"].copy()
res_trans_SEX.loc[res_trans_SEX["gene_name"].isna(), "gene_name"] = res_trans_SEX["gene_id"].copy()
res_trans_med_SEX.loc[res_trans_med_SEX["gene_name"].isna(), "gene_name"] = res_trans_med_SEX["gene_id"].copy()

res_gene_SEX_two.loc[res_gene_SEX_two["gene_name"].isna(), "gene_name"] = res_gene_SEX_two["gene_id"].copy()
res_trans_SEX_two.loc[res_trans_SEX_two["gene_name"].isna(), "gene_name"] = res_trans_SEX_two["gene_id"].copy()
res_trans_med_SEX_two.loc[res_trans_med_SEX_two["gene_name"].isna(), "gene_name"] = res_trans_med_SEX_two["gene_id"].copy()

res_gene_SEX_four.loc[res_gene_SEX_four["gene_name"].isna(), "gene_name"] = res_gene_SEX_four["gene_id"].copy()
res_trans_SEX_four.loc[res_trans_SEX_four["gene_name"].isna(), "gene_name"] = res_trans_SEX_four["gene_id"].copy()
res_trans_med_SEX_four.loc[res_trans_med_SEX_four["gene_name"].isna(), "gene_name"] = res_trans_med_SEX_four["gene_id"].copy()

In [None]:
## Sort every results table by padj (presumably the adjusted p-value), smallest first

## AD vs CT
res_gene_AD = res_gene_AD.sort_values(by="padj", ascending=True)
res_trans_AD = res_trans_AD.sort_values(by="padj", ascending=True)
res_trans_med_AD = res_trans_med_AD.sort_values(by="padj", ascending=True)

res_gene_AD_two = res_gene_AD_two.sort_values(by="padj", ascending=True)
res_trans_AD_two = res_trans_AD_two.sort_values(by="padj", ascending=True)
res_trans_med_AD_two = res_trans_med_AD_two.sort_values(by="padj", ascending=True)

res_gene_AD_four = res_gene_AD_four.sort_values(by="padj", ascending=True)
res_trans_AD_four = res_trans_AD_four.sort_values(by="padj", ascending=True)
res_trans_med_AD_four = res_trans_med_AD_four.sort_values(by="padj", ascending=True)




## Male vs Female
res_gene_SEX = res_gene_SEX.sort_values(by="padj", ascending=True)
res_trans_SEX = res_trans_SEX.sort_values(by="padj", ascending=True)
res_trans_med_SEX = res_trans_med_SEX.sort_values(by="padj", ascending=True)

res_gene_SEX_two = res_gene_SEX_two.sort_values(by="padj", ascending=True)
res_trans_SEX_two = res_trans_SEX_two.sort_values(by="padj", ascending=True)
res_trans_med_SEX_two = res_trans_med_SEX_two.sort_values(by="padj", ascending=True)

res_gene_SEX_four = res_gene_SEX_four.sort_values(by="padj", ascending=True)
res_trans_SEX_four = res_trans_SEX_four.sort_values(by="padj", ascending=True)
res_trans_med_SEX_four = res_trans_med_SEX_four.sort_values(by="padj", ascending=True)

In [None]:
## Create gene id / transcript id to gene name converters
## Each table is reduced to its identifier columns. For transcripts, the rows of the
## medically relevant table are appended to the res_trans_* table and exact duplicate
## rows are removed, so one transcript converter remains per comparison.

## AD vs CT
res_gene_AD = res_gene_AD[["gene_name", "gene_id"]].copy()
res_trans_AD = res_trans_AD[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_AD = res_trans_med_AD[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_AD = pd.concat([res_trans_AD, res_trans_med_AD]).drop_duplicates()

res_gene_AD_two = res_gene_AD_two[["gene_name", "gene_id"]].copy()
res_trans_AD_two = res_trans_AD_two[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_AD_two = res_trans_med_AD_two[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_AD_two = pd.concat([res_trans_AD_two, res_trans_med_AD_two]).drop_duplicates()

res_gene_AD_four = res_gene_AD_four[["gene_name", "gene_id"]].copy()
res_trans_AD_four = res_trans_AD_four[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_AD_four = res_trans_med_AD_four[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_AD_four = pd.concat([res_trans_AD_four, res_trans_med_AD_four]).drop_duplicates()





## Male vs Female
res_gene_SEX = res_gene_SEX[["gene_name", "gene_id"]].copy()
res_trans_SEX = res_trans_SEX[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_SEX = res_trans_med_SEX[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_SEX = pd.concat([res_trans_SEX, res_trans_med_SEX]).drop_duplicates()

res_gene_SEX_two = res_gene_SEX_two[["gene_name", "gene_id"]].copy()
res_trans_SEX_two = res_trans_SEX_two[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_SEX_two = res_trans_med_SEX_two[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_SEX_two = pd.concat([res_trans_SEX_two, res_trans_med_SEX_two]).drop_duplicates()

res_gene_SEX_four = res_gene_SEX_four[["gene_name", "gene_id"]].copy()
res_trans_SEX_four = res_trans_SEX_four[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_med_SEX_four = res_trans_med_SEX_four[["gene_name", "gene_id", "transcript_id"]].copy()
res_trans_SEX_four = pd.concat([res_trans_SEX_four, res_trans_med_SEX_four]).drop_duplicates()

In [None]:
## Add chromosome!
## Inner merge on gene_id with the (gene_id, chr) table: a row whose gene_id is absent
## from bambu_ref_chr is dropped from the converter.

## AD vs CT
res_gene_AD = res_gene_AD.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_AD = res_trans_AD.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

res_gene_AD_two = res_gene_AD_two.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_AD_two = res_trans_AD_two.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

res_gene_AD_four = res_gene_AD_four.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_AD_four = res_trans_AD_four.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()




## Male vs Female
res_gene_SEX = res_gene_SEX.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_SEX = res_trans_SEX.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

res_gene_SEX_two = res_gene_SEX_two.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_SEX_two = res_trans_SEX_two.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

res_gene_SEX_four = res_gene_SEX_four.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()
res_trans_SEX_four = res_trans_SEX_four.merge(bambu_ref_chr, on="gene_id", how="inner").drop_duplicates()

In [None]:
## Save the gene and transcript converters (these tables hold identifiers, not counts)

## AD vs CT
res_gene_AD.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_AD.tsv", sep="\t", index=False)
res_trans_AD.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_AD.tsv", sep="\t", index=False)

res_gene_AD_two.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_AD_two.tsv", sep="\t", index=False)
res_trans_AD_two.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_AD_two.tsv", sep="\t", index=False)

res_gene_AD_four.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_AD_four.tsv", sep="\t", index=False)
res_trans_AD_four.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_AD_four.tsv", sep="\t", index=False)




## Male vs Female
res_gene_SEX.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_SEX.tsv", sep="\t", index=False)
res_trans_SEX.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_SEX.tsv", sep="\t", index=False)

res_gene_SEX_two.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_SEX_two.tsv", sep="\t", index=False)
res_trans_SEX_two.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_SEX_two.tsv", sep="\t", index=False)

res_gene_SEX_four.to_csv("../../../data/bernardo/processed/04.deseq2/gene_results_converter_SEX_four.tsv", sep="\t", index=False)
res_trans_SEX_four.to_csv("../../../data/bernardo/processed/04.deseq2/transcript_results_converter_SEX_four.tsv", sep="\t", index=False)