In [1]:
## Load libraries
import pandas as pd
import numpy as np

In [2]:
def fix_column_names(df, is_gene=False):
    '''
    function name: fix_column_names

    purpose: Fixing the column names, making them smaller, informative, and consistent

    input: df      - The raw counts dataframe for either genes or transcripts.
                     A gene table has one leading identifier column; a transcript
                     table has two (transcript id first, then gene id).
           is_gene - True when df is a gene counts table, False (default) when it
                     is a transcript counts table.

    output: A copy of the dataframe with improved column names. The identifier
            columns become "gene_id" (or "transcript_id", "gene_id") and every
            count column keeps the text before "_mapped" with "_counts" appended.
            The dataframe passed in is left unchanged.
    '''

    ## Names for the leading identifier columns depend on the table type
    id_names = ["gene_id"] if is_gene else ["transcript_id", "gene_id"]

    ## Every column after the identifier columns is treated as a count column
    count_columns = df.columns[len(id_names):].tolist()

    ## Keep the part of each name before "_mapped" and tag it as counts
    count_names = [col.split("_mapped")[0] + "_counts" for col in count_columns]

    ## Rename on a copy so the caller's dataframe is not modified; renaming in place
    ## would stack "_counts" suffixes if the function were called twice on the same frame
    renamed = df.copy()
    renamed.columns = id_names + count_names

    return renamed

In [4]:
## Read the tab-separated gene counts table and tidy its column names in one step
bulk_counts_path = "../../../data/bernardo/raw/ad_vs_ct_pilot_study_february_2023_GRCh38-107_discovery/bambu_discovery/counts_gene.txt"
df_bulk = fix_column_names(pd.read_csv(bulk_counts_path, sep="\t"), is_gene=True)

In [5]:
## Include disease status in column names
## One condition label per count column, listed in the same left-to-right order
## as the count columns of df_bulk (every column after "gene_id")
sample_conditions = ["AD", "CT", "AD", "CT", "CT", "CT", "CT", "AD", "AD", "CT", "AD", "AD"]

count_columns = df_bulk.columns[1:].to_list()

## The labels are matched to columns purely by position, so a length mismatch would
## either fail with an unhelpful IndexError or silently leave labels unused
if len(count_columns) != len(sample_conditions):
    raise ValueError(
        "sample_conditions has " + str(len(sample_conditions)) + " labels but df_bulk has "
        + str(len(count_columns)) + " count columns"
    )

## Keep the part of each column name before "PA" and append that sample's condition
## NOTE: a name without "PA" is kept whole, so re-running this cell appends the label again
list_new_names = ["gene_id"] + [
    column.split("PA")[0] + condition
    for column, condition in zip(count_columns, sample_conditions)
]

df_bulk.columns = list_new_names

In [6]:
## Read the HUGO gene symbol converter table from the references folder
hugo_converter_path = "../../../references/bernardo/hugo_name_converter.csv"
name_converter = pd.read_csv(hugo_converter_path)

In [7]:
## Inner merge of the counts matrix with the converter, based on gene_id
## (rows whose gene_id is missing from either table are not kept)
df_bulk_named = pd.merge(df_bulk, name_converter, how="inner", on="gene_id")

In [8]:
## Remove the gene_id key column from the merged table.
## A non-mutating drop with errors="ignore" keeps this cell safe to re-run; the
## inplace version raises KeyError the second time because gene_id is already gone.
df_bulk_named = df_bulk_named.drop(columns="gene_id", errors="ignore")

In [9]:
## Show the current column names of the merged table
df_bulk_named.columns

Index(['sample_579_AD', 'sample_1131_CT', 'sample_1218_AD', 'sample_1304_CT',
       'sample_1271_CT', 'sample_5356_CT', 'sample_1163_CT', 'sample_5295_AD',
       'sample_5292_AD', 'sample_1092_CT', 'sample_1186_AD', 'sample_1291_AD',
       'gene_name'],
      dtype='object')

In [10]:
## Final column layout: gene name first, then the CT samples, then the AD samples
new_column_order = [
    "gene_name",
    ## CT samples
    "sample_1131_CT",
    "sample_1304_CT",
    "sample_1271_CT",
    "sample_5356_CT",
    "sample_1163_CT",
    "sample_1092_CT",
    ## AD samples
    "sample_5292_AD",
    "sample_1186_AD",
    "sample_1291_AD",
    "sample_579_AD",
    "sample_1218_AD",
    "sample_5295_AD",
]
       

In [11]:
## Select the columns in the chosen order and take an independent copy of the result
df_bulk_final = df_bulk_named.loc[:, new_column_order].copy()

In [12]:
## Write the reordered table as a tab-separated file, without the row index
processed_bulk_path = "../../../data/bernardo/processed/06.deconvolution_analysis/processed_bulk_data.tsv"
df_bulk_final.to_csv(processed_bulk_path, sep="\t", index=False)