In [1]:
## Load libraries
import pandas as pd
import numpy as np

In [2]:
def fix_column_names(df, is_gene=False):
    """
    Rename the columns of a raw counts dataframe to short, consistent names.

    The leading identifier column(s) are renamed by position: for gene-level
    data the first column becomes "gene_id"; for transcript-level data the
    first two columns become "transcript_id" and "gene_id". Every remaining
    column keeps only the text before its first "_mapped" and gets a
    "_counts" suffix appended.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw counts dataframe for either genes or transcripts. Its column
        labels are overwritten in place.
    is_gene : bool, default False
        Truthy when `df` is a gene counts table (one identifier column);
        otherwise `df` is treated as a transcript table (two identifier
        columns).

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in, with the new names.
    """

    ## Identifier columns are recognised purely by position
    id_names = ["gene_id"] if is_gene else ["transcript_id", "gene_id"]
    first_count_position = 1 if is_gene else 2

    ## Everything after the identifier column(s) is a count column
    raw_count_names = df.columns[first_count_position:].tolist()
    count_names = [name.split("_mapped")[0] + "_counts" for name in raw_count_names]

    ## Rename columns on the dataframe that was passed in
    df.columns = id_names + count_names

    return df

In [3]:
## Load the cell metadata table for the medial temporal cortex single-cell dataset
metadata = pd.read_csv(
    "../../../data/bernardo/raw/public_single_cell_data/medial_temporal_cortex_allen_institute_2/cell_metadata.csv"
)

In [4]:
## Drop unnecessary columns by keeping only the four listed here
columns_to_keep = ["specimen_name", "QCpass", "class_label", "subclass_label"]
metadata = metadata.loc[:, columns_to_keep].copy()

In [5]:
## Drop cells that don't pass QC.
## QCpass is cast to string before the comparison: if pandas parses the column as
## boolean, comparing it directly against the string "True" matches no rows at all.
## With the cast, both a string column and a boolean column are filtered correctly.
passed_qc = metadata["QCpass"].astype(str) == "True"
metadata = metadata.loc[passed_qc].copy().reset_index(drop=True)

In [6]:
## Assign a coarse cell_type: "Neuron" when class_label starts with "Neuronal:",
## "Non-neuronal" for every other row
is_neuronal = metadata["class_label"].str.startswith("Neuronal:")
metadata.loc[is_neuronal, "cell_type"] = "Neuron"
metadata.loc[~is_neuronal, "cell_type"] = "Non-neuronal"

In [7]:
## Keep just the specimen name and the cell type assigned to it
metadata = metadata.loc[:, ["specimen_name", "cell_type"]].copy()

In [9]:
## Create final metadata dataframe with 2000 randomly sampled cells from each cell type of interest.
## Each cell type uses its own fixed random_state so the selection is reproducible.
## Note: .sample raises ValueError if a cell type has fewer rows than n_cells_per_type.
n_cells_per_type = 2000
metadata_final = pd.concat([
    metadata.loc[metadata["cell_type"] == "Neuron"].sample(n=n_cells_per_type, random_state=27),
    metadata.loc[metadata["cell_type"] == "Non-neuronal"].sample(n=n_cells_per_type, random_state=28),
])

In [10]:
## Load the Allen Institute medial temporal cortex table (the "all-nuclei" reference CSV)
df_allen = pd.read_csv(
    "../../../data/bernardo/raw/public_single_cell_data/medial_temporal_cortex_allen_institute_2/Reference_MTG_RNAseq_all-nuclei.2022-06-07.csv"
)

In [11]:
## Keep only the Allen rows whose specimen_name was sampled into metadata_final,
## attaching the cell_type label to each of them (inner join on specimen_name)
df_final = pd.merge(df_allen, metadata_final, on="specimen_name", how="inner")

In [12]:
## Count how many merged rows belong to each cell type
df_final.loc[:, "cell_type"].value_counts()

Non-neuronal    2000
Neuron          2000
Name: cell_type, dtype: int64

In [13]:
## Set the allen dataframe to None so it doesn't take up memory: the merged subset
## already lives in df_final, and rebinding the name drops this reference to the full
## table (the memory can be reclaimed once nothing else refers to it)
df_allen = None

In [14]:
## Drop every row that contains at least one missing value
df_final = df_final.dropna(axis=0, how="any")

In [15]:
## Remove the specimen name column (reassigning rather than modifying in place)
df_final = df_final.drop(columns=["specimen_name"])

In [16]:
## Make cell_type the index; set_index also removes it from the regular columns
df_final = df_final.set_index("cell_type")

In [17]:
## Transpose: the former columns become the rows, and the cell_type index becomes the column labels
df_final_T = df_final.transpose()

In [18]:
## Free up memory: only the transposed frame (df_final_T) is used from here on, so
## rebind df_final to None to drop this reference to the un-transposed frame
df_final = None

In [19]:
## Create gene_name column from the row index and place it directly as the first column
df_final_T.insert(0, "gene_name", df_final_T.index)

In [20]:
## Write the table as a tab-separated file. index=False leaves the row index out of the
## file; the row labels are carried by the "gene_name" column instead.
df_final_T.to_csv(
    "../../../data/bernardo/processed/06.deconvolution_analysis/medial_temporal_gyrus_allen_single_cell_data_two_types.tsv",
    sep="\t",
    index=False,
)