In [2]:
## Load libraries
import pandas as pd
import numpy as np

In [3]:
def fix_column_names(df, is_gene=False):
    '''
    function name: fix_column_names

    purpose: Fixing the column names, making them shorter, informative, and consistent.
             The leading identifier column(s) are renamed to "gene_id" (gene data) or
             "transcript_id", "gene_id" (transcript data). For every remaining column,
             the text from the first "_mapped" onwards is dropped and "_counts" is appended.

    input: df - the raw counts dataframe for either genes or transcripts
           is_gene - True if df holds gene counts (one identifier column),
                     False (default) if it holds transcript counts (two identifier columns)

    output: Same dataframe object (modified in place) with improved column names
    '''

    ## Identifier columns come first; how many there are depends on the dataset type
    id_names = ["gene_id"] if is_gene else ["transcript_id", "gene_id"]

    ## Every column after the identifiers is a count column: keep the part
    ## before "_mapped" and tag it as counts
    count_names = [
        col.split("_mapped")[0] + "_counts"
        for col in df.columns[len(id_names):].tolist()
    ]

    ## Rename columns (this mutates the dataframe that was passed in)
    df.columns = id_names + count_names

    return df

In [None]:
## Import single-cell metadata (only the two columns that are used afterwards)
metadata = pd.read_csv(
    "../public_single_cell_data/primary_motor_cortex_allen_institute/metadata.csv",
    usecols=["sample_name", "class_label"],
)

In [4]:
## Keep only the sample identifier and its class label
metadata = metadata.loc[:, ["sample_name", "class_label"]].copy()

In [5]:
## Class labels starting with "G" are labelled "Neuron"; every other label is "Non-neuronal"
is_neuron = metadata["class_label"].str.startswith("G")
metadata.loc[is_neuron, "cell_type"] = "Neuron"
metadata.loc[~is_neuron, "cell_type"] = "Non-neuronal"

In [6]:
## The class label is no longer needed once cell_type has been derived from it
metadata = metadata.loc[:, ["sample_name", "cell_type"]].copy()

In [7]:
## Count how many samples fall into each cell_type category
metadata["cell_type"].value_counts()

Neuron          72528
Non-neuronal     4005
Name: cell_type, dtype: int64

In [15]:
## Load the expression matrix; nrows=5000 limits the read to the first 5000 data rows
df_allen = pd.read_csv(
    "../public_single_cell_data/primary_motor_cortex_allen_institute/matrix.csv",
    nrows=5000,
)

In [16]:
## Attach the cell_type label to each sample; the inner join keeps only
## samples that are present in both the matrix and the metadata
df_final = df_allen.merge(
    metadata,
    how="inner",
    on="sample_name",
)

In [17]:
## Rebind df_allen to None (presumably to release the raw matrix from memory;
## the merged data now lives in df_final)
df_allen = None

In [18]:
## Discard every row (sample) that has at least one missing value
df_final = df_final.dropna(axis=0, how="any")

In [19]:
## Count the samples per cell_type in the merged, NaN-free table
df_final["cell_type"].value_counts()

Neuron          4795
Non-neuronal     205
Name: cell_type, dtype: int64

In [20]:
## Balance the two classes by downsampling the neurons to the number of
## non-neuronal cells. The sample size is taken from the data rather than
## hard-coded, and the draw is seeded so re-running gives the same cells.
RANDOM_SEED = 42

neuron_cells = df_final.loc[df_final["cell_type"] == "Neuron"]
non_neuronal_cells = df_final.loc[df_final["cell_type"] == "Non-neuronal"]

df_final = pd.concat([
    neuron_cells.sample(n=len(non_neuronal_cells), random_state=RANDOM_SEED),
    non_neuronal_cells,
])

In [21]:
## The sample identifier is not needed any more; rebind instead of using inplace=True
df_final = df_final.drop(columns="sample_name")

In [22]:
## Use the cell type as the row label; set_index also removes it from the columns
df_final = df_final.set_index("cell_type")

In [23]:
## Transpose so that the former columns become rows and each column is a
## sample labelled by its cell_type
df_final_T = df_final.transpose()

In [24]:
## Rebind df_final to None (presumably to release memory; the transposed
## table in df_final_T is what gets used from here on)
df_final = None

In [25]:
## Copy the row index into a "gene_name" column ...
df_final_T["gene_name"] = df_final_T.index

## ... and move that column to the first position
df_final_T.insert(0, "gene_name", df_final_T.pop("gene_name"))

In [26]:
## Write the table as a tab-separated file; the row index is not written
df_final_T.to_csv(
    "../processed_data/primary_motor_cortex_allen_single_cell_data.tsv",
    sep="\t",
    index=False,
)