In [1]:
import pandas as pd
import os
import glob
import numpy as np

In [2]:
# Directory the notebook is launched from, and the project root obtained by
# dropping every "/Z_codes_local" component from that path.
code_dir = os.getcwd()
base_dir = "".join(code_dir.split("/Z_codes_local"))

In [3]:
def mouse_to_human(inList, reverse=False):
    """Translate gene symbols between mouse and human naming.

    The mapping is read on every call from
    ``base_dir + "/Human_to_Mouse_geneNames.csv"``, which must contain the
    columns ``mouse_geneName`` and ``human_geneName``.  Matching is
    case-insensitive (both the queries and the table are upper-cased).

    Parameters
    ----------
    inList : iterable
        Gene symbols to translate; each element is passed through ``str()``.
    reverse : bool, default False
        ``False`` maps mouse -> human, ``True`` maps human -> mouse.

    Returns
    -------
    list of str
        One entry per input, in input order.  Symbols with no match in the
        table are returned as ``""``.  Forward results are upper-case;
        reverse results are passed through ``str.title()``.

    Notes
    -----
    * Table rows where either name is missing are ignored, so a missing
      partner yields ``""`` rather than the literal string "NAN"/"Nan".
    * When a source symbol occurs several times in the table, the first row
      wins.
    * NOTE(review): ``str.title()`` also capitalises letters that follow a
      digit or punctuation (e.g. "CD8A" -> "Cd8A"); kept for compatibility
      with existing outputs -- confirm whether downstream matching is
      case-sensitive.
    """
    queries = [str(x).upper() for x in inList]

    mmhs_ref = base_dir + "/Human_to_Mouse_geneNames.csv"
    mmhs_df = pd.read_csv(mmhs_ref)

    if reverse:
        src_col, dst_col = 'human_geneName', 'mouse_geneName'
    else:
        src_col, dst_col = 'mouse_geneName', 'human_geneName'

    # Drop incomplete pairs so NaN is never stringified into a fake gene name.
    pairs = mmhs_df[[src_col, dst_col]].dropna()

    # Dict lookup replaces the per-query list scan; setdefault keeps the first
    # occurrence of a duplicated source symbol.
    lookup = {}
    for src, dst in zip(pairs[src_col].tolist(), pairs[dst_col].tolist()):
        lookup.setdefault(str(src).upper(), str(dst).upper())

    outList = [lookup.get(q, "") for q in queries]
    if reverse:
        outList = [x.title() for x in outList]
    return outList

## 0. Convert human references to mouse

*Remove spaces from the gene names in one file*
# One-off clean-up: strip embedded spaces from the gene symbols of the Choi 2015
# signature file and overwrite that file in place.
# The path is built from base_dir, like every other path in this notebook.
Choi_2015 = base_dir + "/X_Formatted_csv/CD4/2015_NAT_IMMUNOL_Choi_signatures.csv"
Choi_2015_df = pd.read_csv(Choi_2015)
# Vectorised .str.replace leaves missing symbols as NaN instead of raising
# AttributeError on a float, as a per-element x.replace(...) would.
Choi_2015_df["gene_symbol"] = Choi_2015_df["gene_symbol"].str.replace(" ", "", regex=False)
Choi_2015_df.to_csv(Choi_2015, index=False)

*Human to mouse*

In [101]:
# Signature files sit one folder below X_Formatted_csv, hence the trailing "/*".
input_dir = base_dir + '/X_Formatted_csv/*'
output_dir = base_dir + '/X_GeneSignatures_mm'

sig_files = glob.glob("%s/*signatures.csv"%input_dir)
for i in sig_files:
    i_mouse = i.replace(".csv", "_mouse.csv")
    # Skip inputs whose converted copy already exists next to them.
    if os.path.exists(i_mouse):
        continue
    print(i)
    i_df = pd.read_csv(i)
    # reverse=True maps human symbols to mouse; unmatched symbols come back as ''.
    i_df['gene_symbol'] = mouse_to_human(i_df['gene_symbol'].tolist(), reverse=True)
    # Keep only the rows that received a mouse symbol.
    i_df = i_df[i_df['gene_symbol'].ne('')]
    i_df.to_csv(i_mouse, index=False)

/media/pipkin/ROCKET-PRO/T_cell_signature_Reference/X_Formatted_csv/CD4/2015_NAT_IMMUNOL_Choi_signatures.csv


## 1. Summarize all references 

In [107]:
# All outputs of this section are written relative to the mouse signature folder.
wk_dir = base_dir + "/X_GeneSignatures_mm"
os.chdir(wk_dir)

input_dir = base_dir + '/X_Formatted_csv'

# glob returns files in arbitrary order; sort so the combined table is reproducible.
all_sig_files = sorted(glob.glob("%s/*/*signatures_mouse.csv"%input_dir))

# Collect the per-file tables and concatenate once: DataFrame.append inside a
# loop copies the growing frame every iteration and was removed in pandas 2.0.
sig_frames = []
for i in all_sig_files:
    # Prefix every signature name with its source file, e.g. "<file>---<gs_name>".
    i_basename = i.split("/")[-1].replace("_signatures_mouse.csv", "")
    i_df = pd.read_csv(i)[['gs_name', 'gene_symbol']].copy()
    i_df['gs_name'] = [i_basename + "---" + x for x in i_df['gs_name'].values]
    sig_frames.append(i_df)

if sig_frames:
    all_sig_df = pd.concat(sig_frames)
else:
    # No converted files found: keep the expected columns so the steps below
    # yield empty tables instead of failing with a KeyError.
    all_sig_df = pd.DataFrame(columns=['gs_name', 'gene_symbol'])

# Normalise signature names: no spaces, no "/", and "ï" replaced by plain "i".
all_sig_df['gs_name'] = [x.replace(" ", "_").replace("/","-").replace("ï", "i") for x in all_sig_df['gs_name'].values]

# Number of non-missing gene symbols per signature.
all_sig_df_sum = all_sig_df.groupby('gs_name').count()
all_sig_df_sum.columns = ['gene_number']

In [108]:
# Both files are written relative to the current working directory.
# Long-format gene table: the row index is not written.
all_sig_df.to_csv(path_or_buf="all_mouse_T_cell_signatures.csv", index=False)
# Per-signature gene counts: the index (gs_name) is written as the first column.
all_sig_df_sum.to_csv(path_or_buf="all_mouse_T_cell_gs_signatures_summary.csv", index=True)

*Update annotated file*

In [109]:
from datetime import datetime
# Local date as a YYYYMMDD stamp; used to name the annotation file.
today_date = "{:%Y%m%d}".format(datetime.today())

In [110]:
anno_file_dir = base_dir + '/Y_annotated'
anno_files = glob.glob("%s/plotuse_gs*.csv"%anno_file_dir)
# File names carry a YYYYMMDD stamp, so the lexicographically last one is the newest.
anno_files.sort()

old_anno_file = anno_files[-1]
# Strip extension, prefix and optional "_anno" suffix to leave only the date stamp.
old_date = old_anno_file.split("/")[-1].replace(".csv", "").replace("plotuse_gs_", "").replace("_anno","")

#--- Only update the file if the last update was not today
if old_date != today_date:
    #--- Use the annotated file if it is available
    if  os.path.exists(old_anno_file.replace("plotuse", "anno_plotuse")):
        old_anno_df = pd.read_csv(old_anno_file.replace("plotuse", "anno_plotuse"))
    else:
        old_anno_df = pd.read_csv(old_anno_file)
        
    print("Last update: %s"%old_date)
    print("Today: %s"%today_date)
    # Sorted so the appended rows have a reproducible order (set order is arbitrary).
    new_gs = sorted(set(all_sig_df_sum.index) - set(old_anno_df['gs_name']))
    
    #--- If there are actually new gene signatures, update the file
    if len(new_gs) > 0:
        print("Add %s new signatures"%str(len(new_gs)))
        # New signatures start with blank annotation fields to be filled in by hand.
        blank = [""] * len(new_gs)
        add_anno_df = pd.DataFrame({"gs_name": new_gs, "type": blank,
                                    "use": blank, "plot_use": blank,
                                    "abbr": blank, "order": blank})
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        new_anno_df = pd.concat([old_anno_df, add_anno_df])

        new_anno_file = old_anno_file.replace(old_date, today_date)
        new_anno_df.to_csv(new_anno_file, index=False)
else:
    print("Up to date")

Last update: 20200928
Today: 20200929


## 2. Create subset references

*Select most up-to-date annotation file*

In [118]:
anno_file_dir = base_dir + '/Y_annotated'
anno_files = sorted(glob.glob("%s/plotuse_gs*.csv"%anno_file_dir))

# The last path in sorted order is treated as the most recent annotation file.
old_anno_file = anno_files[-1]

# Reduce the file name to its date stamp by removing the fixed name parts.
old_date = old_anno_file.rsplit("/", 1)[-1]
for name_part in (".csv", "plotuse_gs_", "_anno"):
    old_date = old_date.replace(name_part, "")

#--- Use the annotated ("anno_plotuse") file if it is available, else the plain one
annotated_file = old_anno_file.replace("plotuse", "anno_plotuse")
old_anno_df = pd.read_csv(annotated_file if os.path.exists(annotated_file) else old_anno_file)

In [119]:
# Work inside the mouse signature folder so the relative read below resolves there.
wk_dir = "%s/X_GeneSignatures_mm" % base_dir
os.chdir(wk_dir)

# gs_anno is an alias of the annotation table selected above (same object, no copy).
gs_anno = old_anno_df
all_sig_df = pd.read_csv("all_mouse_T_cell_signatures.csv")

In [120]:
# Keep every signature not explicitly marked 'n' in the 'use' column
# (rows with a blank/missing 'use' value are kept as well).
gs_anno_use = gs_anno[gs_anno['use'] != 'n']
# Vectorised membership test instead of scanning the name array once per row.
all_sig_df_use = all_sig_df[all_sig_df['gs_name'].isin(gs_anno_use['gs_name'])]

# Blank 'type' cells are read back from CSV as NaN; drop them first, because
# np.unique sorts its input and cannot order a mix of str and float values.
gs_anno_use_types = np.unique(gs_anno_use['type'].dropna())

In [123]:
# Per-signature gene-count summaries for the human and mouse collections.
# Paths are built from base_dir (as for the file written in section 1)
# instead of a machine-specific absolute path.
hs_sum_df = pd.read_csv(base_dir + '/X_GeneSignatures_hs/all_human_T_cell_gs_signatures_summary.csv')
mm_sum_df = pd.read_csv(base_dir + '/X_GeneSignatures_mm/all_mouse_T_cell_gs_signatures_summary.csv')