# Notebook to generate a dictionary of canonical gene markers

Defining the dictionary

In [1]:
import pickle
from pyprojroot.here import here

In [2]:
# Canonical marker-gene panels.
#
# `marker_genes_dict` maps a compartment key (e.g. 'lineages', 'T', 'B') to a
# panel, i.e. a dict of {label: [gene symbols]}. Several compartment keys use
# the very same panel, so every shared panel is written once below and copied
# into each key that needs it instead of being pasted repeatedly.


def _copy_panel(panel, skip=()):
    """Return an independent copy of ``panel`` without the labels in ``skip``.

    Parameters
    ----------
    panel : dict
        Mapping of label -> list of gene symbols.
    skip : iterable of str, optional
        Labels to leave out of the copy.

    Returns
    -------
    dict
        A new dict in the same label order whose gene lists are new list
        objects, so editing one compartment never alters another one.
    """
    return {label: list(genes) for label, genes in panel.items() if label not in skip}


# T / NK sub-population panel, reused by every T-cell key and (without the
# CD4/CD8 rows) by the 'NK' key. Genes flagged NEGATIVE in the trailing
# comments are listed on purpose as markers expected to be absent.
_T_NK_PANEL = {
    "CD4": ["CD4", "CD8A", "CD8B", "GZMK"],  # NEGATIVE "CD8A", "CD8B", "GZMK"
    "CD8": ["CD8A", "CD8B", "GZMK", "CD4"],  # NEGATIVE "CD4"
    "Naive/CM": ["ANXA1", "PASK", "SELL", "LEF1", "NOSIP", "CCR7", "TCF7", "ACTN1", "FOXP1", "KLF2", "ITGA6"],
    "Effector/Mem": ["ZNF683", "KLRB1", "PRDM1", "CX3CR1", "EOMES", "KLRG1", "TNFSF13B", "GZMK", "CCL5", "CCL4", "NKG7", "CD69", "ITGAE"],
    "T helper": ["CXCR3", "GATA3", "RORC", "RORA", "IL17F", "IL17A", "CCR6", "CXCR6", "IFNG", "IL4", "IL6ST", "CXCR5", "CXCL13", "PDCD1"],
    "IFN response": ["IFI16", "IFI35", "IFI44", "IFI44L", "IFI6", "IFIH1", "IFIT1", "IFIT2", "IFIT3", "IFIT5", "ISG15"],
    "T reg": ["IL32", "CCR7", "LEF1", "TCF7", "FOXP3", "CTLA4", "IL2RA", "ICOS", "TIGIT", "TOX2", "IKZF2", "GATA3", "CD28"],
    "Gamma Delta": ["TRGC1", "TRGC2", "TRDC"],  # NEGATIVE "CD8A", "CD8B", "CD4"
    # "GZMH" and "PRF1" used to be listed twice here; each now appears once.
    "Cytotoxic": ["GZMK", "GZMH", "CCL5", "CCL4", "NKG7", "CD69", "PRF1", "ITGAE", "CST7", "GZMA", "CCL4L2", "KLRG1", "CTSW", "GZMM", "KLRK1", "HLA-C", "XCL2", "XCL1"],
    "Exhausted": ["HAVCR2", "LAG3", "PDCD1", "TIGIT", "TOX", "TOX2", "LAYN", "CTLA4"],
    "MAIT": ["KLRB1", "IL7R", "SLC4A10"],
    "ILC": ["KIT", "NCR1", "KLRG1"],
    # "GNLY" used to be listed twice here; it now appears once.
    "NK": ["NCAM1", "FCGR3A", "CX3CR1", "GNLY", "KLRC2", "KLRD1", "KLRC3", "KLRK1", "KLRC1", "NKG7"],
    "Proliferative": ["MKI67", "TOP2A", "STMN1", "UBE2C", "PCLAF", "CENPF", "CDK1"],
}

# Monocyte / dendritic-cell panel, reused by 'Mono' and 'DC' and (without the
# "aDC" row) by 'Mono_and_DC'.
_MYELOID_PANEL = {
    "Monocytes": ["CD14", "S100A8", "S100A9", "LYZ", "VCAN", "FCN1"],
    "Non_classical monocytes": ["FCGR3A", "CX3CR1", "HLA-DRB1", "HLA-DRA"],
    "DC1": ["CLEC9A", "XCR1", "IDO1", "CLNK", "ZNF366"],
    "DC2": ["CD1C", "FCER1A", "CLEC10A"],
    "DC3": ["CD1C", "S100A8", "S100A9", "ANXA1"],
    "DC4": ["ITGAX", "FCGR3A", "SERPINA1", "LILRB2", "SIGLEC10"],
    "DC5": ["AXL", "SIGLEC6", "CD22", "DAB2"],
    "aDC": ["CCL19", "CCR7", "IL7R", "AIRE"],
}

# pDC and HSC lists, reused by 'pDC_and_HSCs', 'pDC' and 'HSC'.
_PDC_HSC_PANEL = {
    "pDC": ["IL3RA", "IRF7", "LILRA4", "IRF8", "JCHAIN", "GZMB"],
    "HSCs": ["CD34", "KIT"],
}

marker_genes_dict = {
    # Coarse panel covering all major lineages.
    'lineages': {
        'B+PC': ["CD79A", "MS4A1", "CD19", "JCHAIN", "MZB1", "TCL1A", "FCER2", "CD27"],
        'T+NK': ["CD3D", "CD4", "CD8A", "FOXP3", "NCAM1", "GNLY", "NKG7", "KLRD1"],
        'Mono+DC': ["CST3", "LYZ", "CD14", "FCGR3A", "CLEC9A", "CLEC10A", "CLEC4C", "CD1C"],
        'HSCs': ["CD34", "KIT"],
        'MC': ["TPSAB1", "CPA3"],
        'pDC': ["IL3RA", "LILRA4"],
        'Platelets+RBC': ["PPBP", "HBA1", "HBB"],
        # NOTE: the key spelling 'Profileration' is kept exactly as it was,
        # because anything that indexes this label would break on a rename.
        'Profileration': ["MKI67", "TOP2A"],
    },
    'T_and_NK': _copy_panel(_T_NK_PANEL),
    'Mono_and_DC': _copy_panel(_MYELOID_PANEL, skip=("aDC",)),
    'B': {
        "B": ["MS4A1", "CD79A", "CD79B"],
        "Naive B cell": ["TCL1A", "FCER2", "IGHD", "IGHM", "CCR7", "SELL", "CD79A", "VPREB3", "FCRL1", "NIBAN3", "CD79B", "HVCN1", "CD72", "CD19"],
        "Memory B cell": ["CD27", "TNFRSF13B", "BANK1", "IGHG1", "IGHG2", "IGHA1", "IGHA2", "PRDM1", "CD24", "CD74", "HLA-DRA", "BLK", "SPIB", "P2RX5", "CD37"],
        "Immature B cell": ["CD19", "RAG1", "RAG2", "CD9", "SOX4"],
        "Atypical B cell": ["TBX21", "ITGAX"],
        "Activated B cell": ["CD69", "CD83", "NFKB1", "NFKB2"],
        "ISG related B cell": ["ISG15", "IFI6", "IFITM1", "IFITM2"],
        "Plasma cell": ["MZB1", "DERL3", "JCHAIN", "XBP1", "PRDM1", "IRF4"],
    },
    'Plasma': {
        'Plasma': ["MZB1", "SDC1", "JCHAIN", "DERL3", "XBP1", "IRF4", "PRDM1", "IGHM", "IGHD", "IGHE", "IGHA1", "IGHA2", "IGHG1", "IGHG2", "IGHG3", "IGHG4", "IGLC3", "IGLC1", "IGHGP"],
    },
    'pDC_and_HSCs': _copy_panel(_PDC_HSC_PANEL),
    'Platelets_and_RBC': {
        'Platelets': ["PPBP"],
        'RBC': ["HBA1", "HBB"],
    },
    'Mono': _copy_panel(_MYELOID_PANEL),
    'DC': _copy_panel(_MYELOID_PANEL),
    'T': _copy_panel(_T_NK_PANEL),
    'T_Naive': _copy_panel(_T_NK_PANEL),
    'T_NonNaive': _copy_panel(_T_NK_PANEL),
    'T_CD4_NonNaive': _copy_panel(_T_NK_PANEL),
    'T_CD8_NonNaive': _copy_panel(_T_NK_PANEL),
    # Same panel as the T-cell keys, minus the CD4 / CD8 rows.
    'NK': _copy_panel(_T_NK_PANEL, skip=("CD4", "CD8")),
    'pDC': _copy_panel(_PDC_HSC_PANEL, skip=("HSCs",)),
    'HSC': _copy_panel(_PDC_HSC_PANEL, skip=("pDC",)),
}

# Safety net: a gene symbol must not be listed twice within one marker list.
assert all(
    len(set(genes)) == len(genes)
    for panel in marker_genes_dict.values()
    for genes in panel.values()
), "duplicate gene symbol within a marker list"

Exporting the dictionary as a pickle

In [3]:
# Serialise the finished marker dictionary into the reference-data folder.
# `here` (pyprojroot) turns the relative location into a path under the
# project root, so the cell does not depend on the current working directory.
pickle_path = here('external_reference_data/markerGenes_dictionary.pkl')
with open(pickle_path, 'wb') as handle:
    pickle.dump(marker_genes_dict, handle)