In [1]:
import json
import pandas

In [6]:
# Data structures
class goTerm(object):
    """Record holding the fields of one gene-set term.

    Every constructor argument is stored unchanged on the instance under the
    same name; ``info()`` returns those fields as a dictionary.
    """

    def __init__(self, collection, systematicName, pmid, exactSource,
                 externalDetailsURL, msigdbURL, geneSymbols, filteredBySimilarity, externalNamesForSimilarTerms):
        # Identification / provenance fields.
        (self.collection,
         self.systematicName,
         self.pmid,
         self.exactSource) = (collection, systematicName, pmid, exactSource)
        # Link fields.
        self.externalDetailsURL, self.msigdbURL = externalDetailsURL, msigdbURL
        # Gene members of the term (read by goData.searchGene).
        self.geneSymbols = geneSymbols
        # Similarity bookkeeping fields.
        (self.filteredBySimilarity,
         self.externalNamesForSimilarTerms) = (filteredBySimilarity, externalNamesForSimilarTerms)

    def info(self):
        """Return all stored fields as a dict keyed by field name.

        Keys appear in the same order as the constructor parameters.
        """
        fieldOrder = (
            "collection",
            "systematicName",
            "pmid",
            "exactSource",
            "externalDetailsURL",
            "msigdbURL",
            "geneSymbols",
            "filteredBySimilarity",
            "externalNamesForSimilarTerms",
        )
        return {field: getattr(self, field) for field in fieldOrder}

class goData(object):
    """Collection of term objects keyed by term name, with gene-based search.

    ``self.data`` maps a term name to an object exposing a ``geneSymbols``
    attribute (a sequence of gene symbols).
    """

    def __init__(self):
        self.data = {}

    def addTerm(self, name, GoTerm):
        """Store ``GoTerm`` under ``name``, replacing any existing entry."""
        self.data[name] = GoTerm

    def searchGene(self, geneName, numMin=None, rateMin=1, compareGeneName=True):
        '''
        Find the terms whose gene set overlaps the queried genes.

        input:
            geneName         str/list   gene symbol(s) to look up; a single string
                                        is treated as a one-element list
            numMin           int        =None, minimum number of queried genes that
                                        must be present in a term; None means
                                        "all queried genes"
            rateMin          float      =1, minimum overlap rate, in [0, 1]
            compareGeneName  bool       =True, the rate is hits / len(geneName);
                                        if False it is hits / len(term gene set)
        output:
            list of [termName, hits, rate] (rate rounded to 4 decimals), sorted by
            descending rate and then by term name. A term is reported when
            hits >= numMin OR rate >= rateMin. Repeated symbols in geneName are
            counted once per occurrence. An empty query returns [].
        '''
        # Normalise the query first, so the numMin default below counts genes
        # and not the characters of a bare string.
        if isinstance(geneName, str):
            geneName = [geneName]
        else:
            geneName = list(geneName)

        # Nothing to look up; this also avoids a zero denominator below.
        if not geneName:
            return []

        if numMin is None:
            numMin = len(geneName)

        termList = []
        for name, goTermObject in self.data.items():
            termGenes = goTermObject.geneSymbols
            lookup = set(termGenes)  # constant-time membership tests
            num = sum(1 for gene in geneName if gene in lookup)

            # Only the selected denominator is evaluated, so a term with an
            # empty gene list can no longer raise ZeroDivisionError.
            denominator = len(geneName) if compareGeneName else len(termGenes)
            rate = round(num / denominator, 4) if denominator else 0.0

            if num >= numMin or rate >= rateMin:
                termList.append([name, num, rate])

        termList.sort(key=lambda item: (-item[2], item[0]))
        return termList

    def getDf(self):
        """Return a DataFrame with one row per stored term.

        Each term name is split at its first underscore: the part before it
        goes to column "class", the lower-cased remainder to column "name".
        A term name without an underscore raises IndexError.
        """
        rows = []
        for termName in self.data.keys():
            parts = termName.split('_', 1)
            rows.append([parts[0], parts[1].lower()])
        return pandas.DataFrame(rows, columns=["class", "name"])


In [7]:
# Top-level helper functions
def checkOverlap(data1, data2):
    '''
    Report how much two collections overlap and return their intersection.

    input:
        data1    list    first collection of hashable items
        data2    list    second collection of hashable items
    change:
        prints the size of the intersection and its ratio to each input;
        both inputs are de-duplicated first, so the ratios are relative to
        the number of distinct items. An empty input yields a ratio of 0.0.
    output:
        data     set     intersection of data1 and data2
    '''
    unique1 = set(data1)
    unique2 = set(data2)

    data = unique1 & unique2

    # Guard the ratios: an empty input used to raise ZeroDivisionError here.
    ratio1 = len(data) / len(unique1) if unique1 else 0.0
    ratio2 = len(data) / len(unique2) if unique2 else 0.0

    print("overlap num: {}, overlap/data1: {}, overlap/data2:{}".format(len(data), ratio1, ratio2))

    return data

In [8]:
if __name__ == "__main__":
    # Load the gene-set file: a JSON object mapping each term name to a dict
    # of term fields. The encoding is given explicitly so the read does not
    # depend on the platform's default locale encoding.
    with open("c5.go.v2023.2.Hs.json", 'r', encoding="utf-8") as file:
        txt = json.load(file)

    # Wrap every entry in a goTerm and register it under its term name.
    # A missing field raises KeyError rather than being silently defaulted.
    go = goData()
    for name, dictTerm in txt.items():
        goTermObject = goTerm(collection = dictTerm["collection"],
                              systematicName = dictTerm["systematicName"],
                              pmid = dictTerm["pmid"],
                              exactSource = dictTerm["exactSource"],
                              externalDetailsURL = dictTerm["externalDetailsURL"],
                              msigdbURL = dictTerm["msigdbURL"],
                              geneSymbols = dictTerm["geneSymbols"],
                              filteredBySimilarity = dictTerm["filteredBySimilarity"],
                              externalNamesForSimilarTerms = dictTerm["externalNamesForSimilarTerms"])
        go.addTerm(name=name, GoTerm=goTermObject)

    # Table of (class prefix, lower-cased term name). Its row order follows
    # the key order of the JSON file, and later cells address terms by row.
    df = go.getDf()
    

In [23]:
# List the terms that contain at least 5 of these 14 genes; the reported rate
# is hits / number of queried genes.
# NOTE(review): "CXCR13" does not look like a standard gene symbol
# (possibly meant "CXCL13") - confirm before relying on the counts.
go.searchGene(
    geneName=[
        "CCR5", "ITGB2", "CD6", "CXCR13", "VCAN", "CXCR5", "CCL19",
        "SDC2", "ITGA6", "CCR6", "SDC3", "CXCR3", "CXCR6", "CXCR4",
    ],
    numMin=5,
    compareGeneName=True,
)

[['GOBP_CELL_MOTILITY', 11, 0.7857],
 ['GOCC_CELL_SURFACE', 11, 0.7857],
 ['GOBP_LOCOMOTION', 9, 0.6429],
 ['GOBP_CELL_CHEMOTAXIS', 8, 0.5714],
 ['GOBP_DEFENSE_RESPONSE', 8, 0.5714],
 ['GOBP_INFLAMMATORY_RESPONSE', 8, 0.5714],
 ['GOBP_LEUKOCYTE_MIGRATION', 8, 0.5714],
 ['GOBP_TAXIS', 8, 0.5714],
 ['GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE', 8, 0.5714],
 ['GOCC_SIDE_OF_MEMBRANE', 8, 0.5714],
 ['GOBP_CELL_ADHESION', 7, 0.5],
 ['GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY', 7, 0.5],
 ['GOBP_G_PROTEIN_COUPLED_RECEPTOR_SIGNALING_PATHWAY', 7, 0.5],
 ['GOBP_RESPONSE_TO_CHEMOKINE', 7, 0.5],
 ['GOBP_RESPONSE_TO_CYTOKINE', 7, 0.5],
 ['GOMF_MOLECULAR_TRANSDUCER_ACTIVITY', 7, 0.5],
 ['GOBP_APOPTOTIC_PROCESS', 6, 0.4286],
 ['GOBP_CALCIUM_MEDIATED_SIGNALING', 6, 0.4286],
 ['GOBP_LEUKOCYTE_CHEMOTAXIS', 6, 0.4286],
 ['GOBP_POSITIVE_REGULATION_OF_CYTOSOLIC_CALCIUM_ION_CONCENTRATION',
  6,
  0.4286],
 ['GOBP_REGULATION_OF_CELL_ADHESION', 6, 0.4286],
 ['GOBP_SECOND_MESSENGER_MEDIATED_SIGNALING', 6, 0.4286],
 [

In [None]:
['GOBP_LEUKOCYTE_MIGRATION', 3, 1.0],           5944  !
"lymphocyte_chemotaxis"                         4308
"lymphocyte_aggregation"                        5737
"lymphocyte_migration"                          5944
"lymphocyte_migration_into_lymphoid_organs"     6158
"regulation_of_lymphocyte_migration"            7480
"negative_regulation_of_lymphocyte_migration"   7481
"positive_regulation_of_lymphocyte_migration"   7482

['GOBP_CELL_CHEMOTAXIS', 8, 0.5714285714285714]         5060 
['GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY', 7, 0.5]    2000
['GOBP_RESPONSE_TO_CHEMOKINE', 7, 0.5]                  7375  !
['GOBP_RESPONSE_TO_CYTOKINE', 7, 0.5]                   2914
"chemokine_production"                                  2664
"response_to_chemokine"                                 7375
"chemokine_activity"                                    9095

"b_cell_proliferation_involved_in_immune_response"      318
"regulation_of_b_cell_proliferation"                    2363
"positive_regulation_of_b_cell_proliferation"           2365
"b_cell_proliferation"                                  3401

In [18]:
# Show every field of one term, selected by its row label in `df`.
# The row label is only meaningful for the `df` built from the loaded file.
index = 3401

# Rebuild the dictionary key from the two columns produced by goData.getDf().
term = "{}_{}".format(df.at[index, "class"], df.at[index, "name"].upper())
termInfo = go.data[term].info()
for k, v in termInfo.items():
    print(k, '\t', v)

collection 	 C5:GO:BP
systematicName 	 M11204
pmid 	 
exactSource 	 GO:0042100
externalDetailsURL 	 http://amigo.geneontology.org/amigo/term/GO:0042100
msigdbURL 	 https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/GOBP_B_CELL_PROLIFERATION
geneSymbols 	 ['ABL1', 'ADA', 'AHR', 'ATAD5', 'ATM', 'BAX', 'BCL2', 'BCL6', 'BMI1', 'BST1', 'BTK', 'CARD11', 'CASP3', 'CD180', 'CD19', 'CD22', 'CD300A', 'CD320', 'CD38', 'CD40', 'CD40LG', 'CD70', 'CD74', 'CD79A', 'CD81', 'CDKN1A', 'CHRNB2', 'CLCF1', 'CR2', 'CTLA4', 'CTPS1', 'EPHB2', 'FCGR2B', 'FCRL3', 'FOSL2', 'GAPT', 'GPR183', 'HSPD1', 'IFNA1', 'IFNA10', 'IFNA13', 'IFNA14', 'IFNA16', 'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'IFNB1', 'IFNE', 'IFNK', 'IFNW1', 'IKZF3', 'IL10', 'IL13', 'IL2', 'IL21', 'IL4', 'IL5', 'IL7', 'IL7R', 'IL9', 'INPP5D', 'IRS2', 'LEF1', 'LYN', 'MEF2C', 'MIF', 'MIR185', 'MNDA', 'MS4A1', 'MZB1', 'NCKAP1L', 'NFATC2', 'NFKBIZ', 'PAWR', 'PELI1', 'PKN1', 'PLCL2', 'PRKCD', 'PRLR', 'PTPRC', 'RAG2', '

In [20]:
# Hand-written gene panels, kept as plain lists of symbols.
# Some symbols occur more than once within a list (e.g. 'TNF' in
# Pro_inflammatory_genes, 'CXCL13' in Lymphatic_inducing_genes);
# goData.searchGene counts every occurrence separately.
# NOTE(review): several symbols look like mouse nomenclature (e.g. 'H2-AA',
# 'KLRA1', 'CCL21A') while the loaded gene-set file name ends in ".Hs" -
# presumably human; such symbols may never match. TODO confirm.
Pro_inflammatory_genes = [
    'CSF2', 'FCER1G', 'GZMB', 'PTPN6', 'ICAM1', 'IFNG', 'IFNGR1', 'ITGB2L', 'KLRA1', 'KLRA3',
    'KLRA4', 'KLRA7', 'KLRA8', 'KLRA9', 'KLRC1', 'KLRC2', 'KLRD1', 'LAT', 'KLRB1C', 'NCR1',
    'PRF1', 'SH2D1A', 'SYK', 'TNF', 'TYROBP', 'FCGR4', 'KLRK1', 'PIK3R6', 'CCR6', 'CXCR2',
    'CXCR4', 'CCR1', 'CCR9', 'CCR3', 'CCR2', 'CCR5', 'CCR7', 'CCR10', 'BCAR1', 'CX3CR1',
    'FGR', 'GNB4', 'GNGT2', 'HCK', 'XCL1', 'LYN', 'CXCL9', 'NCF1', 'NFKBIA', 'PRKCD',
    'CCL1', 'CCL17', 'CCL2', 'CCL22', 'CCL3', 'CCL4', 'CCL5', 'CCL6', 'CCL9', 'CXCL2',
    'CXCL12', 'STAT1', 'XCR1', 'GRK3', 'PF4', 'PPBP', 'GNG11', 'CXCL16', 'CXCR6', 'ACKR3',
    'CSF1', 'CSF1R', 'CX3CR1', 'CXCL1', 'CXCL10', 'IL10', 'IL18', 'IL18RAP', 'IL18R1', 'IL2RA',
    'IL2RB', 'IL6', 'IL6RA', 'IL6ST', 'XCL1', 'LTA', 'CXCL9', 'TNF', 'TNFRSF1B', 'XCR1',
    'PF4', 'PPBP',
]
Antigen_presenting_genes = [
    'CIITA', 'CALR', 'CD4', 'CD8A', 'CD8B1', 'CTSB', 'CTSL', 'CTSS', 'H2-AA', 'H2-AB1',
    'H2-EB1', 'H2-DMA', 'H2-DMB1', 'H2-DMB2', 'H2-OA', 'H2-OB', 'HSPA8', 'HSPA1B', 'HSP90AB1', 'HSP90AA1',
    'IFNG', 'CD74', 'KLRC1', 'KLRD1', 'LGMN', 'HSPA1A', 'TNF',
]
Lymphatic_inducing_genes = [
    'GLYCAM1', 'FUT7', 'GCNT1', 'CHST4', 'B3GNT3', 'CCL21A', 'CCL2', 'CCL3', 'CCL4', 'CCL5',
    'CCL8', 'CCL18', 'CCL19', 'CCL21', 'CXCL9', 'CXCL10', 'CXCL11', 'CXCL13', 'CCR7', 'CXCR5',
    'SELL', 'LAMP3', 'CXCL13', 'CD200', 'FBLN7', 'ICOS', 'SGPP2', 'SH2D1A', 'TIGIT', 'PDCD1',
    'CCR5', 'CXCR3', 'CSF2', 'IGSF6', 'IL2RA', 'CD38', 'CD40', 'CD5', 'MS4A1', 'CCR5',
    'CXCR3',
]
Chemokine = [
    'PIK3R6', 'CCR6', 'CXCR2', 'CXCR4', 'CCR1', 'CCR9', 'CCR3', 'CCR2', 'CCR5', 'CCR7',
    'CCR10', 'BCAR1', 'CX3CR1', 'FGR', 'GNB4', 'GNGT2', 'CXCL1', 'HCK', 'CXCL10', 'XCL1',
    'LYN', 'CXCL9', 'NCF1', 'NFKBIA', 'PRKCD', 'CCL1', 'CCL17', 'CCL2', 'CCL22', 'CCL3',
    'CCL4', 'CCL5', 'CCL6', 'CCL9', 'CXCL2', 'CXCL12', 'STAT1', 'XCR1', 'GRK3', 'PF4',
    'PPBP', 'GNG11', 'CXCL16', 'CXCR6',
]
Cytokine = [
    'CCR6', 'CXCR2', 'CXCR4', 'CCR1', 'CCR9', 'CCR3', 'CCR2', 'CCR5', 'CCR7', 'CCR10',
    'ACKR3', 'CSF1', 'CSF1R', 'CX3CR1', 'CXCL1', 'CXCL10', 'IL10', 'IL18', 'IL18RAP', 'IL18R1',
    'IL2RA', 'IL2RB', 'IL6', 'IL6RA', 'IL6ST', 'XCL1', 'LTA', 'CXCL9', 'CCL1', 'CCL17',
    'CCL2', 'CCL22', 'CCL3', 'CCL4', 'CCL5', 'CCL6', 'CCL9', 'CXCL2', 'CXCL12', 'TNF',
    'TNFRSF1B', 'XCR1', 'PF4', 'PPBP',
]

In [36]:
# List the terms that contain at least 10 genes of the Cytokine panel; the
# reported rate is hits / number of queried genes.
# The keyword was misspelled "ompareGeneName", which raises TypeError
# (unexpected keyword argument) when the cell is run.
go.searchGene(Cytokine,
              numMin=10,
              compareGeneName=True)

[['GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY', 39, 0.8864],
 ['GOBP_RESPONSE_TO_CYTOKINE', 39, 0.8864],
 ['GOBP_DEFENSE_RESPONSE', 38, 0.8636],
 ['GOBP_INFLAMMATORY_RESPONSE', 36, 0.8182],
 ['GOBP_CELL_MOTILITY', 33, 0.75],
 ['GOBP_LOCOMOTION', 33, 0.75],
 ['GOBP_CELL_CHEMOTAXIS', 32, 0.7273],
 ['GOBP_TAXIS', 32, 0.7273],
 ['GOBP_BIOLOGICAL_PROCESS_INVOLVED_IN_INTERSPECIES_INTERACTION_BETWEEN_ORGANISMS',
  28,
  0.6364],
 ['GOBP_LEUKOCYTE_MIGRATION', 28, 0.6364],
 ['GOBP_RESPONSE_TO_CHEMOKINE', 28, 0.6364],
 ['GOBP_LEUKOCYTE_CHEMOTAXIS', 27, 0.6136],
 ['GOBP_REGULATION_OF_IMMUNE_SYSTEM_PROCESS', 27, 0.6136],
 ['GOBP_G_PROTEIN_COUPLED_RECEPTOR_SIGNALING_PATHWAY', 26, 0.5909],
 ['GOBP_POSITIVE_REGULATION_OF_IMMUNE_SYSTEM_PROCESS', 26, 0.5909],
 ['GOBP_REGULATION_OF_RESPONSE_TO_EXTERNAL_STIMULUS', 26, 0.5909],
 ['GOBP_POSITIVE_REGULATION_OF_MULTICELLULAR_ORGANISMAL_PROCESS', 25, 0.5682],
 ['GOMF_CYTOKINE_RECEPTOR_BINDING', 24, 0.5455],
 ['GOMF_SIGNALING_RECEPTOR_BINDING', 24, 0.5455],
 ['G

---

start

In [45]:
# Gene sets to export: three hand-written reference panels first, then the
# gene members of selected terms, addressed by their row label in `df`.
# NOTE(review): "CXCR13" in refChemokines does not look like a standard gene
# symbol (possibly meant "CXCL13") - confirm.
result = {
    "refLymphocyteRecruitment": ['CCR7', 'CXCR5', 'SELL'],
    "refChemokines": [
        "CCR5", "ITGB2", 'CD6', 'CXCR13', 'VCAN', 'CXCR5', 'CCL19',
        'SDC2', 'ITGA6', 'CCR6', 'SDC3', 'CXCR3', 'CXCR6', 'CXCR4',
    ],
    "refGCBProliferation": [
        'MAP2K1', 'GNB1', 'EGFR', 'GNG12', 'EPHA2', 'GNB4', 'RAPGEF5', 'TIAM1', 'IGF1R',
    ],
}
# Row labels of `df`; they depend on the order of terms in the loaded file.
indexList = [
    4308, 5737, 5944, 6158, 7480, 7481, 7482,
    5060, 2000, 2914, 2664, 7375, 9095,
    318, 2363, 2365, 3401,
]
for i in indexList:
    # Rebuild the full term key (class prefix + upper-cased name) for the lookup.
    name = df.at[i, "class"] + "_" + df.at[i, "name"].upper()
    geneSet = go.data[name].geneSymbols
    # Store under the lower-cased name without the class prefix.
    name = name.partition('_')[2].lower()
    result[name] = geneSet

lymphocyte_chemotaxis
lymphocyte_aggregation
lymphocyte_migration
lymphocyte_migration_into_lymphoid_organs
regulation_of_lymphocyte_migration
negative_regulation_of_lymphocyte_migration
positive_regulation_of_lymphocyte_migration
cell_chemotaxis
cytokine_mediated_signaling_pathway
response_to_cytokine
chemokine_production
response_to_chemokine
chemokine_activity
b_cell_proliferation_involved_in_immune_response
regulation_of_b_cell_proliferation
positive_regulation_of_b_cell_proliferation
b_cell_proliferation


In [52]:
# Scratch arithmetic; not used by any other cell shown here.
30 * 15 / 60

7.5

In [48]:
# Write the collected gene sets (name -> list of gene symbols) to disk as JSON.
with open("0004_pathwayGeneSet.json", 'w') as file:
    file.write(json.dumps(result))

end

---