In [1]:
import requests
import pymongo
from bs4 import BeautifulSoup

In [2]:
class Cancer_Spider:
    def __init__(self):
        self.baseUrl="https://www.malacards.org/card/"
        self.cancers = []
        self.header={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"}
        self.conn=pymongo.MongoClient()
        self.db=self.conn['medical']
        self.col=self.db['data']
        
    def get_name(self):
      response = requests.get("https://www.malacards.org/categories/cancer_disease_list",headers=self.header)
      soup = BeautifulSoup(response.text, 'html.parser')
      for row in soup.find("table", class_="search-results").find_all("tr")[1:]:
        name = row.find_all("td")[3].get_text().replace(",","").lower()
        self.cancers.append(name)

    def get_html(self,cancer):
        response = requests.get(self.baseUrl+cancer.replace(" ","_"),headers=self.header)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
        
    def Spider_main(self):
        self.get_name()
        for cancer in self.cancers:
            soup=self.get_html(cancer)
            cancer_data={"Name":cancer}
            cancer_data["Aliases"]=self.Aliases_spider(soup)
            cancer_data["Summary"]=self.Summary_spider(soup)
            cancer_data["Related_diseases"]=self.Related_diseases_spider(soup)
            cancer_data["Symptoms"]=self.Symptoms_spider(soup)
            cancer_data["Drugs"]=self.Drugs_spider(soup)
            cancer_data["Genes"]=self.Gene_spider(soup)
            self.col.insert_one(cancer_data)
    
    def Aliases_spider(self,soup):
        tds = soup.find("td",attrs={"width":"390"})
        if(tds):
          Aliases = tds.find_all("div")
          Aliases_list=[]
          for div in Aliases:
              Aliases_list.append([text for text in div.stripped_strings][0])
          return Aliases_list
        return []
        
    def Summary_spider(self,soup):
        divs = soup.find("div",{"id":"Summary"})
        if(divs):
          for sup in divs.find_all("sup"):
              #sup.clear()
              sup.decompose()
          Summary=" ".join([text.replace("\n","") for text in soup.find("div",{"id":"Summary"}).stripped_strings])
          return Summary
        return ""
    
    def Related_diseases_spider(self,soup):
        Related_diseases=[]
        table = soup.find("table",{"id":"RelatedDiseases-table"})
        if(table):
          for tr in table.find("tbody").find_all("tr")[:10]:
              Related_diseases.append(tr.find_all("td")[1].get_text())
        return Related_diseases
    
    def Symptoms_spider(self,soup):
        Symptoms_list=[]
        for span in soup.find_all("span",itemprop="signOrSymptom"):
            try:
              st = span.span.span.get_text()
              if st[len(st) - 1] == ',':
                  st = st[:len(st)-1]
              Symptoms_list.append(st)
            except:
              continue
        return Symptoms_list

    def Drugs_spider(self,soup):
        Drugs_list=[]
        table = soup.find("table", id='MaladiesUnifiedCompounds-table')
        if(table):
          for tr in table.find("tbody").find_all("tr",{"class":"unified_drugs hidden_minicard"}):
              tr.decompose()
          for tr in table.find("tbody").find_all("tr")[:5]:
              Drugs_list.append(tr.find_all("td")[2].get_text())
        return Drugs_list
        
    def Gene_spider(self,soup):
        Genes_list=[]
        count = 0
        table = soup.find("table",{"id":"RelatedGenes-table"})
        if(table):
          for div in table.find("tbody").find_all("div", {"class":"no_wrap"}):
              if count == 10: break
              if div.find('a').get_text() != "\n\n": 
                  count += 1
              Genes_list.append(div.find('a').get_text())
        return Genes_list


handler=Cancer_Spider()
handler.Spider_main()
print("done")

AttributeError: 'Cancer_Spider' object has no attribute 'header'

In [6]:
import os
import json
from py2neo import Graph, Node, Relationship

In [11]:
class MedicalGraph:
    def __init__(self):
        cur_dir='/'.join(os.path.abspath('__file__').split('//')[:-1])
        self.data_path=os.path.join(cur_dir,'../medical_data.json')
        self.g=Graph(
            host='127.0.0.1',
            http_port=7474,
            user='neo4j',
            password='Hao9707171717')
    
    def read_nodes(self):
        self.cancers = []
        self.cancer_infos = []
        self.drugs =set()
        self.symptoms =set()
        self.genes =set()
        self.diseases=set()

        self.rels_drug = []
        self.rels_gene = []
        self.rels_symptom = []
        self.rels_disease=[]

        count = 0
        file = open(self.data_path, encoding='utf-8')
        data = file.readline()
        while data:
            cancer_dict = {}
            #count += 1
            #print(count)
            
            data_json = json.loads(data)
            cancer = data_json['Name']
            self.cancers.append(cancer)
            cancer_dict['name'] = cancer

            if data_json["Symptoms"]:
                for symptom in data_json["Symptoms"]:
                    self.symptoms.add(symptom)
                    self.rels_symptom.append([cancer, symptom])

            if data_json["Aliases"]:
                cancer_dict['aliases'] = data_json['Aliases']

            if data_json["Summary"]:
                cancer_dict['summary'] = data_json['Summary']

            if data_json["Drugs"]:
                for drug in data_json["Drugs"]:
                    self.drugs.add(drug)
                    self.rels_drug.append([cancer,drug])
                    
            if data_json["Genes"]:
                for gene in data_json["Genes"]:
                    self.genes.add(gene)
                    self.rels_gene.append([cancer,gene])
                    
            if data_json["Related_diseases"]:
                for disease in data_json["Related_diseases"]:
                    self.diseases.add(disease)
                    self.rels_disease.append([cancer,disease])
            
            self.cancer_infos.append(cancer_dict)
            data = file.readline()
            
        return self.cancers,self.cancer_infos,self.drugs,self.symptoms,self.genes,self.diseases,self.rels_drug,self.rels_gene,self.rels_symptom,self.rels_disease
    
    def create_other_node(self, label, nodeList):
      count = 0
      length = len(nodeList)
      for nodeName in nodeList:
          node = Node(label, name=nodeName)
          self.g.create(node)
          count += 1
          print(count, length)
      return
    
    def create_cancer_node(self, cancerList):
      count = 0
      length = len(cancerList)
      for dic in cancerList:
          if 'aliases' in dic and 'summary' in dic:
            node = Node("Cancer", name=dic['name'], aliases=dic['aliases'], summary=dic['summary'])
          elif 'aliases' in dic:
            node = Node("Cancer", name=dic['name'], aliases=dic['aliases'])
          elif 'summary' in dic:
            node = Node("Cancer", name=dic['name'], summary=dic['summary'])
          else:
            node = Node("Cancer", name=dic['name'])
          self.g.create(node)
          count += 1
          print(count, length)
      return

    def create_relation(self, startLabel, endLabel, edges, rel_type, rel_name):
      count = 0
      set_edges = []
      for edge in edges:
          set_edges.append('->'.join(edge))
      set_edges = set(set_edges)
      length = len(set_edges)
      for edge in set_edges:
          edge = edge.split('->')
          pName = edge[0]
          qName = edge[1]
          query = "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % (
              startLabel, endLabel, pName, qName, rel_type, rel_name)
          try:
              self.g.run(query)
              count += 1
              print(rel_type, count, length)
          except Exception as e:
              print(e)
      return

    def create_node_main(self):
      self.create_cancer_node(self.cancer_infos)
      self.create_other_node('Drug', self.drugs)
      print(len(self.drugs))
      self.create_other_node('Symptom', self.symptoms)
      print(len(self.symptoms))
      self.create_other_node('Gene', self.genes)
      print(len(self.genes))
      self.create_other_node('Related_Disease', self.diseases)
      print(len(self.diseases))
      return
    
    def create_relation_main(self):
      self.create_relation('Cancer', 'Drug', self.rels_drug, 'related_drug', 'drug_treatment')
      self.create_relation('Cancer', 'Gene', self.rels_gene, 'related_gene', 'related_gene')
      self.create_relation('Cancer', 'Symptom', self.rels_symptom, 'has_symptom', 'related_symptom')
      self.create_relation('Cancer', 'Related_Disease', self.rels_disease, 'complication', 'related_disease')
    
    def export_data(self):
        f_drug = open('./drug.txt', 'w+')
        f_cancer = open('./cancer.txt', 'w+')
        f_symptom = open('./symptom.txt', 'w+')
        f_disease = open('./disease.txt', 'w+')
        f_gene = open('./gene.txt', 'w+')
        Cancers,Cancer_infos,Drugs,Symptoms,Genes,Diseases,rels_drug,rels_gene,rels_symptom,rels_disease=self.read_nodes()
        

        f_drug.write('\n'.join(list(Drugs)))
        f_cancer.write('\n'.join(list(Cancers)))
        f_symptom.write('\n'.join(list(Symptoms)))
        f_disease.write('\n'.join(list(Diseases)))
        f_gene.write('\n'.join(list(Genes)))

        f_drug.close()
        f_cancer.close()
        f_symptom.close()
        f_disease.close()
        f_gene.close()

        return
    

handler = MedicalGraph()
#handler.read_nodes()
#handler.create_node_main()
#handler.create_relation_main()
handler.export_data()

In [1]:
import ahocorasick

In [2]:
class QuestionClassifier:
    def __init__(self):
        
        cur_dir = "/Users/runyuhao/Desktop/Big Data Analysis/"
        self.disease_path = os.path.join(cur_dir, './disease.txt')
        self.drug_path = os.path.join(cur_dir, './drug.txt')
        self.cancer_path = os.path.join(cur_dir, './cancer.txt')
        self.symptom_path = os.path.join(cur_dir, './symptom.txt')
        self.gene_path = os.path.join(cur_dir, './gene.txt')
        self.deny_path = os.path.join(cur_dir, './deny.txt')

        self.disease_words = [i.strip() for i in open(self.disease_path) if i.strip()]
        self.drug_words = [i.strip() for i in open(self.drug_path) if i.strip()]
        self.cancer_words = [i.strip() for i in open(self.cancer_path) if i.strip()]
        self.gene_words = [i.strip() for i in open(self.gene_path) if i.strip()]
        self.symptom_words = [i.strip() for i in open(self.symptom_path) if i.strip()]
        self.region_words = set(self.drug_words + self.disease_words + self.cancer_words +self.gene_words + self.symptom_words)
        self.deny_words = [i.strip() for i in open(self.deny_path,encoding='utf-8') if i.strip()]
        
        self.region_tree = self.build_actree(list(self.region_words))

        self.wdtype_dict = self.build_wdtype_dict()

        self.symptom_qwds = ['symptom', 'manifestation', 'indication', 'indicator', 'sign', 'mark', 'prodrome', 'trait', 'signal']
        self.acompany_qwds = ['complication', 'complicating disease', 'related disease']
        self.drug_qwds = ['drug', 'medicine', 'medication', 'medicament', 'remedy', 'cure', 'antidote', 'panacea']
        self.gene_qwds = ['gene', 'dna', 'chromosome', 'genetic code', 'nucleic acid']
        self.alias_qwds = ['alias', 'aka', 'pseudonym', 'incognito', 'other name']

        print('model init finished ......')

        return

    
    def build_actree(self, wordlist):
        actree = ahocorasick.Automaton()
        for index, word in enumerate(wordlist):
            actree.add_word(word, (index, word))
        actree.make_automaton()
        return actree
    
    def build_wdtype_dict(self):
        wd_dict = dict()
        for wd in self.region_words:
            wd_dict[wd] = []
            if wd in self.disease_words:
                wd_dict[wd].append('disease')
            if wd in self.cancer_words:
                wd_dict[wd].append('cancer')
            if wd in self.gene_words:
                wd_dict[wd].append('gene')
            if wd in self.drug_words:
                wd_dict[wd].append('drug')
            if wd in self.symptom_words:
                wd_dict[wd].append('symptom')
            #print(wd_dict)
        return wd_dict

    def check_medical(self, question):
        region_wds = []
        for i in self.region_tree.iter(question):
            wd = i[1][1]
            region_wds.append(wd)
        stop_wds = []
        for wd1 in region_wds:
            for wd2 in region_wds:
                if wd1 in wd2 and wd1 != wd2:
                    stop_wds.append(wd1)
        final_wds = [i for i in region_wds if i not in stop_wds]
        final_dict = {i:self.wdtype_dict.get(i) for i in final_wds}

        return final_dict

    def check_words(self, wds, sentence):
        for wd in wds:
            if wd in sentence.lower():
                return True
        return False

    def classify(self, question):
        data = {}
        medical_dict = self.check_medical(question)
        if not medical_dict:
            return {}
        data['args'] = medical_dict
        types = []
        for type_ in medical_dict.values():
            types += type_
        types = set(types)
        question_type = 'others'

        question_types = []

        if self.check_words(self.symptom_qwds, question) and 'cancer' in types:
            question_type = 'disease_symptom'
            question_types.append(question_type)

        if self.check_words(self.symptom_qwds, question) and 'symptom' in types:
            question_type = 'symptom_disease'
            question_types.append(question_type)
     
        if self.check_words(self.acompany_qwds, question) and 'cancer' in types:
            question_type = 'disease_acompany'
            question_types.append(question_type)

        if self.check_words(self.drug_qwds, question) and 'cancer' in types:
            question_type = 'disease_drug'
            question_types.append(question_type)
      
        if self.check_words(self.drug_qwds, question) and 'drug' in types:
            question_type = 'drug_disease'
            question_types.append(question_type)
     
        if self.check_words(self.gene_qwds, question) and 'cancer' in types:
            question_type = 'disease_gene'
            question_types.append(question_type)
       
        if self.check_words(self.gene_qwds, question) and 'gene' in types:
            question_type = 'gene_disease'
            question_types.append(question_type)
        
        if self.check_words(self.alias_qwds, question) and 'cancer' in types:
            question_type = 'disease_alias'
            question_types.append(question_type)
        

        if question_types == [] and 'cancer' in types:
            question_types = ['disease_desc']

        if question_types == [] and 'symptom' in types:
            question_types = ['symptom_disease']

        data['question_types'] = question_types

        return data


In [3]:
class QuestionParser:
    def build_entitydict(self,args):
        entity_dict={}
        for arg,types in args.items():
            for type in types:
                if type not in entity_dict:
                    entity_dict[type]=[arg]
                else:
                    entity_dict[type].append(arg)
        return entity_dict
    
    def sql_transfer(self, question_type, entities):
        if not entities:
            return []

        sql = []

        if question_type == 'disease_symptom':
            sql = ["MATCH (m:Cancer)-[r:has_symptom]->(n:Symptom) where m.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'symptom_disease':
            sql = ["MATCH (m:Cancer)-[r:has_symptom]->(n:Symptom) where n.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'disease_acompany':
            sql = ["MATCH (m:Cancer)-[r:complication]->(n:Related_Disease) where m.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'disease_drug':
            sql = ["MATCH (m:Cancer)-[r:related_drug]->(n:Drug) where m.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'drug_disease':
            sql = ["MATCH (m:Cancer)-[r:related_drug]->(n:Drug) where n.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'disease_gene':
            sql = ["MATCH (m:Cancer)-[r:related_gene]->(n:Gene) where m.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'gene_disease':
            sql = ["MATCH (m:Cancer)-[r:related_gene]->(n:Gene) where n.name = '{0}' return m.name, n.name".format(i) for i in entities]

        elif question_type == 'disease_alias':
            sql = ["MATCH (m:Cancer) where m.name = '{0}' return m.name, m.aliases".format(i) for i in entities]

        return sql

    def parser_main(self, res_classify):
        args = res_classify['args']
        entity_dict = self.build_entitydict(args)
        question_types = res_classify['question_types']
        sqls = []
        for question_type in question_types:
            sql_ = {}
            sql_['question_type'] = question_type
            sql = []
            if question_type == 'disease_symptom':
                sql = self.sql_transfer(question_type, entity_dict.get('cancer'))
                
            elif question_type == 'symptom_disease':
                sql = self.sql_transfer(question_type, entity_dict.get('symptom'))
                
            elif question_type == 'disease_acompany':
                sql = self.sql_transfer(question_type, entity_dict.get('disease'))

            elif question_type == 'disease_drug':
                sql = self.sql_transfer(question_type, entity_dict.get('cancer'))

            elif question_type == 'drug_disease':
                sql = self.sql_transfer(question_type, entity_dict.get('drug'))

            elif question_type == 'disease_gene':
                sql = self.sql_transfer(question_type, entity_dict.get('cancer'))

            elif question_type == 'gene_disease':
                sql = self.sql_transfer(question_type, entity_dict.get('gene'))
                
            elif question_type == 'disease_alias':
                sql = self.sql_transfer(question_type, entity_dict.get('cancer'))

            if sql:
                sql_['sql'] = sql

                sqls.append(sql_)

        return sqls


In [6]:
import os
from py2neo import Graph

class AnswerSearcher:
    def __init__(self):
        self.g=Graph(
            host='127.0.0.1',
            http_port=7474,
            user='neo4j',
            password='Hao9707171717')
        self.num_limit = 20

    def search_main(self, sqls):
        final_answers = []
        for sql_ in sqls:
            question_type = sql_['question_type']
            queries = sql_['sql']
            answers = []
            for query in queries:
                ress = self.g.run(query).data()
                answers += ress
            final_answer = self.answer_prettify(question_type, answers)
            if final_answer:
                final_answers.append(final_answer)
        return final_answers

    def answer_prettify(self, question_type, answers):
        final_answer = []
        if not answers:
            return ''
        if question_type == 'disease_symptom':
            desc = [i['n.name'] for i in answers]
            subject = answers[0]['m.name']
            final_answer = '{0} may have such symptoms: {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))

        elif question_type == 'symptom_disease':
            desc = [i['m.name'] for i in answers]
            subject = answers[0]['n.name']
            final_answer = 'Symptoms {0} may have such cancers：{1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))

        elif question_type == 'disease_gene':
            desc = [i['n.name'] for i in answers]
            subject = answers[0]['m.name']
            final_answer = '{0} may have such genes: {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))
            
        elif question_type == 'gene_disease':
            desc = [i['m.name'] for i in answers]
            subject = answers[0]['n.name']
            final_answer = 'Gene {0} may cause {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))
        
        elif question_type == 'disease_drug':
            desc = [i['n.name'] for i in answers]
            subject = answers[0]['m.name']
            final_answer = '{0}\'s drugs include: {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))

        elif question_type == 'drug_disease':
            desc = [i['m.name'] for i in answers]
            subject = answers[0]['n.name']
            final_answer = 'Drugs {0} may be helpful for {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))

        elif question_type == 'disease_acompany':
            desc = [i['n.name'] for i in answers]
            subject = answers[0]['m.name']
            final_answer = '{0}\'s complications include: {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))

        elif question_type == 'disease_alias':
            desc = answers[0]['m.aliases']
            subject = answers[0]['m.name']
            final_answer = '{0}\'s aliases include: {1}'.format(subject, '；'.join(list(set(desc))[:self.num_limit]))


        return final_answer
    
class ChatBotGraph:
    def __init__(self):
        self.classifier = QuestionClassifier()
        self.parser = QuestionParser()
        self.searcher = AnswerSearcher()

    def chat_main(self, sent):
        answer = 'Hello, I am your health keeper. If I can\'t answer your question, please forgive me~'
        res_classify = self.classifier.classify(sent)
        if not res_classify:
            return answer
        res_sql = self.parser.parser_main(res_classify)
        final_answers = self.searcher.search_main(res_sql)
        if not final_answers:
            return answer
        else:
            return '\n'.join(final_answers)

if __name__ == '__main__':
    handler = ChatBotGraph()
    while 1:
        question = input('User input:')
        answer = handler.chat_main(question)
        print(answer)
    

model init finished ......
User input:What are the aliases of breast cancer?
breast cancer's aliases include: Breast Carcinoma；Breast Cancer, Male；Breast Cancer, Early-Onset；Familial Cancer of Breast；Susceptibility to Breast Cancer；Mammary Cancer；Breast Cancer, Male, Susceptibility to；Male Breast Carcinoma；Breast Cancer, Protection Against；Malignant Tumor of the Breast；Familial Breast Carcinoma；Invasive Ductal Breast Carcinoma；Breast Lobular Carcinoma；Breast Cancer, Lobular；Mammary Tumor；Male Breast Cancer；Hereditary Breast Carcinoma；Malignant Neoplasm of Male Breast；Familial Breast Cancer；Carcinoma of Male Breast
User input:What are the gene related to lung cancer?
lung cancer may have such genes: EGFR；PRKN；

；SLC22A18；IRF1；BRAF；ERBB2；KRAS；PIK3CA；ERCC6；PPP2R1B


KeyboardInterrupt: 