## Use saprQL to query DBpedia

In [None]:
import rdflib
g=rdflib.Graph()
g.load('http://dbpedia.org/resource/Biomedical')

for s,p,o in g:
    print (s,p,o)

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

prefix = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbpedia: <http://dbpedia.org/resource/>PREFIX dcterms: <http://purl.org/dc/terms/>
    PREFIX dbo: <http://dbpedia.org/ontology/>PREFIX category: <http://dbpedia.org/resource/Category:>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>PREFIX foaf: <http://xmlns.com/foaf/0.1/>PREFIX dbpprop: <http://dbpedia.org/property/>
    PREFIX dbprop: <http://dbpedia.org/property/>PREFIX grs: <http://www.georss.org/georss/>
    PREFIX category: <http://dbpedia.org/resource/Category:>
    PREFIX dcterms: <http://purl.org/dc/terms/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX freebase: <http://rdf.freebase.com/ns/>
    PREFIX db: <http://dbpedia.org/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX http: <http://www.w3.org/2006/http#>"""

In [2]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery(prefix + """    
    SELECT ?z
    WHERE { <http://dbpedia.org/resource/Biomedical_engineering> rdf:type ?z }
""")
#Arye_Rosen
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result)

{'z': {'type': 'uri', 'value': 'http://www.w3.org/2002/07/owl#Thing'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/ontology/Software'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Abstraction100002137'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Cognition100023271'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Content105809192'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Discipline105996646'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/KnowledgeDomain105999266'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/PsychologicalFeature100023100'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/WikicatEngineeringDisciplines'}}


In [3]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery(prefix + """
    SELECT ?z
    WHERE { <http://dbpedia.org/resource/Biomedical_engineering> rdfs:label ?z }
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    if result['z']['xml:lang'] == 'en':
        print(result['z']['value'])

Biomedical engineering


## Search things in DBpedia and Store into mongoDB

### Search things using DBpedia

In [4]:
from pymongo import MongoClient

config = json.load(open("config.json"))

client = MongoClient()
client = MongoClient(config["server"], 27017)

db = client.KG

In [5]:
searchtype = {}
searchtype['abstract'] = 'dbo'
searchtype['birthDate'] = 'dbo'
searchtype['birthPlace'] = 'dbo'
searchtype['almaMater'] = 'dbo'
searchtype['field'] = 'dbo'
searchtype['city'] = 'dbo'
searchtype['state'] = 'dbo'
searchtype['type'] = 'dbo'
searchtype['homepage'] = 'foaf'
searchtype['subject'] = 'dct'
searchtype['label'] = 'rdfs'
searchtype['wikiPageExternalLink'] = 'dbo'
searchtype['isPrimaryTopicOf'] = 'foaf'

In [6]:
#Replace symbol that may crash the query
def clean(word):
    word = word.replace(' ','_')
    word = word.replace('"','//')
    word = word.replace("'","/")
    return word


def clean_output(word):
    
    word = word.split('/')[-1]
    word = word.replace('_',' ')
    return word

In [7]:
#Search the page title and get its pages
def get_type_from_db(title):
    title = clean(title)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(prefix + """    
        SELECT ?z
        WHERE { <http://dbpedia.org/resource/"""+title+"""> rdf:type ?z }
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    value = []
    for result in results["results"]["bindings"]:
        value.append(result['z']['value'].split('/')[-1])
    if 'Person' in value:
        return 'Person'
    elif 'University' in value:
        return 'University'
    else:
        return 'Thing'


In [8]:
query_contents = {}
query_contents['Person'] = ['abstract','type','birthDate','birthPlace','label','field','homepage','almaMater']
query_contents['University'] = ['abstract','type','state','city','label','homepage']
query_contents['Thing'] = ['abstract','label','wikiPageExternalLink','subject','isPrimaryTopicOf']

In [9]:
#Get all fields needed for a person
def construct(title,ctype):
    title = clean(title)
    results = {}
    results['category'] = clean(ctype)
    for content in query_contents[ctype]:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery(prefix + """    
            SELECT ?z
            WHERE { <http://dbpedia.org/resource/"""+title+"> "+searchtype[content]+":"+content+""" ?z }
        """)
        sparql.setReturnFormat(JSON)
        result = sparql.query().convert()
        output = []
        for r in result["results"]["bindings"]:
            if content != 'homepage' and 'Link' not in content:
                r['z']['value'] = clean_output(r['z']['value'])
            try:
                if r['z']['xml:lang'] == 'en':
                    output.append(r['z']['value'])
            except:
                output.append(r['z']['value'])
        results[content] = output
    return results

In [10]:
construct('Cassie_Mitchell','Person')

{'abstract': ['Cassie Mitchell (born 1981) is an American chemist and Paralympic athlete and cyclist.'],
 'almaMater': ['Georgia Institute of Technology',
  'Emory University',
  'Oklahoma State University–Stillwater'],
 'birthDate': ['1981-1-1'],
 'birthPlace': ['Muskogee, Oklahoma'],
 'category': 'Person',
 'field': ['Biomedical engineering', 'Chemical engineering'],
 'homepage': ['http://www.cassie-mitchell.com/paralympics.html'],
 'label': ['Cassie Mitchell'],
 'type': []}

In [11]:
construct('Drexel_University','University')

{'abstract': ["Drexel University is a private research university with three campuses in Philadelphia and one in Sacramento, California. It was founded in 1891 by Anthony J. Drexel, a noted financier and philanthropist. As of 2015, more than 26,000 students are enrolled in over 70 undergraduate programs and more than 100 master's, doctoral, and professional programs at the university. Drexel's cooperative education program (co-op) is a unique aspect of the school's degree programs, offering students the opportunity to gain up to 18 months of paid, full-time work experience in a field relevant to their undergraduate major or graduate degree program prior to graduation."],
 'category': 'University',
 'city': ['Philadelphia'],
 'homepage': ['http://www.drexel.edu'],
 'label': ['Drexel University'],
 'state': ['Pennsylvania'],
 'type': ['Private university', 'Research']}

In [12]:
construct('Biomedical_engineering','Thing')

{'abstract': ['Biomedical engineering (BME) is the application of engineering principles and design concepts to medicine and biology for healthcare purposes (e.g. diagnostic or therapeutic). This field seeks to close the gap between engineering and medicine, combining the design and problem solving skills of engineering with medical and biological sciences to advance health care treatment, including diagnosis, monitoring, and therapy.Biomedical engineering has only recently emerged as its own study, compared to many other engineering fields. Such an evolution is common as a new field transitions from being an interdisciplinary specialization among already-established fields, to being considered a field in itself. Much of the work in biomedical engineering consists of research and development, spanning a broad array of subfields (see below). Prominent biomedical engineering applications include the development of biocompatible prostheses, various diagnostic and therapeutic medical devic

### Add Relationship to MongoDB

In [13]:
relationships = {}
relationships['Person'] = ['birthPlace','field','almaMater']
relationships['University'] = ['city','state','type']
relationships['Thing'] = ['isPrimaryTopicOf']

In [16]:
#check if the noe is already exists in the database
def check_not_exists(result):
    if result['category'] == 'Person':
        return db.nodes.find({'category':result['category'],'label':result['label'],'birthDate':result['birthDate']}).count() == 0
    else:
        return db.nodes.find({'category':result['category'],'label':result['label']}).count() == 0
        
    
#Transfer the result format and save to mongoDB
def save_to_mongo(result):
    if check_not_exists(result):
        db.nodes.insert_one(result)
    else:
        if result['category'] == 'Person':
            db.nodes.update_one({'category':result['category'],'label':result['label'],'birthDate':result['birthDate']},{'$set':result})
        else:
            db.nodes.update_one({'category':result['category'],'label':result['label']},{'$set':result})
    return


def add_relation_to_mongo(result):
    relationship = relationships[result['category']]
    for r in relationship:
        if result['category'] == 'Person':
            s = {'label':result['label'],'category':result['category'],'birthday':result['birthDate']}
            d = {'label':result[r],'category':r}
            if len(d['label']) != 0:
                db.edges.insert_one({'Source':s,'Destination':d,'relationship':r})
        else:
            s = {'label':result['label'],'category':result['category']}
            d = {'label':result[r],'category':r}
            if len(d['label']) != 0:
                db.edges.insert_one({'Source':s,'Destination':d,'relationship':r})
    return 

In [17]:
# Use the title to search and extract
def get_info_from_db(title):
    page_type = get_type_from_db(title)
    result = construct(title, page_type)
    return result
    
    
# Transder pages to specific nodes with labels
def transfer_pages():
    pages = db.pages.find({})
    total = db.pages.find({}).count()
    count = 0
    for page in pages:
        result = get_info_from_db(page['title'])
        save_to_mongo(result)
        add_relation_to_mongo(result)
        if count%100 == 0:
            print(count)
        count += 1

In [20]:
transfer_pages()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [19]:
db.edges.remove({})

  """Entry point for launching an IPython kernel.


{'n': 2541, 'ok': 1.0}