## Method (I)

In [30]:
import pandas as pd
import pandas as pdx  # original alias, retained in case any cell still refers to it
from SPARQLWrapper import SPARQLWrapper, JSON

In [33]:
# Ask the public Wikidata SPARQL endpoint for every item that carries a
# wdt:P699 value (bound to ?doid, e.g. "DOID:4184"), together with the item's
# label in each language for which an article about the item exists.
# The extra ?english pattern keeps only items that also have an English label.
# Query pattern adapted from the Wikidata example-queries page:
# https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Request JSON up front so that convert() below yields plain dicts and lists.
sparql.setReturnFormat(JSON)

sparql.setQuery("""
SELECT DISTINCT ?doid ?item ?label ?wpLang
 
WHERE {
   ?item wdt:P699 ?doid ;
         rdfs:label ?label ;
         rdfs:label ?english .
   ?article schema:about ?item ;
            schema:inLanguage ?wpLang .
   FILTER (lang(?label) = ?wpLang)
   FILTER (lang(?english) = "en")
}
""")

# Execute the query over the network and decode the response.
results = sparql.query().convert()

In [34]:
# Flatten the nested SPARQL JSON bindings into a table with dotted column
# names such as "doid.value".
# pd.json_normalize is the supported entry point; the older
# pd.io.json.json_normalize path has been deprecated since pandas 1.0 and is
# no longer available in recent releases.
results_df = pd.json_normalize(results['results']['bindings'])
# Display only the ".value" column of each selected variable.
results_df[['doid.value', 'item.value', 'label.value', 'wpLang.value']]

Unnamed: 0,doid.value,item.value,label.value,wpLang.value
0,DOID:4184,http://www.wikidata.org/entity/Q819207,قصور جارات الدرق الكاذب,ar
1,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohypoparathyreoidismus,de
2,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohypoparathyroidism,en
3,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohipoparatiroidismo,es
4,DOID:4184,http://www.wikidata.org/entity/Q819207,کم‌کاری کاذب غده پاراتیروئید,fa
5,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohipoparatireoidizam,hr
6,DOID:4184,http://www.wikidata.org/entity/Q819207,偽性副甲状腺機能低下症,ja
7,DOID:4184,http://www.wikidata.org/entity/Q819207,거짓부갑상샘저하증,ko
8,DOID:2377,http://www.wikidata.org/entity/Q8277,تصلب متعدد,ar
9,DOID:2377,http://www.wikidata.org/entity/Q8277,Esclerosis múltiple,ast


In [35]:
# Language that every returned item must have a label in:
#   language - used only as the name of the SPARQL variable (?english)
#   abbre    - language code compared against lang(...) in the FILTER
language = 'english'
abbre = 'en'

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# Literal SPARQL braces are doubled ("{{" / "}}") so that str.format() only
# substitutes the named {language} and {abbre} placeholders.
sparql.setQuery("""
SELECT DISTINCT ?doid ?item ?label ?wpLang
WHERE {{
   ?item wdt:P699 ?doid ;
         rdfs:label ?label ;
         rdfs:label ?{language} .
   ?article schema:about ?item ;
            schema:inLanguage ?wpLang .
   FILTER (lang(?label) = ?wpLang)
   FILTER (lang(?{language}) = "{abbre}")
}}
""".format(language=language, abbre=abbre))

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# pd.json_normalize replaces the deprecated pd.io.json.json_normalize path,
# which is no longer available in recent pandas releases.
results_df = pd.json_normalize(results['results']['bindings'])
# Display only the ".value" column of each selected variable.
results_df[['doid.value', 'item.value', 'label.value', 'wpLang.value']]

Unnamed: 0,doid.value,item.value,label.value,wpLang.value
0,DOID:4184,http://www.wikidata.org/entity/Q819207,قصور جارات الدرق الكاذب,ar
1,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohypoparathyreoidismus,de
2,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohypoparathyroidism,en
3,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohipoparatiroidismo,es
4,DOID:4184,http://www.wikidata.org/entity/Q819207,کم‌کاری کاذب غده پاراتیروئید,fa
5,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohipoparatireoidizam,hr
6,DOID:4184,http://www.wikidata.org/entity/Q819207,偽性副甲状腺機能低下症,ja
7,DOID:4184,http://www.wikidata.org/entity/Q819207,거짓부갑상샘저하증,ko
8,DOID:2750,http://www.wikidata.org/entity/Q829150,malaltia d'Andersen,ca
9,DOID:2750,http://www.wikidata.org/entity/Q829150,Morbus Andersen,de


## Method (II)

In [36]:
import pandas as pd
import json
from SPARQLWrapper import SPARQLWrapper, JSON

In [37]:
def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query and return its result set as a pandas DataFrame.

    Parameters
    ----------
    service : str
        Endpoint URL handed to ``SPARQLWrapper``.
    query : str
        SPARQL query text.

    Returns
    -------
    pandas.DataFrame
        One column per variable listed under ``head.vars`` in the JSON
        response and one row per entry in ``results.bindings``. A variable
        that is missing from a binding yields ``None`` in that cell.
    """
    endpoint = SPARQLWrapper(service)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    # Parse the raw response stream directly as JSON.
    payload = json.load(endpoint.query().response)
    variables = payload['head']['vars']

    # Keep the column order given by the response header; a binding may omit
    # a variable, hence the chained .get() with an empty-dict fallback.
    rows = [
        [binding.get(var, {}).get('value') for var in variables]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(rows, columns=variables)

In [38]:
# URL of the Wikidata SPARQL endpoint, passed as `service` to get_sparql_dataframe.
wds = "https://query.wikidata.org/sparql"

In [39]:
# SPARQL query text: for every item with a wdt:P699 value (?doid), return the
# item's label in each language for which an article about the item exists
# (?wpLang). The ?english pattern restricts results to items that also have
# an English label.
rq = """
SELECT DISTINCT ?doid ?item ?label ?wpLang
 
WHERE {
   ?item wdt:P699 ?doid ;
         rdfs:label ?label ;
         rdfs:label ?english .
   ?article schema:about ?item ;
            schema:inLanguage ?wpLang .
   FILTER (lang(?label) = ?wpLang)
   FILTER (lang(?english) = "en")
}
"""

In [40]:
# Run the query against the endpoint and collect the bindings in a DataFrame.
df = get_sparql_dataframe(wds, rq)

In [41]:
# Preview the first rows of the result.
df.head()

Unnamed: 0,doid,item,label,wpLang
0,DOID:4184,http://www.wikidata.org/entity/Q819207,قصور جارات الدرق الكاذب,ar
1,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohypoparathyreoidismus,de
2,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohypoparathyroidism,en
3,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohipoparatiroidismo,es
4,DOID:4184,http://www.wikidata.org/entity/Q819207,کم‌کاری کاذب غده پاراتیروئید,fa


In [42]:
# Per-column summary (count / unique / top / freq) of the result.
df.describe()

Unnamed: 0,doid,item,label,wpLang
count,47109,47109,47109,47109
unique,3588,3564,41320,236
top,DOID:635,http://www.wikidata.org/entity/Q79793,AIDS,en
freq,157,202,38,3245


In [43]:
# Ten most frequent label strings; value_counts() sorts by descending count.
df['label'].value_counts().head(10)

AIDS               38
Malaria            23
Ebola              21
SARS               19
meningitis         18
Hepatitis A        18
Kuru               17
Vitiligo           15
Gastroenteritis    15
Kwashiorkor        14
Name: label, dtype: int64