In [2]:
from tqdm import tqdm, tqdm_notebook

In [4]:
import pandas as pd
import numpy as np
import json
from wikidataintegrator import wdi_core
import matplotlib.pyplot as plt

query = """
SELECT * WHERE {
{SELECT ?item (COUNT(DISTINCT ?wdLang) as ?WDTranslations) (GROUP_CONCAT(DISTINCT ?wdLang) as ?wdlanguages ) WHERE {
   ?item wdt:P699 ?doid ;
         rdfs:label ?label .
   BIND (lang(?label) AS ?wdLang)
}
GROUP BY ?item ?itemLabel
}

{SELECT ?item ?itemLabel (COUNT(DISTINCT ?wpLang) as ?WPTranslations) (GROUP_CONCAT(DISTINCT ?wpLang) as ?wplanguages ) WHERE {
   ?item wdt:P699 ?doid .
   ?article schema:about ?item ;
            schema:inLanguage ?wpLang .
   # BIND (lang(?label) AS ?lang)
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?item ?itemLabel

}
#FILTER (?WDTranslations != ?WPTranslations)
BIND ((?WDTranslations - ?WPTranslations) as ?difference )
}
ORDER BY DESC(?WDTranslations)
"""

df = wdi_core.WDItemEngine.execute_sparql_query(query, as_dataframe=True)

In [5]:
df.head()

Unnamed: 0,WDTranslations,WPTranslations,difference,item,itemLabel,wdlanguages,wplanguages
0,172,158,14,http://www.wikidata.org/entity/Q12199,AIDS,su sv sw ta te tg th tk tl tr ts tt tum uk ur ...,af gsw am an ar arz as ast az ba sgs be-tarask...
1,158,138,20,http://www.wikidata.org/entity/Q12136,disease,es ru de fr pt it ca fi hu ka lb sv en nb nl g...,ht hu hy ia id ilo io is it ja jam jv ka kk kn...
2,157,144,13,http://www.wikidata.org/entity/Q12078,cancer,rw de id af am an ar arc arz as ast az azb ba ...,af gsw am an ar arc arz as ast az azb ba sgs b...
3,149,146,3,http://www.wikidata.org/entity/Q12156,malaria,da ig af ak an ar arz as ast az ba be be-taras...,af ak gsw an ar arz as ast az ba sgs be-tarask...
4,147,140,7,http://www.wikidata.org/entity/Q12204,tuberculosis,ar sh zh-hant fa uz szl vi pl af am an arz as ...,nn nb nv oc or pa pl pnb ps pt qu ro ru rue sa...


In [6]:
df["WDTranslations"].astype(str).astype(int).mean()

16.759731167740128

In [7]:
df["WPTranslations"].astype(str).astype(int).mean()

13.248669840380845

In [8]:
# top 10 disease terms with translations available through Wikidata
df[["itemLabel","WDTranslations", "WPTranslations", "difference"]].head(10)

Unnamed: 0,itemLabel,WDTranslations,WPTranslations,difference
0,AIDS,172,158,14
1,disease,158,138,20
2,cancer,157,144,13
3,malaria,149,146,3
4,tuberculosis,147,140,7
5,diabetes mellitus,138,133,5
6,Ebola virus disease,132,124,8
7,influenza,131,125,6
8,cholera,128,121,7
9,Alzheimer's disease,124,107,17


In [9]:
query = """
SELECT DISTINCT ?doid ?item ?label ?wpLang
 
WHERE {
   ?item wdt:P699 ?doid ;
         rdfs:label ?label ;
         rdfs:label ?english .
         
   ?article schema:about ?item ;
            schema:inLanguage ?wpLang .
   FILTER (lang(?label) = ?wpLang)
   FILTER (lang(?english) = "en")
}
"""
df = wdi_core.WDItemEngine.execute_sparql_query(query, as_dataframe=True)

In [10]:
df.head()

Unnamed: 0,doid,item,label,wpLang
0,DOID:4184,http://www.wikidata.org/entity/Q819207,قصور جارات الدرق الكاذب,ar
1,DOID:4184,http://www.wikidata.org/entity/Q819207,Pseudohypoparathyreoidismus,de
2,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohypoparathyroidism,en
3,DOID:4184,http://www.wikidata.org/entity/Q819207,pseudohipoparatiroidismo,es
4,DOID:4184,http://www.wikidata.org/entity/Q819207,کم‌کاری کاذب غده پاراتیروئید,fa


In [11]:
df2 = pd.DataFrame()
gb = df.groupby("doid")
for n in gb:
  s = n[1][['label', 'wpLang']].set_index("wpLang").T
  s = s.groupby(s.columns, axis=1).agg(lambda x: '|'.join(x.values))
  s['ID'] = n[0]
  df2 = df2.append(s)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [12]:
df2.head()

Unnamed: 0,ID,ace,af,ak,am,an,ang,ar,arc,arz,...,wuu,xh,xmf,yi,yo,yue,za,zea,zh,zu
label,DOID:0001816,,,,,,,ساركومة وعائية,,,...,,,,,,,,,,
label,DOID:0002116,,,,,,,الظفرة,,,...,,,,,,,,,翼状胬肉,
label,DOID:0014667,,,,,,,اضطراب استقلابي,,,...,,,,,,,,,代謝疾病,
label,DOID:0040002,,,,,,,,,,...,,,,,,,,,,
label,DOID:0050011,,,,,,,,,,...,,,,,,,,,,


In [13]:
df2 = df2.set_index("ID")

In [14]:
df2.loc['DOID:9351']

ace                        NaN
af                Suikersiekte
ak                         NaN
am                     ስኳር በሽታ
an           Diabetis mellitus
ang                        NaN
ar                      السكري
arc                        NaN
arz                  مرض السكر
as                বহুমুত্ৰ ৰোগ
ast                   Diabetes
av                         NaN
ay                         NaN
az              Şəkərli diabet
azb                      دیابت
ba               Шәкәр диабеты
bar                        NaN
bcl                        NaN
be              цукровы дыябет
be-tarask       цукровы дыябэт
bg              Захарен диабет
bho                        NaN
bjn                        NaN
bm                         NaN
bn                বহুমূত্র রোগ
bo                         NaN
bpy                 ডায়াবেটিস
br                      Diabet
bs           Diabetes mellitus
bxr            Саахарай диабет
                   ...        
tpi                        NaN
tr      

In [None]:
# df2.to_csv("sdfeg.csv")