<a href="https://colab.research.google.com/github/PietrH/common_wikidata_props/blob/main/get_wikidata_botanical_collector_properties.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sparqlwrapper
!pip install wikidata
# https://rdflib.github.io/sparqlwrapper/

Collecting sparqlwrapper
  Downloading SPARQLWrapper-1.8.5-py3-none-any.whl (26 kB)
Collecting rdflib>=4.0
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 9.9 MB/s 
[?25hCollecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 606 kB/s 
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.1 rdflib-6.1.1 sparqlwrapper-1.8.5
Collecting wikidata
  Downloading Wikidata-0.7.0-py3-none-any.whl (29 kB)
Installing collected packages: wikidata
Successfully installed wikidata-0.7.0


In [2]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Selects (up to 100000) distinct items that have an occupation (P106)
# statement whose value is wd:Q2083925 or can be reached from it through a
# wdt:P279* path; presumably Q2083925 is the "botanical collector" item.
# The outer SELECT adds ?itemLabel through the wikibase:label service.
# NOTE(review): the inner SELECT also projects ?property / ?propertyLabel,
# which the graph pattern never binds.
# NOTE(review): "[AUTO_LANGUAGE]" is a placeholder; confirm what ?itemLabel
# resolves to when the query is sent through the API rather than the web UI.
query = """SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item ?property ?propertyLabel WHERE {
      ?item p:P106 ?statement0.
      ?statement0 (ps:P106/(wdt:P279*)) wd:Q2083925.
      
    }
    LIMIT 100000
  }
}"""


def get_results(endpoint_url, query, user_agent=None):
    """Run a SPARQL query against an endpoint and return the converted result.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text to execute.
        user_agent: Optional User-Agent string sent with the request. When
            omitted, a default of the form
            ``get-botcol-props Python/<major>.<minor>`` is built from the
            version of the running interpreter.

    Returns:
        Whatever ``sparql.query().convert()`` returns with the return format
        set to JSON.
    """
    if user_agent is None:
        # TODO adjust user agent; see https://w.wiki/CX6
        user_agent = "get-botcol-props Python/{}.{}".format(
            sys.version_info[0], sys.version_info[1]
        )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


# Run the query once; the code further down iterates over
# results["results"]["bindings"].
results = get_results(endpoint_url, query)

# Debug aid: uncomment to print every raw binding.
#for result in results["results"]["bindings"]:
#    print(result)

In [79]:
from wikidata.client import Client
import re
client = Client()


def _iter_property_ids(node):
    """Yield each string stored under a 'property' key anywhere inside *node*.

    Nested dicts and lists are walked depth-first in their stored order, and
    every occurrence is yielded (no de-duplication), so properties that appear
    in nested parts of a claim, such as references, are reported as well.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "property" and isinstance(value, str):
                yield value
            else:
                yield from _iter_property_ids(value)
    elif isinstance(node, list):
        for child in node:
            yield from _iter_property_ids(child)


# One list of property ids per queried entity.
tally = []

for result in results["results"]["bindings"]:
    # Take the Q-id from the ?item URI (its last path segment) instead of from
    # ?itemLabel, which only holds a Q-id when no label could be resolved.
    q_num = result["item"]["value"].rsplit("/", 1)[-1]
    entity = client.get(q_num, load=False)
    # list all properties: walk the claims structure rather than
    # pattern-matching its string representation
    props = list(_iter_property_ids(entity.attributes['claims']))
    tally.append(props)

In [80]:
from collections import Counter

# Count how often each property id occurs across all per-entity lists.
c = Counter(prop_id for entity_props in tally for prop_id in entity_props)

In [81]:

# n_prop is the number of distinct properties in the tally; pass it to
# most_common() to list every property instead of only the top ones.
n_prop = len({prop_id for entity_props in tally for prop_id in entity_props})

# Full listing (disabled): c.most_common(n_prop)
# Show only the 35 most frequent properties.
c.most_common(35)

[('P813', 21473),
 ('P248', 13040),
 ('P854', 11190),
 ('P106', 5091),
 ('P214', 3002),
 ('P143', 2716),
 ('P735', 2652),
 ('P6264', 2273),
 ('P1810', 2212),
 ('P734', 2112),
 ('P1545', 2092),
 ('P31', 1992),
 ('P569', 1864),
 ('P21', 1813),
 ('P3831', 1790),
 ('P570', 1622),
 ('P27', 1581),
 ('P586', 1421),
 ('P2889', 1389),
 ('P6944', 1294),
 ('P535', 1165),
 ('P304', 1123),
 ('P19', 1079),
 ('P428', 931),
 ('P20', 812),
 ('P4081', 776),
 ('P69', 773),
 ('P108', 726),
 ('P7859', 601),
 ('P3342', 583),
 ('P580', 547),
 ('P244', 545),
 ('P463', 543),
 ('P213', 478),
 ('P551', 421)]

In [82]:
# Spot check: fetch a single property entity (P586) through the client.
prop = client.get('P586')

In [83]:
# Display the label of the property fetched above.
prop.label

m'IPNI author ID'

In [84]:
import pandas as pd

# One row per property with its occurrence count, in most_common() order.
df = pd.DataFrame(
    data=c.most_common(n_prop),
    columns=['Property', 'n'],
)

In [85]:
# Look up the English label of every property; this makes one client.get()
# call per row.
df['Label'] = [
    client.get(prop_id).label.texts['en']
    for prop_id in df['Property']
]

In [86]:
# Load the google.colab.data_table IPython extension before displaying the
# DataFrame (the module lives under google.colab, so this needs Colab).
%load_ext google.colab.data_table

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


Some properties can occur more than once on an entity, especially in claims regarding references. Because I think it's valuable to see which claims are added to references, I included these in the tally.

In [87]:
# Show the property table as this cell's output.
df

Unnamed: 0,Property,n,Label
0,P813,21473,retrieved
1,P248,13040,stated in
2,P854,11190,reference URL
3,P106,5091,occupation
4,P214,3002,VIAF ID
...,...,...,...
610,P1028,1,donated by
611,P2342,1,AGORHA person/institution ID
612,P1007,1,Lattes Platform number
613,P5745,1,Pacific Coast Architecture Database person ID


In [75]:
# Unload the google.colab.data_table extension again.
# NOTE(review): this cell's recorded execution count (75) is lower than that of
# the cells around it, so it was last run out of order; confirm it still
# belongs at this position.
%unload_ext google.colab.data_table

Write the output to a CSV file and download it.


In [88]:
# Save the property table as a CSV file (relative path, so it is created in
# the current working directory).
df.to_csv(path_or_buf="botanical_collectors_properties.csv")

In [90]:
from google.colab import files

# Hand the CSV file to google.colab's files.download helper.
files.download('botanical_collectors_properties.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>