# Abstract

Final project for Electronic Publishing and Digital Storytelling in fullfillment of an LM in Digital Humanities and Digital Knowledge from the University of Bologna.

## Project Aims
Wikidata is one of the largest free and open knowledge databases in the world. 
Launched in 2012, it now contains over 97 million items, over six million of them people.

This project investigates how Wikidata describes art historians and how those descriptions differ across gender.
This project serves as a case study in how our descriptions of history create history.

### Phase 1: Overview
We first wanted to get an wide view of Wikidata's data on art historians.
To do this we first queried art historians grouped by gender.

In [108]:
#insert Denise's initial query that breaks down those with art historian/sub groups into genders
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"


General_Query = """
SELECT ?genderLabel (count(distinct ?human) as ?number)
WHERE
{SERVICE wikibase:label {
     bd:serviceParam wikibase:language "en" .
   }
  ?human wdt:P31 wd:Q5
  ; wdt:P21 ?gender
  ; wdt:P106/wdt:P279* wd:Q1792450 .
}
GROUP BY ?genderLabel
"""

# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(General_Query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()


# manipulate the result
for result in results["results"]["bindings"]:
    gender = result["genderLabel"]["value"]
    total = result["number"]["value"]
    
    print ("On Wikidata there are" + " " + total + " " + gender + " " + "art historians.")

On Wikidata there are 11757 male art historians.
On Wikidata there are 5881 female art historians.
On Wikidata there are 2 non-binary art historians.


Then we wanted to look at the properties used to describe art historians across genders. So we ran a query to count the number of properties used for each

In [13]:
#insert Sarah's query getting property counts
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
my_SPARQL_query = """
SELECT ?genderLabel (count(distinct ?property) as ?number)
WHERE
{SERVICE wikibase:label {
     bd:serviceParam wikibase:language "en" .
   }

  ?human wdt:P31 wd:Q5
  ; wdt:P21 ?gender
  ; ?property ?object
  ; wdt:P106/wdt:P279* wd:Q1792450 .

}

GROUP BY ?genderLabel
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(my_SPARQL_query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()

# manipulate the result
for result in results["results"]["bindings"]:
    print(result["genderLabel"]["value"], result["number"]["value"])
print("🦐")


male 2662
female 1797
non-binary 233
🦐


This query is for NOT distinct, so total number of declarations

In [63]:
#insert Sarah's query getting property counts
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
my_SPARQL_query = """
SELECT ?genderLabel (count(?property) as ?number)
WHERE
{SERVICE wikibase:label {
     bd:serviceParam wikibase:language "en" .
   }

  ?human wdt:P31 wd:Q5
  ; wdt:P21 ?gender
  ; ?property ?object
  ; wdt:P106/wdt:P279* wd:Q1792450 .

}

GROUP BY ?genderLabel
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(my_SPARQL_query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()

# manipulate the result
for result in results["results"]["bindings"]:
    print(result["genderLabel"]["value"], result["number"]["value"])
print("🌮")

male 1204094
female 422519
non-binary 493
🌮


We also wanted to look at basic trends over time.

In [135]:
from SPARQLWrapper import SPARQLWrapper, JSON, GET, POST, CSV
import csv 
import pandas as pd
import requests
import json
import ssl
import numpy as np
import pandas as pd
import string
import networkx as nx
import matplotlib.pyplot as plt

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"


property_over_time= """

SELECT  ?year ?genderLabel(count(?historian) as ?count) 

WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5
                    ;wdt:P21 ?gender
                    ;wdt:P106/wdt:P279* wd:Q1792450
                    ; ?property ?object
                    ;wdt:P569 ?birthdate FILTER(?birthdate > "1900-01-01T00:00:00Z"^^xsd:dateTime)
                   
}
GROUP BY (year(xsd:dateTime(?birthdate)) as ?year) ?genderLabel

Order by ?genderLabel DESC(?year)

"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(property_over_time)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
time_result = sparql_wd.query().convert

with open('periods.csv', mode='w') as my_file:
    my_writer = csv.writer(my_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    my_writer.writerow(["Year","Gender", "Total"])

    #for result in time_result["results"]["bindings"]:
        #my_writer.writerow([result["year"]["value"],result["genderLabel"]["value"], result["count"]["value"].strip()])
        




In [136]:
data = pd.read_csv("periods.csv")
# print the first 5 rows
data.head()

Unnamed: 0,Year,Gender,Total


__NOTE!!!! This code copies everything from the first one and just changes the query. Is there a more efficient way to do this? It seems good to have it all bc you could run whichever query you want whenever you want but if we know she's going to run all the preceeding code before, maybe there's a way to make it more efficient (eg. only import once, reuse variables, etc.?)__

### Phase 2: Types of Properties
Then we wanted to break down those properties into types to see if certain properties/types of properties appear more often for some genders over others.
The first query is for how many art historians of each gender are also linked to a VIAF authority.

******* had huge problems getting a more generic query to work; really want ANY authority, not just viaf. see project notes

In [22]:
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
my_SPARQL_query = """
SELECT ?genderLabel (count(distinct ?human) as ?number)
WHERE
{SERVICE wikibase:label {
     bd:serviceParam wikibase:language "en" .
   }

  ?human wdt:P31 wd:Q5
  ; wdt:P21 ?gender
  ; ?property ?object
  ; wdt:P106/wdt:P279* wd:Q1792450 
  ; wdt:P214 ?viafid .
}


GROUP BY ?genderLabel
LIMIT 10
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(my_SPARQL_query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()

# manipulate the result
for result in results["results"]["bindings"]:
    print(result["genderLabel"]["value"], result["number"]["value"])
print("🧁")


male 10998
female 5197
non-binary 2
🧁


In [5]:
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
my_SPARQL_query = """
# Make a list of the most used authority control properties for people for art historians by gender
SELECT ?propertyLabel ?genderLabel ?count WHERE {
  {
    select distinct?gender ?propertyclaim (COUNT(*) AS ?count) where {
      ?item wdt:P106/wdt:P279* wd:Q1792450  .
      ?item wdt:P31 wd:Q5 .
      ?item wdt:P21 ?gender .
      ?item ?propertyclaim [] .
    } group by ?propertyclaim ?gender
  }
  ?property wikibase:propertyType wikibase:ExternalId .
  ?property wdt:P31 wd:Q19595382 .
  ?property wikibase:claim ?propertyclaim .
  SERVICE wikibase:label {            # ... include the labels
    bd:serviceParam wikibase:language "en" .
  }
} ORDER BY DESC (?count)
#LIMIT 100
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(my_SPARQL_query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()

# manipulate the result
for result in results["results"]["bindings"]:
    print(result["propertyLabel"]["value"], result["genderLabel"]["value"], result["count"]["value"])
print("👻")

VIAF ID male 11548
ISNI male 9640
WorldCat Identities ID male 9520
Library of Congress authority ID male 8596
GND ID male 8474
NUKAT ID male 7830
Nationale Thesaurus voor Auteurs ID male 7190
Bibliothèque nationale de France ID male 5736
VIAF ID female 5432
NKCR AUT ID male 5081
Deutsche Biographie (GND) ID male 4950
PLWABN ID male 4277
ISNI female 4026
WorldCat Identities ID female 3661
Library of Congress authority ID female 3493
SHARE Catalogue author ID male 3425
GND ID female 3419
Vatican Library VcBA ID male 3129
Unione Romana Biblioteche Scientifiche ID male 3092
NUKAT ID female 2994
American Academy in Rome ID male 2934
IxTheo authority ID male 2774
NORAF ID male 2580
Open Library ID male 2412
Nationale Thesaurus voor Auteurs ID female 2368
Bibliothèque nationale de France ID female 2212
National Library of Israel J9U ID male 2137
abART person ID male 2089
Vatican Library ID (former scheme) male 2033
CONOR.SI ID male 1802
NKCR AUT ID female 1746
Kallías ID male 1571
Deutsche Bi

### Phase 2a: Professions and Occupations

Total number of other jobs

In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
number_jobs = """
SELECT ?genderLabel (COUNT(?job) AS ?count_job)
WHERE 
{ 

  ?human wdt:P21 ?gender
  ; wdt:P106 wd:Q1792450
  ; wdt:P106 ?job
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

}
GROUP BY ?genderLabel
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(number_jobs)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()


# manipulate the result
for result in results["results"]["bindings"]:
    print(result["genderLabel"]["value"], result["count_job"]["value"])
print("🍩")
  



male 27557
female 11080
non-binary 7
🍩


In [142]:
#QUERY MALE

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
male_job_list = """

SELECT ?jobLabel (COUNT(?human) AS ?tot)

WHERE 
{ 
  ?human wdt:P21 wd:Q6581097
  ; wdt:P106 wd:Q1792450
  ; wdt:P106 ?job
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?jobLabel 
ORDER BY DESC(?tot)
LIMIT 21
"""

# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(male_job_list)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
male_job_list_result = sparql_wd.query().convert()


with open('male_jobs.csv', mode='w') as my_file:
    my_writer = csv.writer(my_file, delimiter=',', quoting=csv.QUOTE_ALL)
    # write the column names
    my_writer.writerow(['Job', 'Total'])
    for result in male_job_list_result ["results"]["bindings"]:
        my_writer.writerow([result["jobLabel"]["value"], result["tot"]["value"].strip()])
        


# parse the csv into a dataframe
df = pd.read_csv("male_jobs.csv")
# print the first 15 rows
df.head(20)




Unnamed: 0,Job,Total
0,art historian,11139
1,university teacher,1804
2,writer,1056
3,archaeologist,1033
4,historian,1003
5,painter,640
6,curator,552
7,art critic,497
8,exhibition curator,415
9,journalist,385


In [143]:
QUERY FEMALE

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
female_job_list = """

SELECT ?jobLabel (COUNT(?human) AS ?tot)

WHERE 
{ 
  ?human wdt:P21 wd:Q6581072 
  ; wdt:P106 wd:Q1792450
  ; wdt:P106 ?job
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?jobLabel 
ORDER BY DESC(?tot)
LIMIT 21
"""

# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(female_job_list)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
female_job_list_result = sparql_wd.query().convert()


with open('female_jobs.csv', mode='w') as my_file:
    my_writer = csv.writer(my_file, delimiter=',', quoting=csv.QUOTE_ALL)
    # write the column names
    my_writer.writerow(['Job', 'Total'])
    for result in female_job_list_result ["results"]["bindings"]:
        my_writer.writerow([result["jobLabel"]["value"], result["tot"]["value"].strip()])
        


# parse the csv into a dataframe
df = pd.read_csv("female_jobs.csv")
# print the first 15 rows
df.head(20)





Unnamed: 0,Job,Total
0,art historian,5698
1,university teacher,487
2,historian,418
3,exhibition curator,391
4,curator,385
5,writer,379
6,archaeologist,188
7,art critic,164
8,journalist,139
9,author,117


In [145]:
#QUERY NON BINARY

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
non_binary_job_list = """

SELECT ?jobLabel (COUNT(?human) AS ?tot)

WHERE 
{ 
  ?human wdt:P21 wd:Q48270 
  ; wdt:P106 wd:Q1792450
  ; wdt:P106 ?job
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?jobLabel 
ORDER BY DESC(?tot)
LIMIT 21
"""

# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(non_binary_job_list)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
non_binary_job_list_result = sparql_wd.query().convert()


with open('non_binary_jobs.csv', mode='w') as my_file:
    my_writer = csv.writer(my_file, delimiter=',', quoting=csv.QUOTE_ALL)
    # write the column names
    my_writer.writerow(['Job', 'Total'])
    for result in non_binary_job_list_result ["results"]["bindings"]:
        my_writer.writerow([result["jobLabel"]["value"], result["tot"]["value"].strip()])
        


# parse the csv into a dataframe
df = pd.read_csv("non_binary_jobs.csv")
# print the first 15 rows
df.head(20)





Unnamed: 0,Job,Total
0,writer,1
1,professor,1
2,art historian,1
3,journalist,1
4,film critic,1
5,literary critic,1
6,essayist,1


It is lear that both man and women after art historian is university teacher. So we wanted to take a look at the property related to university degree for both genders

In [147]:
ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
Academic_Degree = """
SELECT ?degreeLabel ?gender ?genderLabel (count(?human) as ?count)
WHERE
{ 
  ?human wdt:P31 wd:Q5
  ; wdt:P21 ?gender
  ; wdt:P106/wdt:P279* wd:Q1792450 
  ; wdt:P512 ?degree
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}

GROUP BY ?degreeLabel ?gender ?genderLabel
ORDER BY ?gender DESC(?count)
"""

# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(Academic_Degree)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
Academic_Degree_result = sparql_wd.query().convert()

for result in results["results"]["bindings"]:
    print(result["degreeLabel"]["value"], result["genderLabel"]["value"],result["count"]["value"])



TypeError: 'method' object is not subscriptable


### Phase 2b: Personal Relationships
Are men or women more likely to have personal relationships listed? What kinds of relationships appear?

Below query shows all personal relationship properties and how often they're used. I think it's super weird that "relative" is used exclusively in women's profiles.

In [20]:
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
my_SPARQL_query = """
# Make a list of the most used authority control properties for people for art historians by gender
SELECT ?propertyLabel ?genderLabel ?count WHERE {
  {
    select distinct?gender ?propertyclaim (COUNT(*) AS ?count) where {
      ?item wdt:P106/wdt:P279* wd:Q1792450  .
      ?item wdt:P31 wd:Q5 .
      ?item wdt:P21 ?gender .
      ?item ?propertyclaim [] .
    } group by ?propertyclaim ?gender
  }
  #?property wikibase:propertyType wikibase:ExternalId .
  ?property wdt:P31 wd:Q22964231 .
  ?property wikibase:claim ?propertyclaim .
  SERVICE wikibase:label {            # ... include the labels
    bd:serviceParam wikibase:language "en" .
  }
} ORDER BY DESC (?count)
#LIMIT 100
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(my_SPARQL_query)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
results = sparql_wd.query().convert()

# manipulate the result
for result in results["results"]["bindings"]:
    print(result["propertyLabel"]["value"], result["genderLabel"]["value"], result["count"]["value"])
print("👶")

child male 798
father male 736
sibling male 624
spouse male 591
spouse female 378
father female 241
mother male 213
child female 161
relative male 156
sibling female 132
mother female 102
relative female 40
number of children male 30
unmarried partner male 23
number of children female 16
unmarried partner female 4
stepparent male 2
godparent male 1
godparent female 1
number of children non-binary 1
👶


# Phase 2c: Academic Relationships: 

Query to see if there is any significant difference in the academic interaction between art historians depending on gender. 

In [155]:
ssl._create_default_https_context = ssl._create_unverified_context

# get the endpoint API
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# prepare the query
Academic_Relationship = """

SELECT ?historiangenderLabel ?studentgenderLabel (count(?student) as ?numberstudent)
WHERE
{
?historian wdt:P31 wd:Q5
; wdt:P21 ?historiangenderLabel
; wdt:P106/wdt:P279* wd:Q1792450
; wdt:P185 ?student.
?student wdt:P21 ?studentgenderLabel
}



GROUP BY ?studentgenderLabel ?historiangenderLabel
"""
# set the endpoint 
sparql_wd = SPARQLWrapper(wikidata_endpoint)
# set the query
sparql_wd.setQuery(Academic_Relationship)
# set the returned format
sparql_wd.setReturnFormat(JSON)
# get the results
Academic_Relationship_results = sparql_wd.query().convert()

for result in Academic_Relationship_results["results"]["bindings"]:
    print(result["studentgenderLabel"]["value"], result["historiangenderLabel"]["value"], result["numberstudent"]["value"])



http://www.wikidata.org/entity/Q6581097 http://www.wikidata.org/entity/Q6581097 161
http://www.wikidata.org/entity/Q6581072 http://www.wikidata.org/entity/Q6581097 88
http://www.wikidata.org/entity/Q6581097 http://www.wikidata.org/entity/Q6581072 21
http://www.wikidata.org/entity/Q6581072 http://www.wikidata.org/entity/Q6581072 16
http://www.wikidata.org/entity/Q6581072 http://www.wikidata.org/entity/Q48270 4
http://www.wikidata.org/entity/Q6581097 http://www.wikidata.org/entity/Q48270 1
