In [175]:
import pandas as pd
import pickle as pkl
import time
from collections import Counter
import numpy as np
import matplotlib.pylab as plt
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [176]:
authors = pd.read_csv("data/final_authors_after_properties.csv", low_memory=False, index_col=0).fillna("''")

In [177]:
authors.columns

Index(['ISBN', 'title', 'author', 'alt_title', 'alt_author', 'correct_author',
       'alt_first_author', 'viaf_id', 'QID', 'sexuality', 'country',
       'language', 'religion', 'gender', 'ethnicity', 'birthyear',
       'deathyear'],
      dtype='object')

# Properties QIDs to labels

## Sort out

In [178]:
authors.columns[9:]

Index(['sexuality', 'country', 'language', 'religion', 'gender', 'ethnicity',
       'birthyear', 'deathyear'],
      dtype='object')

In [179]:
for col in authors.columns[9:]:
    authors[col] = authors[col].apply(lambda d: d.split('[')[1][1:] if d[0]=="[" else d).apply(lambda d: d.split(']')[0][:-1] if d[-1]=="]" else d).str.replace("'","")

In [180]:
property_labels = {}
for col in authors.columns[9:-2]:
    property_labels[col] = []
    unique_values = authors[col].unique()
    for value in unique_values:
        if value!="":
            values = value.split(", ")
            for val in values:
                if val not in property_labels[col]:
                    property_labels[col].append(val)

## Turn qid to label

In [152]:
start = time.time()
for col in ["religion","gender","ethnicity"]:
    new_vals = []
    for val in property_labels[col]:
        prop_dict = get_entity_dict_from_api(val)
        try:
            new_val = prop_dict["labels"]["en"]["value"]      
        except:
            # take the first if english doesn't exist
            print(prop_dict["labels"])
            new_val = list(prop_dict["labels"].values())[0]["value"]
        new_vals.append(new_val)
    property_labels[col].append(new_vals)
    print(time.time() - start)
    print(col)

{'de': {'language': 'de', 'value': 'Surmang-Kagyü'}}
57.503150939941406
religion
60.886780738830566
gender
{'be': {'language': 'be', 'value': 'Чэхі, племя'}, 'ru': {'language': 'ru', 'value': 'Чехи (племя)'}, 'cs': {'language': 'cs', 'value': 'Čechové'}, 'ro': {'language': 'ro', 'value': 'cehi'}}
{'es': {'language': 'es', 'value': 'Wasco'}, 'ca': {'language': 'ca', 'value': 'wasco'}, 'vec': {'language': 'vec', 'value': 'wasco'}}
136.00067710876465
ethnicity


In [154]:
for col in property_labels.keys():
    vals = property_labels[col]
    rang = len(vals) - 1
    label_list = vals[-1]
    label_dict = {}
    for i in range(rang):
        label_dict[vals[i]] = label_list[i]
    property_labels[col] = label_dict
    print(col)

sexuality
country
language
religion
gender
ethnicity


In [158]:
with open("data/property_dict.pkl", "wb") as f:
    pkl.dump(property_labels,f)

In [182]:
with open("data/property_dict.pkl", "rb") as f:
    property_labels = pkl.load(f)

In [183]:
property_labels

{'sexuality': {'Q6636': 'homosexuality',
  'Q6649': 'lesbianism',
  'Q339014': 'non-heterosexuality',
  'Q43200': 'bisexuality',
  'Q592': 'gay',
  'Q1035954': 'heterosexuality',
  'Q724351': 'asexuality',
  'Q271534': 'pansexuality'},
 'country': {'Q16': 'Canada',
  'Q30': 'United States of America',
  'Q38': 'Italy',
  'Q21': 'England',
  'Q145': 'United Kingdom',
  'Q183': 'Germany',
  'Q174193': 'United Kingdom of Great Britain and Ireland',
  'Q161885': 'Great Britain',
  'Q27': 'Republic of Ireland',
  'Q142': 'France',
  'Q2305208': 'Russian Socialist Federative Soviet Republic',
  'Q15180': 'Soviet Union',
  'Q713750': 'West Germany',
  'Q159': 'Russia',
  'Q34266': 'Russian Empire',
  'Q207272': 'Second Polish Republic',
  'Q129286': 'British India',
  'Q1775277': 'Dominion of India',
  'Q668': 'India',
  'Q148540': 'Republic of Florence',
  'Q25': 'Wales',
  'Q739': 'Colombia',
  'Q408': 'Australia',
  'Q45': 'Portugal',
  'Q35': 'Denmark',
  'Q29': 'Spain',
  'Q2227570': 'Du

In [169]:
for col in authors.columns[9:-2]:
    lista = authors[col]
    new_lista = []
    for item in lista:
        if item!="":
            items = item.split(", ")
            new_items = []
            for it in items:
                new_it = property_labels[col][it]
                new_items.append(new_it)
            new_items = ", ".join(new_items)
        else: 
            new_items = ""
        new_lista.append(new_items)
    authors[col] = new_lista
    print(col)

sexuality
country
language
religion
gender
ethnicity


In [172]:
authors["gender"].unique()

array(['male', 'female', '', 'transgender female', 'non-binary, female',
       'transgender male', 'transgender female, female', 'non-binary',
       'male, female', 'genderfluid, male', 'female, male', 'intersex',
       'intersex, non-binary', 'female, intersex', 'transfeminine, male'],
      dtype=object)

In [184]:
len(authors)

39642

# DONE!

In [174]:
authors.to_csv("data/final_authors_after_properties_and_decoding.csv")