# Wikidata properties through wbgetentities

In [20]:
import sys
sys.path.append("..")

from elastic_wikidata.wd_entities import get_entities, simplify_wbgetentities_result

import pandas as pd
import re
from heritageconnector.utils.wikidata import raise_invalid_qid
from heritageconnector.disambiguation.retrieve import get_wikidata_fields

In [36]:
# TODO: split into get_properties and get_labels_for_properties
# then replace retrieve.get_wikidata_fields with this

class wbentities:
    """Fetch selected Wikidata properties for a list of entities.

    Typical use: call ``get_properties`` to populate ``doc_df`` and then
    ``get_results`` to obtain a cleaned copy for display or export.

    Instance attributes:
        timeout: timeout value forwarded to every ``get_entities`` request.
        ge: ``get_entities`` client, created lazily on first use.
        doc_df: raw dataframe built by ``get_properties`` (None until then).
        qid_label_mapping: mapping used to swap QIDs for their labels.
    """

    # A QID is the letter Q followed by digits and nothing else.
    _QID_PATTERN = re.compile(r"Q\d+")

    def __init__(self, api_timeout=6):
        self.timeout = api_timeout
        self.ge = None
        self.doc_df = None
        self.qid_label_mapping = {}

    def _client(self):
        """Return the shared ``get_entities`` client, creating it on first use."""
        if getattr(self, "ge", None) is None:
            self.ge = get_entities()
        return self.ge

    def get_properties(self, qids: list, pids: list, pids_to_label=None, page_size=50) -> None:
        """Retrieve the claims `pids` for every entity in `qids` into ``self.doc_df``.

        Args:
            qids: entity IDs to retrieve.
            pids: property IDs whose claims are kept.
            pids_to_label: None (no label lookup), a list of property IDs whose
                QID values are replaced by labels (these are retrieved in
                addition to `pids`), or the string "all" to label every
                property in `pids`.
            page_size: passed to the result generator as ``page_limit``.

        Raises:
            ValueError: if `pids_to_label` is not None, a list or "all".
        """
        # Validate before doing any network work.
        if pids_to_label is None:
            pids_all = list(pids)
        elif isinstance(pids_to_label, list):
            # dict.fromkeys de-duplicates while preserving order, since a PID
            # is commonly present in both lists.
            pids_all = list(dict.fromkeys(list(pids) + pids_to_label))
        elif isinstance(pids_to_label, str) and pids_to_label == "all":
            pids_all = list(pids)
            pids_to_label = pids_all
        else:
            raise ValueError(
                "pids_to_label must be None, a list of PIDs or the string 'all'"
            )

        proposed_cols = self._pids_to_df_cols(pids_all)

        if len(qids) == 0:
            # Nothing to request: expose an empty frame with the claim columns.
            self.doc_df = pd.DataFrame(columns=proposed_cols)
            return

        res_generator = self._client().result_generator(
            qids, page_limit=page_size, timeout=self.timeout
        )

        # Collect the results of every page, not only the first one.
        docs = []
        for page in res_generator:
            simplified = simplify_wbgetentities_result(page, lang='en', properties=pids_all)
            if isinstance(simplified, list):
                docs.extend(simplified)
            else:
                docs.append(simplified)

        doc_df = pd.json_normalize(docs)

        # add columns with empty string values for any that are missing
        for col in proposed_cols:
            if col not in doc_df.columns:
                doc_df[col] = ""

        self.doc_df = doc_df

        if pids_to_label is not None:
            self.get_labels_for_properties(pids_to_label)

    def _is_qid(self, val) -> bool:
        """Return True only if `val` is a string that is exactly one QID."""
        return isinstance(val, str) and self._QID_PATTERN.fullmatch(val) is not None

    def _pids_to_df_cols(self, pids: list) -> list:
        """Transform PIDs to doc_df column names: P79 -> claims.P79"""
        return [f"claims.{pid}" for pid in pids]

    def _replace_qid_with_label(self, v):
        """Replace QIDs with labels from qid_label_mapping.

        Accepts a single string or a list of values. Anything that is not a
        key of qid_label_mapping (including non-string values) is returned
        unchanged.
        """
        if isinstance(v, str):
            return self.qid_label_mapping.get(v, v)

        if isinstance(v, list):
            return [
                self.qid_label_mapping.get(i, i) if isinstance(i, str) else i
                for i in v
            ]

        return v

    def _copy_and_clean_df_for_export(self, df):
        """
        Return a cleaned copy of `df`; `df` itself is not modified.

        Replace one-item lists with their single item across the dataframe.
        Replace empty lists with empty strings.
        Replace nan values with empty strings.
        """

        def _clean_cell(cell):
            if isinstance(cell, list) and len(cell) == 1:
                cell = cell[0]
            if isinstance(cell, list) and len(cell) == 0:
                return ""
            return cell

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
        # whichever this pandas version provides. Both return a new frame.
        if hasattr(pd.DataFrame, "map"):
            export_df = df.map(_clean_cell)
        else:
            export_df = df.applymap(_clean_cell)

        return export_df.fillna("")

    def get_labels_for_properties(self, pids: list):
        """Replace QID values in the columns for `pids` of doc_df with labels.

        Labels are requested once for the distinct QIDs found; doc_df is left
        untouched (and no request is made) when the columns contain no QIDs.
        """
        cols = self._pids_to_df_cols(pids)

        # make set of qids to get labels for
        qids_getlabels = set()
        for col in cols:
            for value in self.doc_df[col]:
                candidates = value if isinstance(value, list) else [value]
                qids_getlabels.update(c for c in candidates if self._is_qid(c))

        # get labels if any qids were found
        if qids_getlabels:
            self.qid_label_mapping = self._client().get_labels(
                sorted(qids_getlabels), timeout=self.timeout
            )

            for col in cols:
                self.doc_df[col] = self.doc_df[col].map(self._replace_qid_with_label)

    def get_results(self):
        """Get cleaned dataframe"""

        return self._copy_and_clean_df_for_export(self.doc_df)


In [37]:
# Demo: fetch a handful of properties for three entities, have the values of
# three of those properties replaced by labels, and show the cleaned table.
qids = [
    "Q203545",
    "Q706475",
    "Q18637243",
]
pids = [
    "P31",
    "P21",
    "P735",
    "P734",
    "P1971",
    "P36",
]

ent = wbentities()
ent.get_properties(
    qids=qids,
    pids=pids,
    pids_to_label=["P734", "P735", "P36"],
)
# Last expression of the cell, so the notebook renders the cleaned dataframe.
ent.get_results()

Getting 6 wikidata documents in pages of 50


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,id,labels,descriptions,aliases,claims.P31,claims.P21,claims.P735,claims.P734,claims.P1971,claims.P36
0,Q203545,Michael Gambon,Irish-born British actor,"[Sir Michael Gambon, Michael John Gambon]",Q5,Q6581097,Michael,Gambon,,
1,Q706475,Steve McQueen,British film director and video artist,Steven Rodney McQueen,Q5,Q6581097,Steve,McQueen,2.0,
2,Q18637243,Michaela Coel,"British poet, singer-songwriter, screenwriter,...","[Michaela Ewuraba O Collinson, Michaela Collin...",Q5,Q6581072,Michaela,Coel,,
