# Transforming and importing the document-metadata in Wikibase

Using WikibaseIntegrator:<br>
https://pypi.org/project/wikibaseintegrator/

## Modules

In [10]:
import os
import json
from progress.spinner import Spinner
from python_wikibase import PyWikibase
import pandas as pd
import datetime

from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator import WikibaseIntegrator, wbi_login, wbi_helpers
from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_enums import WikibaseSnakType
from wikibaseintegrator.datatypes import String
from wikibaseintegrator.datatypes import Item
from wikibaseintegrator.datatypes import URL
from wikibaseintegrator.datatypes.extra import EDTF
from wikibaseintegrator.datatypes import Quantity
from wikibaseintegrator.datatypes import GlobeCoordinate

spinner = Spinner("Processing... ")
line = "----------"

## Connecting to own Wikibase
1. Change the placeholders in 'data/config.json'

In [2]:
### Reading config-data (URLs, access-tokens and -secrets)
f = open(r'C:\Users\henri\OneDrive\Desktop\Wikibase-BA\Daten-Pipeline\config.json') #Change this path to 'data/config.json'
config = json.load(f)
f.close()

### Connecting to Wikibase
wbi_config['MEDIAWIKI_API_URL'] = config['MEDIAWIKI_API_URL']
wbi_config['SPARQL_ENDPOINT_URL'] = config['SPARQL_ENDPOINT_URL']
wbi_config['WIKIBASE_URL'] = config['WIKIBASE_URL']
wbi_config['USER_AGENT'] = config['USER_AGENT']

login_instance = wbi_login.OAuth1(consumer_token=config['oauthCredentials']['consumerToken'], 
                                  consumer_secret=config['oauthCredentials']['consumerSecret'],
                                  access_token=config['oauthCredentials']['accessToken'], 
                                  access_secret=config['oauthCredentials']['accessSecret'])

wbi = WikibaseIntegrator(login=login_instance)

## Initializing script

In [15]:
wb_script = 'import_metadata' #Used as label for an already exisiting item
print("Running script: " + wb_script + " with the following dataset:")
print(line)
tbl_input = pd.read_csv('data/RIDGES_9.0_doc-metadata_test.csv') #Table of input data
print(tbl_input.info(show_counts=False))
print(line)
### Asking for confirmation
confirm = input("Correct dataset? [y/n]: ")
if confirm.__contains__('y') == True:
    print()
else:
    print("Exiting...")
    exit()

Running script: import_metadata with the following dataset:
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 27 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   title          object
 1   Column         int64 
 2   file_name      object
 3   author         object
 4   translator     object
 5   trans_from     object
 6   date           int64 
 7   place          object
 8   publisher      object
 9   bibl           object
 10  version        object
 11  editor         object
 12  edition_first  object
 13  issue          object
 14  maintopic      object
 15  register       object
 16  topic          object
 17  lingualism     object
 18  orig_date      object
 19  orig_place     object
 20  repository     object
 21  lang_type      object
 22  lang_area      object
 23  text_type      object
 24  lyric_type     object
 25  wormwood       object
 26  herb_sorting   object
dtypes: int64(2), object(25)
memory usage: 34

## Script

In [18]:
for i in range(len(tbl_input)):

    ### Creating item for import, adding claims later
    current_time = str(datetime.datetime.now())
    item = wbi.item.new()
    item.labels.set(language='en', value=(wb_script + "_" + current_time))
    item.descriptions.set(language='en', value="Imported via script: " + wb_script + " on: " + current_time)
    instance_script = Item(value='Q10', prop_nr='P2')
    data = [instance_script]
    item.claims.add(data)
    item.write()

    ### Creating item for document, adding claims later
    item = wbi.item.new()
    item.labels.set(language='en', value=tbl_input.at[i, 'file_name'])
    item.descriptions.set(language='en', value=tbl_input.at[i, 'title'])
    item.write()
    
    ###Getting QIDs for match, document and named entities (author, editor, publisher, translator, place)
    qid_script = wbi_helpers.search_entities(wb_script)
    qid_document = wbi_helpers.search_entities(tbl_input.at[i, 'file_name'])

    qid_author = wbi_helpers.search_entities(tbl_input.at[i, 'author'])
    qid_editor = wbi_helpers.search_entities(tbl_input.at[i, 'editor'])
    qid_publisher = wbi_helpers.search_entities(tbl_input.at[i, 'publisher'])
    qid_translator = wbi_helpers.search_entities(tbl_input.at[i, 'translator'])
    qid_place = wbi_helpers.search_entities(tbl_input.at[i, 'place'])        

    if bool(qid_author) == False: #False = list is empty
        item = wbi.item.new()
        item.labels.set(language='en', value=tbl_input.at[i, 'author'])
        instance_person = Item(value='Q13', prop_nr='P2')
        based_on = Item(value=qid_document[0], prop_nr='P8')
        imported_by = Item(value=qid_script[0], prop_nr='P70')
        
        data = [instance_person, based_on, imported_by]
        item.claims.add(data)
        item.write()
        #Modifiying tbl_input
        qid_author = wbi_helpers.search_entities(str(tbl_input.at[i, 'author']))
        print("list is empty - qid_author: " + str(qid_author))
        qid_author = str(qid_author).replace("['","*")
        qid_author = str(qid_author).replace("']","*")
        qid_author = qid_author.split("*")
        qid_author = qid_author[1]
        print("correct qid: " + qid_author)
        
        tbl_input.replace(str(tbl_input.at[i, 'author']),str(qid_author),inplace=True)
    else:
        tbl_input.replace(str(tbl_input.at[i, 'author']),str(qid_author),inplace=True)
    
    #print("qid_author: " + str(qid_author))
    print("tbl_input.at[i, 'author']: " + str(tbl_input.at[i, 'author']))
   
    if tbl_input.dtypes['editor'] != "float64":
        if bool(qid_editor) == False:
            item = wbi.item.new()
            item.labels.set(language='en', value=tbl_input.at[i, 'editor'])
            instance_person = Item(value='Q13', prop_nr='P2')
            based_on = Item(value=qid_document[0], prop_nr='P8')
            imported_by = Item(value=qid_script[0], prop_nr='P70')
            
            data = [instance_person, based_on, imported_by]
            item.claims.add(data)
            item.write()
            #Modifiying tbl_input
            qid_editor = wbi_helpers.search_entities(tbl_input.at[i, 'editor'])
            qid_editor = str(qid_editor).replace("['","*")
            qid_editor = str(qid_editor).replace("']","*")
            qid_editor = qid_editor.split("*")
            qid_editor = qid_editor[1]
            tbl_input.replace(str(tbl_input.at[i, 'editor']),str(qid_editor),inplace=True)
        else:
            tbl_input.replace(str(tbl_input.at[i, 'editor']),str(qid_editor),inplace=True)
         
    if tbl_input.dtypes['translator'] != "float64":
        if bool(qid_translator) == False:
            item = wbi.item.new()
            item.labels.set(language='en', value=tbl_input.at[i, 'translator'])
            instance_person = Item(value='Q13', prop_nr='P2')
            based_on = Item(value=qid_document[0], prop_nr='P8')
            imported_by = Item(value=qid_script[0], prop_nr='P70')
            
            data = [instance_person, based_on, imported_by]
            item.claims.add(data)
            item.write()
            #Modifiying tbl_input
            qid_translator = wbi_helpers.search_entities(tbl_input.at[i, 'translator'])
            qid_translator = str(qid_translator).replace("['","*")
            qid_translator = str(qid_translator).replace("']","*")
            qid_translator = qid_translator.split("*")
            qid_translator = qid_translator[1]
            tbl_input.replace(str(tbl_input.at[i, 'translator']),str(qid_translator),inplace=True)
        else:
            tbl_input.replace(str(tbl_input.at[i, 'translator']),str(qid_translator),inplace=True)
    
    if bool(qid_publisher) == False:
        item = wbi.item.new()
        item.labels.set(language='en', value=tbl_input.at[i, 'publisher'])
        instance_person = Item(value='Q13', prop_nr='P2')
        based_on = Item(value=qid_document[0], prop_nr='P8')
        imported_by = Item(value=qid_script[0], prop_nr='P70')
        
        data = [instance_person, based_on, imported_by]
        item.claims.add(data)
        item.write()
        #Modifiying tbl_input
        qid_publisher = wbi_helpers.search_entities(tbl_input.at[i, 'publisher'])
        print("qid for publisher: " + str(qid_publisher))
        qid_publisher = str(qid_publisher).replace("['","*")
        qid_publisher = str(qid_publisher).replace("']","*")
        qid_publisher = qid_publisher.split("*")
        qid_publisher = qid_publisher[1]
        tbl_input.replace(str(tbl_input.at[i, 'publisher']),str(qid_publisher),inplace=True)
    else:
        tbl_input.replace(str(tbl_input.at[i, 'publisher']),str(qid_publisher),inplace=True)

    if bool(qid_place) == False:
        item = wbi.item.new()
        item.labels.set(language='en', value=tbl_input.at[i, 'place'])
        instance_place = Item(value='Q14', prop_nr='P2')
        based_on = Item(value=qid_document[0], prop_nr='P8')
        imported_by = Item(value=qid_script[0], prop_nr='P71')
        
        data = [instance_place, based_on, imported_by]
        item.claims.add(data)
        item.write()
        #Modifiying tbl_input
        qid_place = wbi_helpers.search_entities(tbl_input.at[i, 'place'])
        qid_place = str(qid_place).replace("['","*")
        qid_place = str(qid_place).replace("']","*")
        qid_place = qid_place.split("*")
        qid_place = qid_place[1]
        tbl_input.replace(str(tbl_input.at[i, 'place']),str(qid_place),inplace=True)
    else:
        tbl_input.replace(str(tbl_input.at[i, 'place']),str(qid_place),inplace=True)


    item = wbi.item.get(entity_id=qid_document[0])
    
    data = []
    
    instance_document = Item(value='Q16', prop_nr='P2')
    data.append(instance_document)
    
    title = String(value=tbl_input.at[i, 'title'], prop_nr='P35')
    data.append(title)
    
    author = Item(value=str(tbl_input.at[i, 'author']), prop_nr='P25')
    data.append(author)
    
    if tbl_input.at[i, 'translator'] != "empty":
        translator = Item(value=tbl_input.at[i, 'translator'], prop_nr='P33')
        data.append(translator)
    
    if tbl_input.at[i, 'trans_from'] != "empty":
        trans_from = String(value=tbl_input.at[i, 'trans_from'], prop_nr='P34')
        data.append(trans_from)
        
    if tbl_input.at[i, 'date'] != "empty":
        date = EDTF(value=str(tbl_input.at[i, 'date']), prop_nr='P28')
        data.append(date)
        
    if tbl_input.at[i, 'place'] != "empty":
        place = Item(value=tbl_input.at[i, 'place'], prop_nr='P30')
        data.append(place)
    
    if tbl_input.at[i, 'publisher'] != "empty":
        publisher = Item(value=tbl_input.at[i, 'publisher'], prop_nr='P37')
        data.append(publisher)
    
    if tbl_input.at[i, 'bibl'] != "empty":
        bibl = String(value=tbl_input.at[i, 'bibl'], prop_nr='P26')
        data.append(bibl)
    
    if tbl_input.at[i, 'version'] != "empty":
        version = Quantity(value=tbl_input.at[i, 'version'], prop_nr='P43')
        data.append(version)
    
    if tbl_input.at[i, 'editor'] != "empty":
        editor = Item(value=tbl_input.at[i, 'editor'], prop_nr='P32')
        data.append(editor)
    
    if tbl_input.at[i, 'edition_first'] != "empty":
        edition_first = String(value=tbl_input.at[i, 'edition_first'], prop_nr='P44')
        data.append(edition_first)
    
    if tbl_input.at[i, 'issue'] != "empty":
        issue = Quantity(value=tbl_input.at[i, 'issue'], prop_nr='P47')
        data.append(issue)
        
    if tbl_input.at[i, 'maintopic'] != "empty":
        maintopic = String(value=tbl_input.at[i, 'maintopic'], prop_nr='P38')
        data.append(maintopic)
    
    if tbl_input.at[i, 'register'] != "empty":
        register = String(value=tbl_input.at[i, 'register'], prop_nr='P40')
        data.append(register)
    
    if tbl_input.at[i, 'topic'] != "empty":
        topic = String(value=tbl_input.at[i, 'topic'], prop_nr='P39')
        data.append(topic)
    
    if tbl_input.at[i, 'lingualism'] != "empty":
        lingualism = String(value=tbl_input.at[i, 'lingualism'], prop_nr='P50')
        data.append(lingualism)
    
    if tbl_input.at[i, 'orig_date'] != "empty":
        orig_date = EDTF(value=str(tbl_input.at[i, 'orig_date']), prop_nr='P27')
        data.append(orig_date)
    
    if tbl_input.at[i, 'orig_place'] != "empty":
        orig_place = Item(value=tbl_input.at[i, 'orig_place'], prop_nr='P29')
        data.append(orig_place)
    
    if str(tbl_input.at[i, 'repository']).__contains__('||'):
        URL_list = str(tbl_input.at[i, 'repository']).split('||')
        for c in range(len(URL_list)):
            repository = URL(value=URL_list[c], prop_nr='P36')
        data.append(repository)
    else:
        repository = URL(value=tbl_input.at[i, 'repository'], prop_nr='P36')
        data.append(repository)
    
    if tbl_input.at[i, 'lang_type'] != "empty":
        lang_type = String(value=tbl_input.at[i, 'lang_type'], prop_nr='P49')
        data.append(lang_type)
    
    if tbl_input.at[i, 'lang_area'] != "empty":
        lang_area = String(value=tbl_input.at[i, 'lang_area'], prop_nr='P48')
        data.append(lang_area)
    
    if tbl_input.at[i, 'text_type'] != "empty":
        text_type = String(value=tbl_input.at[i, 'text_type'], prop_nr='P52')
        data.append(text_type)
    
    if tbl_input.at[i, 'lyric_type'] != "empty":
        lyric_type = String(value=tbl_input.at[i, 'lyric_type'], prop_nr='P51')
        data.append(lyric_type)
    
    if tbl_input.at[i, 'wormwood'] != "empty":
        wormwood = String(value=tbl_input.at[i, 'wormwood'], prop_nr='P46')
        data.append(wormwood)
    
    if tbl_input.at[i, 'herb_sorting'] != "empty":
        herb_sorting = String(value=tbl_input.at[i, 'herb_sorting'], prop_nr='P45')
        data.append(herb_sorting)        

    document_id = String(value=tbl_input.at[i, 'file_name'], prop_nr='P31')
    data.append(document_id)
    
    imported_by = Item(value=wbi_helpers.search_entities(wb_script)[0], prop_nr='P42')
    data.append(imported_by)

    item.claims.add(data)
    item.write()
    
tbl_input.to_csv('data/RIDGES_9.0_doc-metadata_test_output.csv')

list is empty - qid_author: ['Q136']
correct qid: Q136
tbl_input.at[i, 'author']: Q136


IndexError: list index out of range