## Cards

In [1]:
import os
import json
import uuid
import pandas as pd
from edtf import text_to_edtf
import datetime
import shutil
import csv
from urllib.parse import urlparse

# Directory layout; every path is relative to the notebook's working directory.
lookup_path = '../lookup/'  # lookup CSVs read by lookup_column_value
lookup_aspace_path = '../../../lookup/'  # shared lookup CSVs (aspace/ files, relation types)
source_path = '../source/'  # semicolon-delimited source CSVs, one per dataset
mappings_path = '../mappings/'  # JSON node mappings and static package settings
packages_path = '../packages/'  # JSON describing resource models, packages and datasets
save_path = '../../../../digipolis-arches-shoku-pkg/source/'  # root folder for the generated card CSVs


## Lookups

In [2]:
# Agent and relation-type lookup tables, read once when this cell runs.
# NOTE(review): neither frame is referenced by name in the visible cells below
# (make_actors / make_subjects read their own CSVs) - confirm they are still needed.
lookup_imaginary_df = pd.read_csv(lookup_aspace_path + 'aspace/as2arches_agent.csv')
lookup_relation_df = pd.read_csv(lookup_aspace_path + 'isaar_relatie_types.csv')

In [3]:
# JSON files are read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.

# Node mappings per card, keyed by card name (for example 'e73_Single').
with open(mappings_path + 'mmb_mappings_e73.json', encoding='utf-8') as f:
    multi_value_mappings = json.load(f)

# Resource model -> package -> list of dataset names to process.
with open(packages_path + 'resource_models_mmb.json', encoding='utf-8') as f:
    resource_model_list = json.load(f)

# Static settings per package ('ark_url', 'naan') used to build ARK identifiers.
with open(mappings_path + 'static.json', encoding='utf-8') as f:
    static_list = json.load(f)

## Functions

In [4]:
# Lookup tables already read from disk, keyed by file name, so that every CSV is
# parsed once instead of once per looked-up value.
_LOOKUP_TABLES = {}


def lookup_column_value(code, lookup_file, card, mmb_id, source_field):
    """Translate a source code into its concept name using a lookup CSV.

    Parameters
    ----------
    code : str
        Value searched for in the ``code`` column of the lookup table.
    lookup_file : str
        File name of the lookup CSV, relative to ``lookup_path``.
    card, source_field : str
        Not used by the lookup itself; kept so existing call sites keep working.
    mmb_id : str
        Source record id, printed when the code cannot be resolved.

    Returns
    -------
    The ``concept`` of the first row whose ``code`` equals ``code``. When there is
    no such row (or the table lacks the expected columns) the miss is printed and
    the placeholder ``'<code> NOT FOUND IN: <lookup_file>'`` is returned.
    """
    lookup_df = _LOOKUP_TABLES.get(lookup_file)
    if lookup_df is None:
        lookup_df = pd.read_csv(lookup_path + lookup_file)
        _LOOKUP_TABLES[lookup_file] = lookup_df

    try:
        return lookup_df[lookup_df['code'] == code]['concept'].iloc[0]
    except (IndexError, KeyError):
        # IndexError: no row matches; KeyError: 'code'/'concept' column missing.
        # %-formatting keeps the message intact even when code is not a str.
        print(mmb_id, ',', '%s NOT FOUND IN,%s' % (code, lookup_file))
        return '%s NOT FOUND IN: %s' % (code, lookup_file)

In [5]:
def add_ark_identifier_by_row_value(package, uuid):
    """Build the ARK URL of a resource from the static settings of its package.

    ``package`` selects the entry of ``static_list`` that supplies ``ark_url`` and
    ``naan``; ``uuid`` is appended as the last path segment.
    """
    # NOTE: the ``uuid`` parameter shadows the ``uuid`` module inside this function.
    ark_url = static_list[package]['ark_url']
    naan = static_list[package]['naan']
    return '%s/ark:/%s/%s' % (ark_url, naan, uuid)

In [6]:
def save_mapped_set(source_df, _package, _resource_model, _card):
    """Write a mapped card frame to ``<save_path>/<package>/<resource model>/<card>.csv``.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Rows to write; the index is not written.
    _package : str
        Package name; the part after the first underscore is used as folder name
        (``'pkg_mmb'`` -> ``'mmb'``). A name without an underscore raises IndexError.
    _resource_model : str
        Sub-folder inside the package folder.
    _card : str
        File name without the ``.csv`` extension.

    Returns
    -------
    The string ``'Saved OK'``.
    """
    target_dir = os.path.join(save_path, _package.split('_')[1], _resource_model)
    # Create the folder on first use instead of failing inside to_csv.
    os.makedirs(target_dir, exist_ok=True)
    source_df.to_csv(os.path.join(target_dir, '%s.csv' % _card), index=False)
    return 'Saved OK'


In [7]:
def get_constant_type(mapping, source_key):
    """Return the ``constants`` of the first node mapping that reads ``source_key``.

    ``mapping`` is a card mapping with a ``node_mappings`` list. ``None`` is
    returned when no node mapping has ``source_key`` as its ``from`` value.
    """
    matching_constants = (
        candidate['constants']
        for candidate in mapping['node_mappings']
        if candidate['from'] == source_key
    )
    return next(matching_constants, None)

In [8]:
def add_def_dict(obj, card=None):
    """Return the default columns (``ResourceID`` and ``card``) for a mapped row.

    Parameters
    ----------
    obj : dict
        Source record; must contain ``ResourceID``.
    card : str, optional
        Card name to store. When omitted, the record's own ``card`` entry is used
        (``None`` if the record has none). The function no longer depends on a
        global variable named ``card`` being defined by another cell.

    Returns
    -------
    dict with the keys ``ResourceID`` and ``card``, in that order.
    """
    if card is None:
        card = obj.get('card')
    return {'ResourceID': obj['ResourceID'], 'card': card}

## Single

In [9]:
def make_single(source_list, resource_model, package, dataset, card):
    """Map the single-valued fields of every record to one row and save the card CSV.

    Each row starts with ``ResourceID``, the fixed card value ``'AA'``, ``mmb_id``,
    order ``'0'`` and the ARK ``Object Identifier``. The node mappings of ``card``
    are then applied: mappings with ``constants`` copy the constants plus the
    (non-empty) source value, ``column_lookup`` functions translate the (non-empty)
    source value, and plain mappings copy the value as is. ``Record Type`` follows
    from ``dataset``; for the ``Tekstdrager`` resource model a non-Dutch title gets
    ``Title Type`` ``'titles proper'``.

    Returns ``'<card> saved: <number of rows>'``; nothing is written for zero rows.
    """
    card_mapping = multi_value_mappings[card]
    record_type_by_dataset = {
        'Iconografie': 'Tekeningen',
        'Brief': 'Brieven',
        'Tekstdrager': 'Documenten',
    }
    rows = []

    for obj in source_list:
        row = {
            'ResourceID': obj['ResourceID'],
            'card': 'AA',
            'mmb_id': obj['mmb_id'],
            'order': '0',
            'Object Identifier': add_ark_identifier_by_row_value(package, obj['ResourceID']),
        }

        for node_mapping in card_mapping['node_mappings']:
            source_key = node_mapping['from']
            if source_key not in obj:
                continue

            if 'constants' in node_mapping:
                # Constants only accompany a value that is actually filled in.
                if len(obj[source_key]) > 0:
                    row.update(get_constant_type(card_mapping, source_key))
                    row[node_mapping['to']] = obj[source_key]
            elif 'function' in node_mapping:
                if node_mapping['function']['name'] == 'column_lookup' and len(obj[source_key]) > 0:
                    row[node_mapping['to']] = lookup_column_value(
                        obj[source_key],
                        node_mapping['function']['args']['lookup_file'],
                        card, obj['mmb_id'], source_key)
            else:
                row[node_mapping['to']] = obj[source_key]

        if dataset in record_type_by_dataset:
            row['Record Type'] = record_type_by_dataset[dataset]

        if resource_model == 'Tekstdrager' and row['Title Language'] != 'Dutch (language)':
            row['Title Type'] = 'titles proper'

        rows.append(row)

    source_df = pd.DataFrame(rows)
    number_of_records = len(rows)
    if number_of_records > 0:
        save_mapped_set(source_df, package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## e73_ExternalRelations

In [10]:
def get_relation_url_type(_domain):
    """Return the URL type label for a ``scheme://host`` prefix.

    Two prefixes have a dedicated label; every other value is labelled as a
    generic uniform resource locator.
    """
    known_types = {
        'https://rkd.nl': 'RKDimages',
        'https://dams.antwerpen.be': 'DAMS Archief',
    }
    return known_types.get(_domain, 'uniform resource locators')

def _external_relation_row(obj, card, source_field, order, url):
    """Build one external-relation row for ``url`` found in ``source_field`` of ``obj``."""
    url_parts = urlparse(url)
    prefix = '%s://%s' % (url_parts.scheme, url_parts.netloc)
    # The query string is kept as part of the relative URL.
    if len(url_parts.query) > 0:
        relative_url = url_parts.path + '?' + url_parts.query
    else:
        relative_url = url_parts.path
    return {
        'ResourceID': obj['ResourceID'],
        'card': card,
        'mmb_id': obj['mmb_id'],
        'order': order,
        'source_field': source_field,
        'External Relation URL Prefix': prefix,
        'External Relation URL': relative_url,
        'External Relation': 'gerelateerd object',
        'External Relation URL Type': get_relation_url_type(prefix),
    }


def make_external_relations(source_list, resource_model, package, dataset, card):
    """Turn the URL fields of every record into external-relation rows and save them.

    Two kinds of source fields are handled:

    * ``gerelateerde objecten`` - a comma separated list of URLs; each URL becomes
      a row with order ``g_o_<position>`` (1-based position in the list).
    * fields whose name starts with ``externe relaties`` - one URL per field; the
      order is the third space-separated word of the field name, or ``'1'`` when
      the name has no third word.

    Empty values and blank list entries are skipped. An empty ``source_list`` is
    accepted and produces no file.

    Returns ``'<card> saved: <number of rows>'``.
    """
    rows = []

    for obj in source_list:
        for key, value in obj.items():
            if not value or not value.strip():
                continue

            if key == 'gerelateerde objecten':
                for position, url in enumerate(value.split(','), start=1):
                    url = url.strip()
                    if not url:
                        # e.g. a trailing comma; would otherwise yield a '://' row
                        continue
                    rows.append(_external_relation_row(obj, card, key, 'g_o_' + str(position), url))

            elif key.startswith('externe relaties'):
                name_parts = key.split(' ')
                order = name_parts[2] if len(name_parts) > 2 else '1'
                rows.append(_external_relation_row(obj, card, key, order, value.strip()))

    number_of_records = len(rows)
    if number_of_records > 0:
        save_mapped_set(pd.DataFrame(rows), package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## Make Material Records

In [11]:
def make_material_records(source_list, resource_model, package, dataset, card):
    """Link every record to its material record and save both resulting tables.

    For each record and each node mapping whose ``from`` field is present in the
    record, one card row is produced whose target column holds a JSON relation
    string pointing at ``ResourceID_mat`` (with a fresh random
    ``resourceXresourceId``), plus one 'is related to' row for the
    ``RelatedResources`` file. Both files are written even when there are no rows.

    Returns ``'<card> saved: <number of card rows>'``.
    """
    mapping = multi_value_mappings[card]
    card_rows = []
    relation_rows = []

    for usr in source_list:
        for node_mapping in mapping['node_mappings']:
            if node_mapping['from'] not in usr:
                continue

            card_row = {
                'ResourceID': usr['ResourceID'],
                'ResourceID_mat': usr['ResourceID_mat'],
                'mmb_id': usr['mmb_id'],
                'mmb_id_mat': usr['mmb_id_mat'],
                'card': card,
                'order': 1,
            }
            card_row[node_mapping['to']] = '[{"resourceId": "%s", "ontologyProperty": "", "resourceXresourceId": "%s", "inverseOntologyProperty": ""}]' % (card_row['ResourceID_mat'] , uuid.uuid4())
            card_rows.append(card_row)

            relation_rows.append({
                'resourceinstanceidfrom': usr['ResourceID'],
                'resourceinstanceidto': usr['ResourceID_mat'],
                'relationshiptype': 'is related to',
                'datestarted': '',
                'dateended': '',
                'notes': '',
            })

    source_df = pd.DataFrame(card_rows)
    related_resource_df = pd.DataFrame(relation_rows)

    save_mapped_set(source_df, package, resource_model, card)
    save_mapped_set(related_resource_df, package, resource_model, 'RelatedResources')

    return '%s saved: %s' % (card, len(source_df))

## e73_Languages

In [12]:
def make_languages(source_list, resource_model, package, dataset, card):
    """Map the language fields of every record to rows and save the card CSV.

    For each record and each node mapping whose source field is filled in, one
    row is produced with ``ResourceID``, ``card`` and ``source_field``. When the
    node mapping declares a ``column_lookup`` function, the looked-up concept is
    stored in the mapping's target column.

    The ``card`` column is taken from the ``card`` argument, so the result does
    not depend on a global variable left behind by another cell. An empty
    ``source_list`` is accepted and produces no file.

    Returns ``'<card> saved: <number of rows>'``.
    """
    mappings = multi_value_mappings[card]
    rows = []

    for obj in source_list:
        for node_mapping in mappings['node_mappings']:
            source_field = node_mapping['from']
            if not obj.get(source_field):
                # field absent or empty for this record
                continue

            row = {
                'ResourceID': obj['ResourceID'],
                'card': card,
                'source_field': source_field,
            }
            function = node_mapping.get('function')
            if function is not None and function['name'] == 'column_lookup':
                row[node_mapping['to']] = lookup_column_value(
                    obj[source_field],
                    function['args']['lookup_file'],
                    card, obj['mmb_id'], source_field)
            rows.append(row)

    number_of_records = len(rows)
    if number_of_records > 0:
        save_mapped_set(pd.DataFrame(rows), package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## Creation Places

In [13]:
def make_places(source_list, resource_model, package, dataset, card):
    """Map the place fields of every record to rows, attach the widget JSON and save.

    For each record and each node mapping whose source field is present in the
    record, one row is produced with ``ResourceID``, ``mmb_id``, ``card``,
    order ``1`` and the source value in the mapping's target column. The
    ``Creation Place Widget`` column holds the ``json`` value of the row in
    ``aspace/as2arches_place.csv`` whose ``archesID`` equals the source value
    (missing when there is no such row).

    Compared with the earlier version, records that lack the source field no
    longer produce empty rows, the widget is derived from each row's own value
    rather than from whichever node mapping happened to be last in the loop, and
    an empty ``source_list`` is accepted (nothing is read or written).

    NOTE(review): the driver cell currently has the call to this function
    commented out.

    Returns ``'<card> saved: <number of rows>'``.
    """
    mappings = multi_value_mappings[card]
    rows = []
    place_ids = []

    for obj in source_list:
        for node_mapping in mappings['node_mappings']:
            source_field = node_mapping['from']
            if source_field not in obj:
                continue
            rows.append({
                'ResourceID': obj['ResourceID'],
                'mmb_id': obj['mmb_id'],
                'card': card,
                'order': 1,
                node_mapping['to']: obj[source_field],
            })
            place_ids.append(obj[source_field])

    number_of_records = len(rows)
    if number_of_records > 0:
        lookup_places_df = pd.read_csv(lookup_aspace_path + 'aspace/as2arches_place.csv')
        widget_by_place = lookup_places_df.set_index('archesID')['json']

        source_df = pd.DataFrame(rows)
        source_df['Creation Place Widget'] = pd.Series(place_ids, index=source_df.index).map(widget_by_place)
        save_mapped_set(source_df, package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## Creation Timespans

In [14]:
def make_time_spans(source_list, resource_model, package, dataset, card):
    """Map ``begindatum`` / ``einddatum`` of every record to a time-span row and save.

    A record yields a row when at least one of the two dates is filled in. In
    both dates every ``x`` is replaced by ``u`` and a trailing ``-uu-uu`` (month
    and day both unknown) is removed, so ``19xx-xx-xx`` becomes ``19uu``.

    A record without one of the date fields is treated as having an empty date,
    and an empty ``source_list`` is accepted (nothing is written).

    Returns ``'<card> saved: <number of rows>'``.
    """
    rows = []

    for obj in source_list:
        begin_date = obj.get('begindatum') or ''
        end_date = obj.get('einddatum') or ''
        if not begin_date and not end_date:
            continue

        rows.append({
            'ResourceID': obj['ResourceID'],
            'mmb_id': obj['mmb_id'],
            'card': card,
            'order': 1,
            'Start Date': begin_date.replace('x', 'u').replace('-uu-uu', ''),
            'End Date': end_date.replace('x', 'u').replace('-uu-uu', ''),
        })

    number_of_records = len(rows)
    if number_of_records > 0:
        save_mapped_set(pd.DataFrame(rows), package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## Creation Actors

In [15]:
def add_def_actor_dict(obj, card, order):
    """Return the default columns shared by every actor and subject row.

    The result holds ``ResourceID`` and ``mmb_id`` copied from ``obj`` followed by
    the given ``card`` and ``order``.
    """
    return {
        'ResourceID': obj['ResourceID'],
        'mmb_id': obj['mmb_id'],
        'card': card,
        'order': order,
    }

In [16]:
def make_actors(source_list, resource_model, package, dataset, card):
    """Map actors 1-3 of every record to creation-actor rows and save the card CSV.

    Actors whose relation type is ``onderwerp`` are skipped here. For the others:

    * with an ``au:code`` - the row gets ``au_code`` (code plus ``':1'``) and
      ``au_name``; the widget JSON is looked up in ``aspace/as2arches_merged.csv``
      by ``archesID``; when no widget is found the note becomes
      ``'No match for <code>, <role>, <name>'``.
    * with only a name - the row gets ``au_name`` and the note ``'<name> (<relation>)'``.

    A non-empty relation type is translated to ``Creation Actor Role`` through
    ``isaar_relatie_types.csv``.

    The function returns without reading or writing anything when no actor row
    results, and it copes with a result that contains only name-only actors
    (no ``au_code`` column). Role lookups are no longer done for skipped actors.

    Returns ``'<card> saved: <number of rows>'``.
    """
    rows = []

    for obj in source_list:
        for actor in ('1', '2', '3'):
            code_key = 'au:code actor ' + actor
            if code_key not in obj:
                continue

            au_code = obj[code_key] or ''
            au_name = obj.get('naam actor ' + actor) or ''
            relation = obj.get('type relatie ' + actor) or ''
            if (not au_code and not au_name) or relation == 'onderwerp':
                continue

            row = add_def_actor_dict(obj, card, actor)
            if au_code:
                row['au_code'] = au_code + ':1'
                row['au_name'] = au_name
            else:
                row['au_name'] = au_name
                row['Creation Actor Note'] = '%s (%s)' % (au_name, relation)
            if relation:
                row['Creation Actor Role'] = lookup_column_value(
                    relation, 'isaar_relatie_types.csv',
                    card, obj['mmb_id'], 'type relatie ' + actor)
            rows.append(row)

    number_of_records = len(rows)
    if number_of_records == 0:
        return '%s saved: %s' % (card, number_of_records)

    source_df = pd.DataFrame(rows)
    lookup_agents_df = pd.read_csv(lookup_aspace_path + 'aspace/as2arches_merged.csv')
    widget_by_agent = lookup_agents_df.set_index('archesID')['json']

    if 'au_code' in source_df.columns:
        au_codes = source_df['au_code']
    else:
        # only name-only actors: there is nothing to look up
        au_codes = pd.Series(index=source_df.index, dtype=object)
    source_df['Creation Actor Widget'] = au_codes.map(widget_by_agent)

    # Flag coded actors for which the lookup has no widget.
    unmatched = au_codes.notna() & source_df['Creation Actor Widget'].isna()
    for idx, row in source_df[unmatched].iterrows():
        source_df.loc[idx, 'Creation Actor Note'] = 'No match for %s, %s, %s' % (
            row['au_code'][:-2], row.get('Creation Actor Role', ''), row['au_name'])

    save_mapped_set(source_df, package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)
            


## Connected Subjects

In [17]:
def make_subjects(source_list, resource_model, package, dataset, card):
    """Map actors 1-3 with relation ``onderwerp`` to connected-subject rows and save.

    Only actors whose relation type equals ``onderwerp`` are kept:

    * with an ``au:code`` - the row gets ``au_code`` (code plus ``':1'``) and
      ``au_name``; the widget JSON is looked up in ``aspace/as2arches_merged.csv``
      by ``archesID``.
    * with only a name - the row gets ``au_name``, and the name is copied to
      ``Connected Subject Note``.

    The relation type is translated to ``Connected Subject Role`` through
    ``isaar_relatie_types.csv``.

    Nothing is read or written when no subject row results, and a result that
    contains only name-only subjects (no ``au_code`` column) is handled. Role
    lookups are no longer done for actors that are not subjects.

    Returns ``'<card> saved: <number of rows>'``.
    """
    rows = []

    for obj in source_list:
        for actor in ('1', '2', '3'):
            code_key = 'au:code actor ' + actor
            if code_key not in obj:
                continue

            au_code = obj[code_key] or ''
            au_name = obj.get('naam actor ' + actor) or ''
            relation = obj.get('type relatie ' + actor) or ''
            if relation != 'onderwerp' or (not au_code and not au_name):
                continue

            row = add_def_actor_dict(obj, card, actor)
            if au_code:
                row['au_code'] = au_code + ':1'
            row['au_name'] = au_name
            row['Connected Subject Role'] = lookup_column_value(
                relation, 'isaar_relatie_types.csv',
                card, obj['mmb_id'], 'type relatie ' + actor)
            rows.append(row)

    number_of_records = len(rows)
    if number_of_records == 0:
        return '%s saved: %s' % (card, number_of_records)

    source_df = pd.DataFrame(rows)
    lookup_agents_df = pd.read_csv(lookup_aspace_path + 'aspace/as2arches_merged.csv')
    widget_by_agent = lookup_agents_df.set_index('archesID')['json']

    if 'au_code' in source_df.columns:
        au_codes = source_df['au_code']
    else:
        # only name-only subjects: there is nothing to look up
        au_codes = pd.Series(index=source_df.index, dtype=object)
    source_df['Connected Subject Widget'] = au_codes.map(widget_by_agent)

    # Subjects without a code keep their name as a note.
    nameless = au_codes.isna() & source_df['Connected Subject Widget'].isna()
    if nameless.any():
        source_df.loc[nameless, 'Connected Subject Note'] = source_df.loc[nameless, 'au_name']

    save_mapped_set(source_df, package, resource_model, card)

    return '%s saved: %s' % (card, number_of_records)

## Generate dataset

In [18]:
def get_card_keys(v):
    """List the distinct top-level source keys read by a card mapping.

    For every node mapping in ``v['node_mappings']`` the part of ``from`` before
    the first ``.`` is taken; duplicates are dropped and the order of first
    appearance is kept.
    """
    unique_keys = []
    for node_mapping in v['node_mappings']:
        top_level_key = node_mapping['from'].split('.')[0]
        if top_level_key not in unique_keys:
            unique_keys.append(top_level_key)
    return unique_keys

In [19]:
%%time

source_df = []
record_dict = {}
record_list = []
records_list = []
material_list = []
CreationTimeSpans_list = []
CreationActors_list = []
CreationPlaces_list = []
ConnectedSubjects_list = []



for resource_model, packages in resource_model_list.items():
    print(resource_model)    
    for package, datasets in packages.items():
        
        Single_list = []
        ExternalRelations_list = []
        MaterialRecords_list = []
        Languages_list = []
                
        print('- ', package)
        for dataset in datasets:
            print('   - ', dataset)

            source_file_name = source_path + dataset + '.csv'
            records = csv.DictReader(open(source_file_name, encoding='utf-8-sig'), delimiter=';')                
            not_empty = False            
            for record in records: 
                
                for card, v in multi_value_mappings.items():
                    record_dict['ResourceID'] = str(uuid.uuid5(uuid.NAMESPACE_DNS, record['recordnr']))                                    
                    record_dict['ResourceID_mat'] = str(uuid.uuid5(uuid.NAMESPACE_DNS, record['recordnr'] + "_#1"))                                    
                    record_dict['mmb_id'] = record['recordnr']
                    record_dict['mmb_id_mat'] = record['recordnr'] + '_#1'
                    record_dict['card'] = card
                    record_dict['dataset'] = dataset

                    for card_key in get_card_keys(v): 

                        if card_key in record.keys():
                            if len(record[card_key]) > 0:
                                not_empty = True
                            record_dict[card_key] = record[card_key]
                    if not_empty: # only add if some field has values
                        record_list.append(record_dict)                        
    
                    record_dict = {}
                    not_empty = False                        
                      
                                    
                    
                    if card == 'e73_Single':
                        Single_list.extend(record_list)
                    if card == 'e73_ExternalRelations':
                        ExternalRelations_list.extend(record_list)
                    if card == 'e73_MaterialRecords': 
                        MaterialRecords_list.extend(record_list)                                    
                    if card == 'e73_Languages':
                        Languages_list.extend(record_list)                                    

                    if card == 'e73_CreationTimeSpans':
                        CreationTimeSpans_list.extend(record_list)
                    if card == 'e73_CreationActors':
                        CreationActors_list.extend(record_list)
                    if card == 'e73_CreationPlaces': 
                        CreationPlaces_list.extend(record_list)                                                                
                    if card == 'e73_ConnectedSubjects':
                        ConnectedSubjects_list.extend(record_list)


                    record_list = [] 


            if len(Single_list) > 0:                
                Single = make_single(Single_list, resource_model, package, dataset, 'e73_Single')
                Single_list = []
                print('      - ', Single)   
            if len(MaterialRecords_list) > 0:
                MaterialRecords = make_material_records(MaterialRecords_list, resource_model, package, dataset, 'e73_MaterialRecords')
                MaterialRecords_list = []
                print('      - ', MaterialRecords)    
            if len(ExternalRelations_list) > 0:
                ExternalRelations = make_external_relations(ExternalRelations_list, resource_model, package, dataset, 'e73_ExternalRelations')
                ExternalRelations_list = []
                print('      - ', ExternalRelations) 
            if len(Languages_list) > 0:
                Languages = make_languages(Languages_list, resource_model, package, dataset, 'e73_Languages')
                Languages_list = []
                print('      - ', Languages)    
            if len(CreationTimeSpans_list) > 0:
                CreationTimeSpans = make_time_spans(CreationTimeSpans_list, resource_model, package, dataset, 'e73_CreationTimeSpans')
                print('      - ', CreationTimeSpans)
                CreationTimeSpans_list = []
            if len(CreationActors_list) > 0:
                CreationActors = make_actors(CreationActors_list, resource_model, package, dataset, 'e73_CreationActors')
                CreationActors_list = []
                print('      - ', CreationActors)                       
            if len(ConnectedSubjects_list) > 0:
                ConnectedSubjects = make_subjects(ConnectedSubjects_list, resource_model, package, dataset, 'e73_ConnectedSubjects')
                ConnectedSubjects_list = []
                print('      - ', ConnectedSubjects)
            #if len(CreationPlaces_list) > 0:
            #    CreationPlaces = make_places(CreationPlaces_list, resource_model, package, dataset, 'e73_CreationPlaces')
            #    print('      - ', CreationPlaces)                                                          
                
                   
print('---------') 
print(datetime.datetime.now())
%reset -f 

Iconografie
-  pkg_mmb
   -  Iconografie
      -  e73_Single saved: 19
      -  e73_MaterialRecords saved: 19
      -  e73_CreationTimeSpans saved: 18
      -  e73_CreationActors saved: 18
      -  e73_ConnectedSubjects saved: 0
Brief
-  pkg_mmb
   -  Brief
      -  e73_Single saved: 2365
      -  e73_MaterialRecords saved: 2365
      -  e73_ExternalRelations saved: 1301
      -  e73_Languages saved: 2365
      -  e73_CreationTimeSpans saved: 2160
      -  e73_CreationActors saved: 4872
      -  e73_ConnectedSubjects saved: 23
Tekstdrager
-  pkg_mmb
   -  Tekstdrager
      -  e73_Single saved: 167
      -  e73_MaterialRecords saved: 167
      -  e73_ExternalRelations saved: 7
      -  e73_Languages saved: 164
      -  e73_CreationTimeSpans saved: 126
      -  e73_CreationActors saved: 99
      -  e73_ConnectedSubjects saved: 84
---------
2023-10-31 09:39:30.481884
CPU times: user 11.5 s, sys: 1.07 s, total: 12.6 s
Wall time: 12.6 s
