## Cards

In [None]:
import os
import json
import uuid
import pandas as pd
import datetime
import shutil
import csv

# Directory prefix for the lookup CSV files read by lookup().
lookup_path = './lookup/'
# Directory prefix for the source dataset files opened in the "Generate dataset" cell.
source_path = './out/'
# Directory prefix for the JSON mapping definitions (e.g. mappings_geonames.json).
mappings_path = './mappings/'
# NOTE(review): save_mapped_set() reads a module-level `save_path`; while the
# assignment below stays commented out, that name is undefined.
#save_path = '../../../../digipolis-arches-shoku-pkg/source/'

## Functions

In [None]:
def resource_identifier(source_id):
    """Derive a deterministic resource id from a source identifier.

    The id is a version-5 UUID in the DNS namespace, so the same
    ``source_id`` always produces the same result.

    Args:
        source_id: Source identifier as a string.

    Returns:
        The UUID in its canonical 36-character string form.
    """
    stable_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, source_id)
    return str(stable_uuid)


def generate_uri(source_id):
    """Build the geonames.org URI for a source identifier.

    Args:
        source_id: Identifier appended to the base URL (formatted as text).

    Returns:
        ``'https://www.geonames.org/'`` followed by the identifier.
    """
    return 'https://www.geonames.org/{}'.format(source_id)


def coordinates_geo_json(source_value):
    """Prefix a coordinate value with ``POINT`` and strip its commas.

    Args:
        source_value: Coordinate text (formatted as a string if it is not one).

    Returns:
        ``'POINT '`` followed by the value, with every comma removed.
    """
    labelled = 'POINT {}'.format(source_value)
    return labelled.replace(',', '')


def coordinates_wkt(source_value):
    """Return the coordinate value as text with every comma removed.

    Args:
        source_value: Coordinate text (formatted as a string if it is not one).

    Returns:
        The formatted value without commas.
    """
    as_text = format(source_value)
    return as_text.replace(',', '')


def get_constants(node_mapping):
    """Flatten the ``constants`` entries of a node mapping into one dict.

    Args:
        node_mapping: Mapping with a ``'constants'`` key holding an iterable
            of dicts.

    Returns:
        A single dict containing every label/value pair; when a label occurs
        more than once, the last occurrence wins.
    """
    return {
        label: constant_value
        for entry in node_mapping['constants']
        for label, constant_value in entry.items()
    }


def lookup(code, lookup_file):
    """Translate a source code into a concept name using a lookup CSV.

    The file ``lookup_path + lookup_file`` is read and the ``concept`` value
    of the first row whose ``code`` column equals ``code`` is returned.

    Args:
        code: Value to search for in the ``code`` column.
        lookup_file: File name of the lookup CSV, relative to ``lookup_path``.

    Returns:
        The matching concept, or the placeholder string
        ``'<code> NOT FOUND IN <lookup_file>'`` (also printed) when there is
        no matching row or the expected columns are missing.
    """
    # NOTE(review): the CSV is re-read on every call; consider caching the
    # frame per lookup_file if this is called once per record.
    lookup_df = pd.read_csv(lookup_path + lookup_file)
    try:
        matches = lookup_df.loc[lookup_df['code'] == code, 'concept']
        return matches.iloc[0]
    except (KeyError, IndexError):
        # KeyError: 'code'/'concept' column missing; IndexError: no match.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        not_found = f'{code} NOT FOUND IN {lookup_file}'
        print(not_found)
        return not_found


## Save csv

In [None]:
def save_mapped_set(source_df, _package, _resource_model, _card, _save_path=None):
    """Write a mapped card to ``<root>/<package>/<resource_model>/<card>.csv``.

    Args:
        source_df: DataFrame to write (without its index).
        _package: Package name containing at least one underscore; the part
            after the first underscore is used as the package directory.
        _resource_model: Name of the resource-model sub-directory.
        _card: Card name, used as the CSV file name.
        _save_path: Optional root directory. When omitted, the module-level
            ``save_path`` is used, which must then be defined.

    Returns:
        The string ``'Saved OK'``.

    Raises:
        IndexError: If ``_package`` contains no underscore.
        NameError: If ``_save_path`` is omitted and ``save_path`` is undefined.
    """
    package = _package.split('_')[1]
    root = save_path if _save_path is None else _save_path

    file_name = '%s/%s/%s/%s.csv' % (root, package, _resource_model, _card)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    source_df.to_csv(file_name, index=False)
    return 'Saved OK'

In [None]:
def related_resource(source_value):
    """Build the JSON text describing a relation to another resource.

    Both ids are deterministic version-5 UUIDs in the DNS namespace: the
    resource id is derived from ``source_value`` and the relation id from
    ``source_value + 'relation'``.

    Args:
        source_value: Source identifier (string) of the related resource.

    Returns:
        A JSON array string holding one relation object with empty ontology
        properties.
    """
    namespace = uuid.NAMESPACE_DNS
    target_id = uuid.uuid5(namespace, source_value)
    link_id = uuid.uuid5(namespace, source_value + 'relation')
    template = '[{"resourceId": "%s", "ontologyProperty": "", "resourceXresourceId": "%s", "inverseOntologyProperty": ""}]'
    return template % (target_id, link_id)

## Mapper

In [None]:
def make_mapping(source_list, node_mappings, graph, dataset, card_name, card_order):
    """Apply node mappings to source records and dump the result to CSV.

    For every record, each node mapping whose ``'from'`` key is present in
    the record contributes to the mapped row:

    * ``'resource_id'``: adds ``ResourceID`` (via ``resource_identifier``),
      ``card`` and ``card_order``.
    * ``'constants'``: copies the raw value to ``'to'`` and merges the
      constants returned by ``get_constants``.
    * ``'function'``: stores the result of the named helper
      (``generate_uri``, ``lookup``, ``related_resource``,
      ``coordinates_geo_json`` or ``coordinates_wkt``) under ``'to'``; an
      unrecognised name stores nothing.
    * no ``'function'``: copies the raw value to ``'to'``.

    When at least one record was processed, the rows are written to
    ``out/<card_name>_mapped.csv``.

    Args:
        source_list: Iterable of dict records.
        node_mappings: Iterable of node-mapping dicts.
        graph: Not used by this function.
        dataset: Not used by this function.
        card_name: Card name stored on each row and used in the file name.
        card_order: Card order stored on each row.

    Returns:
        The list of mapped row dicts, one per source record.
    """
    mapped_rows = []
    for record in source_list:
        row = {}
        for rule in node_mappings:
            source_key = rule['from']
            if source_key not in record:
                continue
            raw_value = record[source_key]

            if 'resource_id' in rule:
                row['ResourceID'] = resource_identifier(str(raw_value))
                row['card'] = card_name
                row['card_order'] = card_order

            if 'constants' in rule:
                row[rule['to']] = raw_value
                row.update(get_constants(rule))

            if 'function' not in rule:
                # Plain copy when no transformation is configured.
                row[rule['to']] = raw_value
                continue

            function_name = rule['function']['name']
            if function_name == 'generate_uri':
                row[rule['to']] = generate_uri(raw_value)
            elif function_name == 'lookup':
                row[rule['to']] = lookup(raw_value, rule['function']['args']['lookup_file'])
            elif function_name == 'related_resource':
                row[rule['to']] = related_resource(raw_value)
            elif function_name == 'coordinates_geo_json':
                row[rule['to']] = coordinates_geo_json(raw_value)
            elif function_name == 'coordinates_wkt':
                row[rule['to']] = coordinates_wkt(raw_value)

        mapped_rows.append(row)

    source_df = pd.DataFrame(mapped_rows)
    if len(mapped_rows) > 0:
        source_df.to_csv(f'out/{card_name}_mapped.csv', index=False)

    return mapped_rows


#relation = '[{"resourceId": "%s", "ontologyProperty": "", "resourceXresourceId": "%s", "inverseOntologyProperty": ""}]' % (usr_dict['ResourceID_mat'] , uuid.uuid4())


## Generate dataset

In [None]:
%%time

# Read the mapping definition, collect the mapped source fields of every card
# from its dataset, then map each card with ITS OWN node mappings.

# The mappings file is closed as soon as it has been parsed.
with open(mappings_path + 'mappings_geonames.json') as f:
    mappings_dict = json.load(f)

mappings = mappings_dict['mappings']
name = mappings_dict['name']
graph = mappings_dict['graph']
print('-', name)
print('  -', graph)

# Filtered source records, accumulated per card across all mappings.
single_list = []
alternate_name_list = []
digital_reference_list = []
card_records = {
    'single': single_list,
    'alternate_name': alternate_name_list,
    'digital_reference': digital_reference_list,
}

# card_name -> (node_mappings, card_order, dataset). Remembered per card so
# that the mapping step below does not reuse whatever the last card iterated
# happened to leave in the loop variables.
card_settings = {}

# Pre-initialised so later cells can reference them even if a card is empty.
Single = []
AlternateName = []
DigitalReference = []

# TODO: add filter from mapping, e.g. skip records whose 'isolanguage' is in
# this list (currently unused).
exclude_list = ['post', 'link', 'unlc', 'wkdt', 'unic', 'iata']

for mapping in mappings:
    dataset = mapping['dataset']

    for card in mapping['cards']:
        card_name = card['card_name']
        card_order = card['card_order']
        node_mappings = card['node_mappings']

        target_list = card_records.get(card_name)
        if target_list is None:
            # Records of cards other than the three above are not kept.
            continue
        card_settings[card_name] = (node_mappings, card_order, dataset)

        # Only the source fields named in the mapping are carried over.
        node_mappings_keys = [item['from'] for item in node_mappings]

        # newline='' is what the csv module expects; the with-block closes
        # the dataset file once it has been read.
        with open(f'{source_path}{dataset}', encoding='utf-8-sig', newline='') as source_file:
            for record in csv.DictReader(source_file, delimiter=','):
                # Drop empty values (and None from short rows).
                target_list.append(
                    {key: record[key] for key in node_mappings_keys if record.get(key)}
                )

if single_list:
    node_mappings, card_order, dataset = card_settings['single']
    Single = make_mapping(single_list, node_mappings, graph, dataset, 'single', card_order)
    print('    -', "Single", len(Single))

if alternate_name_list:
    node_mappings, card_order, dataset = card_settings['alternate_name']
    AlternateName = make_mapping(alternate_name_list, node_mappings, graph, dataset, 'alternate_name', card_order)
    print('    -', "AlternateName", len(AlternateName))

if digital_reference_list:
    node_mappings, card_order, dataset = card_settings['digital_reference']
    DigitalReference = make_mapping(digital_reference_list, node_mappings, graph, dataset, 'digital_reference', card_order)
    print('    -', "DigitalReference", len(DigitalReference))

print('---------')
print(datetime.datetime.now())
#%reset -f 

## Merge Cards

In [None]:
# Combine the mapped Single and AlternateName cards into one table, ordered
# per resource and, within a resource, by card order. DigitalReference is not
# part of this merge.
Single_df = pd.DataFrame(Single)
AlternateName_df = pd.DataFrame(AlternateName)

geonames_df = pd.concat([
    Single_df,
    AlternateName_df,  # was `Alternate_df`, an undefined name (NameError)
])

sorted_geonames_df = geonames_df.sort_values(by=['ResourceID', 'card_order'])
sorted_geonames_df.to_csv('./out/geonames_mapped.csv', index=False)
# Last expression of the cell: displays the merged table in the notebook.
sorted_geonames_df