# 2. Copy content of 957 fields to new NZ fields
This process analyses the records and prepares XML data when the content of 957 fields needs to be copied into new fields. A field is copied only if a similar field doesn't already exist in the record.

For all 6XX fields, only existing GND fields count as already existing similar data.

![Use of 957 field to reimport Aleph data in NZ](records_transformation.drawio.png)

In [13]:
from lxml import etree
import os
import re
from almapiwrapper.record import JsonData, XmlData
from almapiwrapper.inventory import IzBib, NzBib, Holding, Item
from copy import deepcopy
import pandas as pd

In [14]:
def is_gnd_field(datafield):
    """Check if the field has gnd

    The field is considered a gnd field when one of its subfields $2
    contains exactly 'gnd', 'gnd-content' or 'gnd-carrier'.

    :param datafield: datafield element whose subfields $2 are inspected
    :return: True if a subfield $2 holds one of the gnd codes, else False
    """
    gnd_sources = ('gnd', 'gnd-content', 'gnd-carrier')

    # Compare the real content of $2: the mere presence of a subfield $2
    # is not enough, other vocabularies use this subfield too. An empty
    # subfield has a text of None, which is simply not in the tuple.
    return any(subfield_2.text in gnd_sources
               for subfield_2 in datafield.findall('subfield[@code="2"]'))

def gnd_required(tag):
    """Tell whether the gnd test is relevant for the given tag

    Only the tags listed below are known. Any other tag raises a
    KeyError.

    :param tag: field tag as a string, for example '650'
    :return: True if gnd must be tested for this tag, False otherwise
    """
    tags_with_gnd_test = ('600', '610', '611', '648', '650', '651', '655')
    tags_without_gnd_test = ('502', '505', '780', '785')

    requirement_by_tag = dict.fromkeys(tags_without_gnd_test, False)
    requirement_by_tag.update(dict.fromkeys(tags_with_gnd_test, True))

    # Plain indexing on purpose: an unlisted tag raises KeyError
    return requirement_by_tag[tag]

def is_similar_field(datafield_957):
    """Check if the corresponding field to the 957
    is already available in the record

    The target tag is read from the first subfield $7 of the 957 whose
    text starts with three digits. The parent record is then searched
    for datafields with this tag. When gnd is required for the tag, only
    gnd datafields are counted.

    :param datafield_957: 957 datafield element, attached to its record
    :return: True if the record already contains a similar field
    :raises IndexError: if no subfield $7 of the 957 contains a tag
    """
    record = datafield_957.getparent()

    # Get the tag in the subfield $7. A regexp is useful because some other $7 are possible.
    # An empty subfield has a text of None, it must be skipped before the
    # regexp test, otherwise re.match raises a TypeError.
    tags = [subfield_7.text
            for subfield_7 in datafield_957.findall('subfield[@code="7"]')
            if subfield_7.text is not None
            and re.match(r'\d{3}', subfield_7.text) is not None]
    tag = tags[0]

    # Check existing datafield with the tag found in 957 $$7
    datafields = record.findall(f'datafield[@tag="{tag}"]')

    if gnd_required(tag):
        # Filter the gnd fields
        datafields = [datafield for datafield in datafields if is_gnd_field(datafield)]

    return len(datafields) > 0

def save_collection(collection):
    """Write the collection to the xml file of the 957 data to merge

    :param collection: root element to serialize
    """
    # Serialize first, the file is opened only once the data are ready
    serialized = etree.tostring(collection, pretty_print=True)
    output_path = './data/fields_957_data_to_merge.xml'
    with open(output_path, 'wb') as output_file:
        output_file.write(serialized)

In [15]:
# Build a collection of new records: one per source record that has at
# least one 957 field to copy. Each new record holds a 001, a 035 and
# the copied fields, and the collection is saved at the end.
file_path = './data/nz_records_to_update.xml'

# All columns are read as strings (dtype=str)
id_correspondance = pd.read_csv('./data/ABN_duplicated_records_ids_complete.csv', dtype=str)

data = XmlData(filepath=file_path)

collection = etree.Element('collection')

records = data.content.findall('record')

for i, record in enumerate(records):

    # Progress message every 10 records
    if (i+1) % 10 == 0:
        print('records: ', i+1, '/', len(records))

    # Useful to check if new fields have been added to the record
    new_addition_to_nz_record = False

    # Records missing from the correspondence file are skipped
    mms_id = record.find('controlfield[@tag="001"]').text
    if mms_id not in id_correspondance['ABN MMS ID'].values:
        print(f'{mms_id} not in correspondnace file')
        continue

    # The first matching row of the correspondence file is used
    nz_mms_id = id_correspondance.loc[id_correspondance['ABN MMS ID']==mms_id, 'Network Id'].values[0]
    abn035 = id_correspondance.loc[id_correspondance['ABN MMS ID']==mms_id, 'ABN035'].values[0]
    new_record = etree.Element('record')

    # controlfield 001
    controlfield001 = etree.Element('controlfield', tag='001')
    controlfield001.text = nz_mms_id
    new_record.append(controlfield001)

    # datafield 035
    datafield035 = etree.Element('datafield', tag='035', ind1=' ', ind2=' ')
    subfield = etree.Element('subfield', code='a')
    subfield.text = abn035
    datafield035.append(subfield)
    new_record.append(datafield035)

    # Copy of 957 fields

    # get all subfields 957 $$7 and keep only the records with tag
    for subfield_957_7 in record.findall('datafield[@tag="957"]/subfield[@code="7"]'):
        tag = subfield_957_7.text

        # An empty subfield has a text of None: skip it before the regexp
        # test, otherwise re.match raises a TypeError
        if tag is None or re.match(r'\d{3}', tag) is None:
            continue

        # Get the datafield 957
        datafield = subfield_957_7.getparent()

        # Check gnd, if gnd is required and the source data aren't gnd data we ignore them
        if gnd_required(tag) and is_gnd_field(datafield) is False:
            continue

        # Check if a similar field already exists (same tag as tag in 957 $$7)
        if is_similar_field(datafield) is False:

            # deep copy to make an independant copy of the field
            newfield = deepcopy(datafield)

            # Remove the $7 subfields containing a tag, other $7 are kept
            for subfield_7 in newfield.findall('subfield[@code="7"]'):
                if subfield_7.text is not None and re.match(r'\d{3}', subfield_7.text) is not None:
                    newfield.remove(subfield_7)

            # Remove the first $9 if there is one. find() returns None
            # when the field has no $9 and remove(None) raises a TypeError.
            subfield_9 = newfield.find('subfield[@code="9"]')
            if subfield_9 is not None:
                newfield.remove(subfield_9)

            # Replace 957 tag with the new value from $$7
            newfield.set('tag', tag)

            # Add the new datafield to the record
            new_record.append(newfield)

            new_addition_to_nz_record = True

    # We want only add records with new fields added
    if new_addition_to_nz_record is True:
        collection.append(new_record)

save_collection(collection)

records:  10 / 50
records:  20 / 50
records:  30 / 50
records:  40 / 50
records:  50 / 50
