# Python training: some basics 4
## lxml introduction 

In [75]:
from lxml import etree
from copy import deepcopy

In [76]:
# Always this way, to load data!
with open('BIBLIOGRAPHIC_51683500040005501_1.xml', 'r') as f:
    data = etree.parse(f, parser=etree.XMLParser(remove_blank_text=True))
xml = data.getroot()

In [77]:
print(etree.tostring(data, pretty_print=True).decode())

<collection>
  <record>
    <leader>01489nam a2200349 c 4500</leader>
    <controlfield tag="001">991171012127005501</controlfield>
    <controlfield tag="005">20230131165600.0</controlfield>
    <controlfield tag="007">tu</controlfield>
    <controlfield tag="008">220511s2023    gw ||||| |||| 00||||ger  </controlfield>
    <datafield tag="015" ind1=" " ind2=" ">
      <subfield code="a">22,N20</subfield>
      <subfield code="2">dnb</subfield>
    </datafield>
    <datafield tag="016" ind1="7" ind2=" ">
      <subfield code="2">DE-101</subfield>
      <subfield code="a">125725149X</subfield>
    </datafield>
    <datafield tag="020" ind1=" " ind2=" ">
      <subfield code="a">9783446472310</subfield>
      <subfield code="q">Print</subfield>
      <subfield code="c">circa EUR 39.99 (DE), circa EUR 41.20 (AT)</subfield>
    </datafield>
    <datafield tag="020" ind1=" " ind2=" ">
      <subfield code="a">3446472312</subfield>
    </datafield>
    <datafield tag="035" ind1=" " ind2=" ">

In [78]:
# Get the tag of the element
xml.tag

'collection'

In [79]:
# get the 5th record
xml[4]

<Element record at 0x7fe670650640>

In [80]:
# Get all the records in a list
records = xml.findall('record')
records

[<Element record at 0x7fe67062d340>,
 <Element record at 0x7fe6705629c0>,
 <Element record at 0x7fe670562840>,
 <Element record at 0x7fe670562680>,
 <Element record at 0x7fe670650640>,
 <Element record at 0x7fe670562540>,
 <Element record at 0x7fe670562c00>,
 <Element record at 0x7fe670562800>,
 <Element record at 0x7fe670562880>,
 <Element record at 0x7fe670562b00>,
 <Element record at 0x7fe6706b1f00>,
 <Element record at 0x7fe670562b40>,
 <Element record at 0x7fe670562bc0>,
 <Element record at 0x7fe670562cc0>,
 <Element record at 0x7fe670562b80>]

In [81]:
# Get 6th field of 4th record
records[3][5]

<Element datafield at 0x7fe67065dac0>

In [82]:
# Use find to get the first matching element => good in this case each record has only one controlfield 001
for record in records:
    print(record.find('controlfield[@tag="001"]').text)

991171012127005501
991171016129705501
991171021422205501
991171033010205501
991171056243005501
991171104173205501
991171105403405501
991171113493405501
991171113616005501
991171118031905501
991171120270805501
991171120406805501
991171130508305501
991171137585305501
991171141888005501


In [83]:
# Use findall to get a list of all matching elements
for record in records:
    datafields = record.findall('datafield')
    for datafield in datafields:
        # Use get to get an attribute value
        print('Tag: ', datafield.get('tag'))
        
        # to change an attribute, two possibilities
        datafield.set('ind1', '1')
        datafield.attrib['ind2'] = '3'
        
        # Use attr to get all atributes as dictionary
        print('All attributes: ', datafield.attrib)
        

Tag:  015
All attributes:  {'tag': '015', 'ind1': '1', 'ind2': '3'}
Tag:  016
All attributes:  {'tag': '016', 'ind1': '1', 'ind2': '3'}
Tag:  020
All attributes:  {'tag': '020', 'ind1': '1', 'ind2': '3'}
Tag:  020
All attributes:  {'tag': '020', 'ind1': '1', 'ind2': '3'}
Tag:  035
All attributes:  {'tag': '035', 'ind1': '1', 'ind2': '3'}
Tag:  035
All attributes:  {'tag': '035', 'ind1': '1', 'ind2': '3'}
Tag:  040
All attributes:  {'tag': '040', 'ind1': '1', 'ind2': '3'}
Tag:  044
All attributes:  {'tag': '044', 'ind1': '1', 'ind2': '3'}
Tag:  100
All attributes:  {'tag': '100', 'ind1': '1', 'ind2': '3'}
Tag:  245
All attributes:  {'tag': '245', 'ind1': '1', 'ind2': '3'}
Tag:  264
All attributes:  {'tag': '264', 'ind1': '1', 'ind2': '3'}
Tag:  300
All attributes:  {'tag': '300', 'ind1': '1', 'ind2': '3'}
Tag:  336
All attributes:  {'tag': '336', 'ind1': '1', 'ind2': '3'}
Tag:  337
All attributes:  {'tag': '337', 'ind1': '1', 'ind2': '3'}
Tag:  338
All attributes:  {'tag': '338', 'ind1'

In [84]:
for record in records:
    # ".//" means that we look for a field in all children elements (not only direct children)
    print(record.find('.//subfield').text)

22,N20
9783527719952
GBC244409
  2022004761
9783031137136
  2022026521
GBC2K0036
22,N47
9783527718498
GBC2K4879
GBC2K4887
  2022037925
CH-ZuSLS HESSO HEPIA
  2022056310
9783031176456


In [85]:
# Create a new node
new_field = etree.XML('''<datafield tag="500" ind1=" " ind2=" ">
                             <subfield code="a">TEST</subfield>
                         </datafield>''')
new_field

<Element datafield at 0x7fe670649080>

In [86]:
print(etree.tostring(new_field, pretty_print=True).decode())

<datafield tag="500" ind1=" " ind2=" ">
                             <subfield code="a">TEST</subfield>
                         </datafield>



In [87]:
for record in records:
    # Important to not add the same node several times => use deepcopy or create several times the node
    new_copy = deepcopy(new_field)
    
    # Use append to add the field at the last child
    record.append(new_copy)
    
    # Use this line to sort fields / please do not sort automatic subfields => order of subfields matters!
    record[:] = sorted(record, key=lambda field_or_contr: field_or_contr.get('tag', '000'))
    

In [88]:
print(etree.tostring(records[2], pretty_print=True).decode())

<record>
  <leader>01550nam a2200409 c 4500</leader>
  <controlfield tag="001">991171021422205501</controlfield>
  <controlfield tag="005">20230201101208.0</controlfield>
  <controlfield tag="008">220307s2023    xxk           000 0 eng d</controlfield>
  <datafield tag="015" ind1="1" ind2="3">
    <subfield code="a">GBC244409</subfield>
    <subfield code="2">bnb</subfield>
  </datafield>
  <datafield tag="016" ind1="1" ind2="3">
    <subfield code="a">020518196</subfield>
    <subfield code="2">Uk</subfield>
  </datafield>
  <datafield tag="020" ind1="1" ind2="3">
    <subfield code="a">9781292424125</subfield>
    <subfield code="q">pbk.</subfield>
  </datafield>
  <datafield tag="020" ind1="1" ind2="3">
    <subfield code="a">1292424125</subfield>
  </datafield>
  <datafield tag="035" ind1="1" ind2="3">
    <subfield code="a">(OCoLC)1308489142</subfield>
  </datafield>
  <datafield tag="040" ind1="1" ind2="3">
    <subfield code="a">UKMGB</subfield>
    <subfield code="b">ger</subfi