# Some solution ideas for the projects of part 5

In [1]:
# Import libraries
from almasru.client import SruClient, SruRecord, IzSruRecord, SruRequest
from almasru.utils import check_removable_records, analyse_records
from almasru import config_log

import pandas as pd
import numpy as np
from typing import Tuple, Dict, List, Set, Optional

# Set up logging with almasru's `config_log` helper, called with its defaults
# (presumably the source of the INFO lines shown in the cell outputs - confirm in almasru)
config_log()

## Part 5: projects - Fetch analytical records

In [2]:
# Step 1: start from a record of the RZH institution zone (IZ) and count
# the child analytical records found in that IZ.
mms_id = '990009912310205526'
zone = '41SLSP_RZH'
SruClient.set_base_url(f'https://swisscovery.ch/view/sru/{zone}')
rec = IzSruRecord(mms_id)
iz_analytical = rec.get_child_analytical_records()
print('IZ RZH: ', len(iz_analytical))

# Step 2: repeat the lookup in the network zone (NZ), using the NZ MMS ID
# reported by the IZ record.
# NOTE: `mms_id`, `zone` and `rec` are rebound here, and the client base URL
# stays on the NZ once this cell has run.
mms_id = rec.get_nz_mms_id()
zone = '41SLSP_NETWORK'
SruClient.set_base_url(f'https://swisscovery.ch/view/sru/{zone}')
rec = SruRecord(mms_id)
nz_analytical = rec.get_child_analytical_records()
print('NZ: ', len(nz_analytical))

2025-09-09 06:23:46,455 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_RZH?query=alma.mms_id%3D990009912310205526&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:23:46,461 - INFO - Records 1 - 1 / 1, "alma.mms_id=990009912310205526": 1
2025-09-09 06:23:49,175 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_RZH?query=alma.other_system_number%3D%28NEBIS%29000991231EBI01&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=50
2025-09-09 06:23:49,187 - INFO - Records 1 - 21 / 21, "alma.other_system_number=(NEBIS)000991231EBI01": 21
2025-09-09 06:23:49,897 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_RZH?query=alma.other_system_number%3D991124328919705501&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=50
2025-09-09 06:23:51,073 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_RZH?query=alma.other_system_number%3D%3D%28swissbib%29217572502-41slsp_netwo

## Part 5: projects - Check removable records

In [3]:
# The MMS IDs below are looked up in the SLSP network zone. Set the base URL
# explicitly so this cell does not depend on the URL left behind by a
# previously executed cell.
SruClient.set_base_url('https://swisscovery.ch/view/sru/41SLSP_NETWORK')

# Records to evaluate for removal
mms_ids_to_check = [
    '991171228773105501', '991153648929705501', '991095338679705501',
    '991108597099705501', '991107511699705501', '991132863209705501',
    '991152338469705501', '991171357508305501', '991126902329705501',
    '991171337525005501', '991094843849705501', '991093934809705501',
    '991154177219705501', '991156446029705501', '991170675750105501',
    '991006488319705501', '991153029939705501', '991149212459705501',
    '991170647161305501', '991171866808105501', '991055945739705501',
]

# One row per MMS ID; the `removable` and `comment` columns summarise the verdict
df = check_removable_records(mms_ids_to_check)
df

2025-09-09 06:24:00,029 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_NETWORK?query=alma.mms_id%3D991171228773105501&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:24:00,036 - INFO - Records 1 - 1 / 1, "alma.mms_id=991171228773105501": 1
2025-09-09 06:24:00,742 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_NETWORK?query=alma.mms_id%3D991153648929705501&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:24:00,748 - INFO - Records 1 - 1 / 1, "alma.mms_id=991153648929705501": 1
2025-09-09 06:24:01,621 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_NETWORK?query=alma.mms_id%3D991095338679705501&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:24:01,627 - INFO - Records 1 - 1 / 1, "alma.mms_id=991095338679705501": 1
2025-09-09 06:24:02,446 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_NETWORK?query=alma.mms_id%3D

Unnamed: 0_level_0,removable,comment,bib_level,IZ_with_inventory,child_records,parent_records,fields_to_check,warning,error,additional_mms_id
mms_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
991171228773105501,True,REMOVABLE,m,,,,,,False,False
991153648929705501,True,REMOVABLE,m,,,,,,False,False
991095338679705501,False,Record used in at least one IZ,m,41SLSP_ETH|41SLSP_EPF,,,,,False,False
991108597099705501,False,Record used in at least one IZ,m,41SLSP_SUP,,,,,False,False
991107511699705501,False,Record used in at least one IZ,s,41SLSP_RZH|41SLSP_VGE|41SLSP_UBS,,,,,False,False
991132863209705501,True,REMOVABLE,m,,,,,,False,False
991152338469705501,False,Record used in at least one IZ,m,41SLSP_UZB|41SLSP_UNE|41SLSP_RZS|41SLSP_TRI|41...,,,,,False,False
991171357508305501,False,Record used in at least one IZ,m,41SLSP_UBE,,,,,False,False
991126902329705501,False,Parent record has inventory,a,,,9.910707121497056e+17,,,False,False
991171337525005501,False,Record used in at least one IZ,m,41SLSP_UZB|41SLSP_BCUFR,,,,,False,False


## Part 5: projects - Check inventory of libraries

In [4]:
# Import libraries
from almasru.client import SruClient, SruRecord, IzSruRecord, SruRequest
from almasru.utils import check_removable_records, analyse_records
from almasru import dedup
from almasru.briefrecord import BriefRecFactory, BriefRec
from almasru import config_log
from lxml import etree

# XML parser dropping blank text and comments.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True, ns_clean=True)

# Namespace prefixes used in the XPath expressions run against SRU / MARCXML data
ns = {
    "srw": "http://www.loc.gov/zing/srw/",
    "marc": "http://www.loc.gov/MARC21/slim"
}

# Config logs
config_log()

# Search the network zone for every record matching the standard number
zone = '41SLSP_NETWORK'
SruClient.set_base_url(f'https://swisscovery.ch/view/sru/{zone}')
std_num = '0020-7985'
records = SruRequest(f'alma.standard_number={std_num}').records

def has_record_std_num_in_02x(std_num, record):
    """Tell whether `std_num` appears in subfield $a of a 020 or 022 field.

    Hyphens are ignored on both sides of the comparison, so '0020-7985'
    matches '00207985'.

    Parameters
    ----------
    std_num : str
        Standard number to look for (ISBN or ISSN), with or without hyphens.
    record : object
        Record whose `data` attribute is the MARCXML element; it must support
        `findall(path, namespaces=...)`.

    Returns
    -------
    bool
        True if at least one 020$a or 022$a equals `std_num`, False otherwise.
    """
    # Local namespace map: keeps the helper independent of notebook globals
    marc_ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    wanted = std_num.replace('-', '')

    for tag in ('020', '022'):
        xpath = f'.//marc:datafield[@tag="{tag}"]/marc:subfield[@code="a"]'
        for subfield in record.data.findall(xpath, namespaces=marc_ns):
            # An empty subfield has `text` set to None: skip it instead of crashing
            if subfield.text is not None and subfield.text.replace('-', '') == wanted:
                return True
    return False

# Keep only the records that carry the number in 020$a / 022$a: the
# `alma.standard_number` search returned more candidates than that filter keeps.
records = [r for r in records if has_record_std_num_in_02x(std_num, r)]

# Map each IZ code to the set of library codes found in its inventory
result = dict()
for record in records:
    for iz in record.get_iz_using_rec():
        # setdefault: several NZ records can be used in the same IZ, so the
        # libraries already collected for that IZ must not be overwritten.
        libraries = result.setdefault(iz, set())
        iz_rec = record.get_iz_record(f'https://swisscovery.ch/view/sru/{iz}')
        for hol in iz_rec.get_inventory_info():
            libraries.add(hol['library'])

result

2025-09-09 06:24:45,324 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41SLSP_NETWORK?query=alma.standard_number%3D0020-7985&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:24:45,332 - INFO - Records 1 - 9 / 9, "alma.standard_number=0020-7985": 9
2025-09-09 06:24:45,339 - INFO - SruRecord('991170911588205501', base_url='https://swisscovery.ch/view/sru/41SLSP_NETWORK'): record used in IZ: 41BIG_INST
2025-09-09 06:24:45,342 - INFO - SruRecord('991170911588205501', base_url='https://swisscovery.ch/view/sru/41SLSP_NETWORK'): 1 records in IZ found
2025-09-09 06:24:46,839 - INFO - SRU data fetched: https://swisscovery.ch/view/sru/41BIG_INST?query=alma.mms_id%3D993900804101791&version=1.2&operation=searchRetrieve&startRecord=1&maximumRecords=10
2025-09-09 06:24:46,847 - INFO - Records 1 - 1 / 1, "alma.mms_id=993900804101791": 1
2025-09-09 06:24:46,850 - INFO - IzSruRecord('993900804101791', base_url='https://swisscovery.ch/view/sru/41BIG_INST'): 

{'41BIG_INST': {'BIG'},
 '41SLSP_RZH': {'E19'},
 '41SLSP_SBK': {'UZSBK', 'Z0SBK'},
 '41SLSP_UBS': {'A125'},
 '41SLSP_UGE': {'uge_mail'},
 '41SLSP_UNE': {'une_sfm'},
 '41SLSP_IID': {'iid'},
 '41SLSP_VGE': {'vge_bge'},
 '41SLSP_BCUFR': {'BCUF_BP2'}}

## Part 5: projects - Deduplication

In [5]:
# Import libraries
from dedupmarcxml.evaluate import evaluate_records_similarity, get_similarity_score
from dedupmarcxml.briefrecord import XmlBriefRec

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
def is_duplicated(rec1: XmlBriefRec, rec2: XmlBriefRec) -> Tuple[bool, float]:
    """
    Determine whether two MARC XML records are considered duplicates.

    Parameters
    ----------
    rec1 : XmlBriefRec
        First record, constructed from an etree.Element representing a MARC XML record.
    rec2 : XmlBriefRec
        Second record to compare, also constructed from an etree.Element.

    Returns
    -------
    Tuple[bool, float]
        A tuple containing:
        - A boolean indicating whether the records are considered duplicates.
        - A float representing the similarity score (≥ 0.5 indicates duplication).
    """
    score_detailed = evaluate_records_similarity(rec1, rec2)
    # Compute the global score once and reuse it for both the flag and the return value
    score = get_similarity_score(score_detailed, method='random_forest_general')
    return score >= 0.5, score

In [7]:
# Same material type (ISBN: 9782412077467)
# Fetch the Renouvaud record and build its brief record
SruClient.set_base_url('https://renouvaud.primo.exlibrisgroup.com/view/sru/41BCULAUSA_NETWORK')
r = SruRequest(query='alma.mms_id=991024372153702851')
bcul_rec = r.records[0] 
bcul_briefrec = XmlBriefRec(bcul_rec.data)
# Fetch the SLSP network zone record to compare with
# (the client base URL stays on SLSP after this cell)
SruClient.set_base_url('https://swisscovery.ch/view/sru/41SLSP_NETWORK')
r = SruRequest(query='alma.mms_id=991171135496005501')
slsp_rec = r.records[0] 
slsp_briefrec = XmlBriefRec(slsp_rec.data)
# Score the pair; `result` is the duplicate flag and `score` the similarity score
result, score = is_duplicated(slsp_briefrec, bcul_briefrec)
print(f'Is duplicated: {result} ({score})')

2025-09-09 06:25:16,396 - INFO - Records 1 - 1 / 1, "alma.mms_id=991024372153702851": 1
2025-09-09 06:25:16,406 - INFO - Records 1 - 1 / 1, "alma.mms_id=991171135496005501": 1
Is duplicated: True (0.8056666666666668)


In [8]:
# Print compared to e-version (ISBN: 9782412077467)
# Same Renouvaud record as in the previous comparison
SruClient.set_base_url('https://renouvaud.primo.exlibrisgroup.com/view/sru/41BCULAUSA_NETWORK')
r = SruRequest(query='alma.mms_id=991024372153702851')
bcul_rec = r.records[0] 
bcul_briefrec = XmlBriefRec(bcul_rec.data)
# This time the SLSP record is a different MMS ID (the online version)
SruClient.set_base_url('https://swisscovery.ch/view/sru/41SLSP_NETWORK')
r = SruRequest(query='alma.mms_id=991171940910205501')
slsp_rec = r.records[0] 
slsp_briefrec = XmlBriefRec(slsp_rec.data)
# NOTE: `bcul_briefrec` / `slsp_briefrec` set here are the ones printed in the next two cells
result, score = is_duplicated(slsp_briefrec, bcul_briefrec)
print(f'Is duplicated: {result} ({score})')

2025-09-09 06:25:16,436 - INFO - Records 1 - 1 / 1, "alma.mms_id=991024372153702851": 1
2025-09-09 06:25:16,442 - INFO - Records 1 - 1 / 1, "alma.mms_id=991171940910205501": 1
Is duplicated: False (0.3653333333333333)


In [9]:
# Show the brief record built from the Renouvaud record in the comparison cell above
print(bcul_briefrec)

{
    "rec_id": "991024372153702851",
    "format": {
        "type": "Book",
        "access": "Physical",
        "analytical": false,
        "f33x": "txt;n;nc"
    },
    "titles": [
        {
            "m": "Python pour la finance :",
            "s": "ma\u00eetriser la finance algorithmique /"
        }
    ],
    "short_titles": [
        "Python pour la finance :"
    ],
    "creators": [
        "Hilpisch, Yves"
    ],
    "corp_creators": null,
    "languages": [
        "fre"
    ],
    "extent": {
        "nb": [
            689,
            23
        ],
        "txt": "xxiii, 689 pages :"
    },
    "editions": null,
    "years": {
        "y1": [
            2022
        ],
        "y2": 2024
    },
    "publishers": [
        "First Interactive,",
        "O'Reilly"
    ],
    "series": null,
    "parent": null,
    "std_nums": [
        "9782412077467"
    ],
    "sys_nums": [
        "(RNV_B)0003003761"
    ]
}


In [10]:
# Show the brief record of the SLSP record from the last comparison cell executed
print(slsp_briefrec)

{
    "rec_id": "991171940910205501",
    "format": {
        "type": "Book",
        "access": "Online",
        "analytical": false,
        "f33x": "txt;c;cr"
    },
    "titles": [
        {
            "m": "Python pour la finance /",
            "s": ""
        }
    ],
    "short_titles": [
        "Python pour la finance /"
    ],
    "creators": [
        "Hilpisch, Yves J.,",
        "Engler, Olivier,"
    ],
    "corp_creators": null,
    "languages": [
        "fre"
    ],
    "extent": {
        "nb": [
            689,
            1
        ],
        "txt": "1 online resource (689 pages) :"
    },
    "editions": [
        {
            "nb": [
                2
            ],
            "txt": "Second edition."
        }
    ],
    "years": {
        "y1": [
            2022
        ]
    },
    "publishers": [
        "E\u0301ditions First,"
    ],
    "series": null,
    "parent": null,
    "std_nums": [
        "2412077460",
        "9782412077467"
    ],
    "sys_n

In [11]:
def get_ldr_6_and_7_req(document_type: str) -> str:
    """Build the SRU query suffix filtering on leader positions 6 and 7.

    The first character of `document_type` is used as `alma.type_of_record`
    and the second one as `alma.bib_level`. The returned text starts with an
    encoded " and " so it can be appended directly to an existing query.
    An empty string is returned when `document_type` is None.
    """
    if document_type is not None:
        type_of_record, bib_level = document_type[0], document_type[1]
        return f'%20and%20alma.type_of_record={type_of_record}%20and%20alma.bib_level={bib_level}'

    return ''

def get_requests(rec: BriefRec) -> Dict:
    """Build the SRU query strings used to find candidate duplicates of `rec`.

    Queries are grouped in two lists, "simple" and "extended". The idea is to
    use the "extended" queries only when no match is found with the "simple"
    ones.

    Parameters
    ----------
    rec : BriefRec
        Brief record whose `data` mapping provides the keys `titles`,
        `short_titles`, `std_nums`, `sys_nums`, `creators`, `corp_creators`
        and `years` (the layout printed for `XmlBriefRec` objects).

    Returns
    -------
    Dict
        `{'simple': [...], 'extended': [...]}`. Every query ends with the
        leader filter for document type 'am' and `alma.is_linked==0`.
    """
    data = rec.data

    # Create lists of requests
    reqs = {'simple': [],
            'extended': []}

    main_title = data["titles"][0]["m"]

    # Short title is used in various requests. It comes from `short_titles`:
    # the "s" key of `titles` holds the subtitle and may be empty. Fall back
    # to the main title when no short title is available.
    short_title_txt = (data['short_titles'] or [main_title])[0]
    short_title = f'alma.title="{short_title_txt}"'

    # First publication year, or None when unknown (missing or empty `y1`)
    years = data['years']
    pub_date = str(years['y1'][0]) if years is not None and years['y1'] else None

    # Personal creators are reduced to the part before the first comma
    creators = [creator.split(',')[0].strip() for creator in data['creators'] or []]
    corp_creators = list(data['corp_creators'] or [])

    # isbn requests: `std_nums` does not tell ISBN and ISSN apart,
    # so each number is tried with both indexes
    for std_num in data['std_nums'] or []:
        reqs['simple'].append(f'alma.isbn={std_num}')

    # issn requests
    for std_num in data['std_nums'] or []:
        reqs['simple'].append(f'alma.issn={std_num}')

    # sysnums
    for sysnum in data['sys_nums'] or []:
        reqs['simple'].append(f'alma.other_system_number={sysnum}')

    # title
    reqs['simple'].append(f'alma.title="{main_title}"')

    # short title + creator
    for creator in creators:
        reqs['extended'].append(f'{short_title}%20and%20alma.creator="{creator}"')

    # short title + organisation
    for creator in corp_creators:
        reqs['extended'].append(f'{short_title}%20and%20alma.creator="{creator}"')

    if pub_date is not None:
        # date + creator
        for creator in creators:
            reqs['extended'].append(f'alma.main_pub_date={pub_date}%20and%20alma.creator="{creator}"')

        # date + organisation
        for creator in corp_creators:
            reqs['extended'].append(f'alma.main_pub_date={pub_date}%20and%20alma.creator="{creator}"')

    # short title
    reqs['extended'].append(short_title)

    # general keywords: use the bare title text (not the whole `alma.title=`
    # clause) and keep the quotes balanced
    if pub_date is not None:
        reqs['extended'].append(f'alma.all_for_ui="{short_title_txt}"%20and%20alma.all_for_ui="{pub_date}"')

    # add filter on the format and on `is_linked` for each request
    suffix = get_ldr_6_and_7_req('am') + '%20and%20alma.is_linked==0'
    return {group: [query + suffix for query in queries] for group, queries in reqs.items()}


def get_records_from_requests(reqs: List[str], limit: int) -> Set['BriefRec']:
    """Run every SRU query of `reqs` and return the results as brief records.

    Each query is sent with `SruClient().fetch_records(query, limit)`. The
    fetched records are accumulated in a set, so a record returned by several
    queries is kept once (this relies on how the record objects hash and
    compare). Each remaining record is then wrapped in an `XmlBriefRec`
    built from its `data` attribute.
    """
    fetched = set()

    for query in reqs:
        response = SruClient().fetch_records(query, limit)
        fetched.update(response.records)

    return {XmlBriefRec(item.data) for item in fetched}

In [12]:
# Look up the Renouvaud record by ISBN and display the MMS ID of the first hit
SruClient.set_base_url('https://renouvaud.primo.exlibrisgroup.com/view/sru/41BCULAUSA_NETWORK')
r = SruRequest(query='alma.isbn=9782412077467')
bcul_rec = r.records[0]
bcul_rec.mms_id

2025-09-09 06:25:16,521 - INFO - Records 1 - 1 / 1, "alma.isbn=9782412077467": 1


'991024372153702851'

In [16]:
def fetch_duplicated_in_slsp(mms_id: str) -> Tuple[bool, Optional[str], float]:
    """Search the SLSP NZ for the record most similar to a Renouvaud record

    Only the "simple" requests built by `get_requests` are used to collect
    candidates. The SRU client base URL is left on the SLSP NZ afterwards.

    Parameter:
    ----------
    mms_id: str containing the mms_id of the Renouvaud record

    Returns:
    --------
    Tuple (duplicated, slsp_mms_id, max_score):
    - duplicated: True when the best score is at least 0.5
    - slsp_mms_id: `rec_id` of the best scoring candidate, also set when the
      score is below 0.5; None when no candidate was scored
    - max_score: best score found, 0 when no candidate was scored
    """
    # Source record: fetched while the client points to the Renouvaud NZ
    SruClient.set_base_url('https://renouvaud.primo.exlibrisgroup.com/view/sru/41BCULAUSA_NETWORK')
    source_brief = XmlBriefRec(SruRecord(mms_id).data)

    # Candidates: fetched from the SLSP NZ
    SruClient.set_base_url('https://swisscovery.ch/view/sru/41SLSP_NETWORK')
    candidates = get_records_from_requests(get_requests(source_brief)['simple'], limit=500)

    found = False
    best_id = None
    best_score = 0
    for candidate in candidates:
        _, candidate_score = is_duplicated(source_brief, candidate)
        # `>=` on purpose: on equal scores the candidate seen last wins
        if candidate_score >= best_score:
            best_id, best_score = candidate.data['rec_id'], candidate_score
            if best_score >= 0.5:
                found = True

    return (found, best_id, best_score)

# Example: look for an SLSP duplicate of the Renouvaud record used in the cells above
bcul_mms_id = '991024372153702851'
fetch_duplicated_in_slsp(bcul_mms_id)

(True, '991171135496005501', np.float64(0.8056666666666668))