In [3]:
pip install sickle

Defaulting to user installation because normal site-packages is not writeable
Collecting sickle
  Downloading Sickle-0.7.0-py3-none-any.whl (12 kB)
Installing collected packages: sickle
Successfully installed sickle-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import time
from sickle import Sickle

# Define the OAI-PMH endpoint
url = 'https://koha.adminkuhn.ch/cgi-bin/koha/oai.pl'

# Create a Sickle instance
sickle = Sickle(url)

# Harvest records with the specified metadata prefix
records = sickle.ListRecords(metadataPrefix='marc21')

# Create the output directory if it doesn't exist
output_dir = 'test-koha'
os.makedirs(output_dir, exist_ok=True)

# Save each record to its own file, named
# <unix timestamp>_KOHA_OAI_TEST_<record id>.xml
for record in records:
    # Whole-second timestamp taken when the record is written
    timestamp = int(time.time())
    # Keep only the part of the OAI identifier after the last ':'
    unique_id = record.header.identifier.split(':')[-1]
    # The identifier is supplied by the server: replace '/' so it cannot be
    # interpreted as a sub-directory of output_dir when the file is opened
    sanitized_unique_id = unique_id.replace('/', '_')
    file_name = f"{timestamp}_KOHA_OAI_TEST_{sanitized_unique_id}.xml"
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        # record.raw is written unchanged
        file.write(record.raw)

In [2]:
import os
import time
from sickle import Sickle

# Harvest configuration: OAI-PMH endpoint, output folder, metadata format
# and the set the harvest is restricted to
url = 'https://slsp-hph-psb.alma.exlibrisgroup.com/view/oai/41SLSP_HPH/request'
output_dir = 'test-alma'
metadata_prefix = 'marc21'
set_spec = 'SLSP_FHGR'

sickle = Sickle(url)
# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

records = sickle.ListRecords(metadataPrefix=metadata_prefix, set=set_spec)

# Defined up front so the summary below also works if the loop body never runs
record_count = 0

# Save each record to its own file, named
# <unix timestamp>_ALMA_OAI_TEST_<record id>.xml
for record_count, record in enumerate(records, start=1):
    # Whole-second timestamp taken when the record is written
    timestamp = int(time.time())
    # Keep only the part of the OAI identifier after the last ':'
    unique_id = record.header.identifier.split(':')[-1]
    # The identifier is supplied by the server: replace '/' so it cannot be
    # interpreted as a sub-directory of output_dir when the file is opened
    sanitized_unique_id = unique_id.replace('/', '_')
    file_name = f"{timestamp}_ALMA_OAI_TEST_{sanitized_unique_id}.xml"
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        # record.raw is written unchanged
        file.write(record.raw)

# One summary line instead of per-record output
print(f"Harvesting completed: {record_count} records saved to {output_dir}")

KeyboardInterrupt: 

In [1]:
import os
import time
from sickle import Sickle

# OAI-PMH endpoint to harvest from
url = 'https://sandbox.archivesspace.org/oai'

# Sickle client bound to that endpoint
sickle = Sickle(url)

# Folder that receives one XML file per record; reused if it already exists
output_dir = 'test-archivesspace'
os.makedirs(output_dir, exist_ok=True)

# Request the records in the 'oai_ead' metadata format
print("Starting to harvest records...")
records = sickle.ListRecords(metadataPrefix='oai_ead')

for record in records:
    # Whole-second timestamp used as the file-name prefix
    timestamp = int(time.time())
    # Part of the identifier after the last ':', with every '/' turned into
    # '_' so the value is usable inside a single file name
    sanitized_unique_id = record.header.identifier.rsplit(':', 1)[-1].replace('/', '_')
    file_name = f"{timestamp}_ARCHIVESSPACE_OAI_{sanitized_unique_id}.xml"
    with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as xml_file:
        xml_file.write(record.raw)
    print(f"Saved record as {file_name}")

print("Harvesting completed successfully.")

Starting to harvest records...
Saved record as 1735549041_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_2.xml
Saved record as 1735549042_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_3.xml
Saved record as 1735549043_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_4.xml
Saved record as 1735549045_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_5.xml
Saved record as 1735549046_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_6.xml
Saved record as 1735549048_ARCHIVESSPACE_OAI_archivesspace__repositories_2_resources_7.xml
Saved record as 1735549049_ARCHIVESSPACE_OAI_archivesspace__repositories_4_resources_8.xml
Saved record as 1735549050_ARCHIVESSPACE_OAI_archivesspace__repositories_4_resources_9.xml
Saved record as 1735549052_ARCHIVESSPACE_OAI_archivesspace__repositories_4_resources_10.xml
Saved record as 1735549053_ARCHIVESSPACE_OAI_archivesspace__repositories_4_resources_11.xml
Saved record as 1735549054_ARCHIVESSPACE_OAI_archivesspac

In [14]:
import os
import time
from sickle import Sickle

# OAI-PMH endpoint to harvest from
url = 'https://demo.dspace.org/oai/request'

# Sickle client bound to that endpoint
sickle = Sickle(url)

# Folder that receives one XML file per record; reused if it already exists
output_dir = 'test-dspace'
os.makedirs(output_dir, exist_ok=True)

# Request the records in the 'oai_dc' metadata format
print("Starting to harvest records...")
records = sickle.ListRecords(metadataPrefix='oai_dc')

for record in records:
    # Whole-second timestamp used as the file-name prefix
    timestamp = int(time.time())
    # Last ':'-separated piece of the record identifier
    id_tail = record.header.identifier.split(':')[-1]
    # Re-join the '/'-separated pieces with '_' so no '/' is left in the name
    sanitized_unique_id = '_'.join(id_tail.split('/'))
    file_name = f"{timestamp}_DSPACE_OAI_{sanitized_unique_id}.xml"
    target_path = os.path.join(output_dir, file_name)
    with open(target_path, 'w', encoding='utf-8') as xml_file:
        xml_file.write(record.raw)
    print(f"Saved record as {file_name}")

print("Harvesting completed successfully.")

Starting to harvest records...
Saved record as 1735315236_DSPACE_OAI_10673_2038.xml
Saved record as 1735315236_DSPACE_OAI_10673_1883.xml
Saved record as 1735315236_DSPACE_OAI_10673_2126.xml
Saved record as 1735315236_DSPACE_OAI_10673_4832.xml
Saved record as 1735315236_DSPACE_OAI_10673_5481.xml
Saved record as 1735315236_DSPACE_OAI_10673_6056.xml
Saved record as 1735315236_DSPACE_OAI_10673_4734.xml
Saved record as 1735315236_DSPACE_OAI_10673_6955.xml
Saved record as 1735315236_DSPACE_OAI_10673_6090.xml
Saved record as 1735315236_DSPACE_OAI_10673_5124.xml
Saved record as 1735315236_DSPACE_OAI_10673_2195.xml
Saved record as 1735315236_DSPACE_OAI_10673_6947.xml
Saved record as 1735315236_DSPACE_OAI_10673_6612.xml
Saved record as 1735315236_DSPACE_OAI_10673_6936.xml
Saved record as 1735315236_DSPACE_OAI_10673_6143.xml
Saved record as 1735315236_DSPACE_OAI_10673_5338.xml
Saved record as 1735315236_DSPACE_OAI_10673_6732.xml
Saved record as 1735315236_DSPACE_OAI_10673_6752.xml
Saved record as