## simple script to grab files and rename them from the S3 bepress dump  

NOTE: must have boto3 module installed in environment

In [1]:
import boto3

In [2]:
authfile = open("credentials.txt", "r")
[ACCESS_KEY,SECRET_KEY] = authfile.readline().strip().split('|')
authfile.close()
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY
)

In [3]:
top_level_folders = s3.list_objects(
    Bucket='bepressarchivemiami',
    Prefix='archive/scholarlyrepository.miami.edu/',
    Delimiter='/'
)
top_level_folders

{'ResponseMetadata': {'RequestId': 'A171B253E7207956',
  'HostId': '/Vvm6C15uxShoQd/tVTS9pr4v/D3oUCeXef0W8e5h8kt+VJdQDduwt85CBwV4y/yFgHI21IpiTM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '/Vvm6C15uxShoQd/tVTS9pr4v/D3oUCeXef0W8e5h8kt+VJdQDduwt85CBwV4y/yFgHI21IpiTM=',
   'x-amz-request-id': 'A171B253E7207956',
   'date': 'Fri, 20 Sep 2019 19:58:00 GMT',
   'x-amz-bucket-region': 'us-west-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'IsTruncated': False,
 'Marker': '',
 'Contents': [{'Key': 'archive/scholarlyrepository.miami.edu/archive_manifest.json',
   'LastModified': datetime.datetime(2019, 9, 20, 15, 32, 14, tzinfo=tzutc()),
   'ETag': '"f8f3ad9038a3f442601ed4fff9b99f32"',
   'Size': 1734411,
   'StorageClass': 'STANDARD',
   'Owner': {'DisplayName': 'aws-rap-bepress',
    'ID': '90207bf9eb615ad174ea84170cdb7518fb63f45b5ac8b9c1d760254b2a7948c5'}}],
 'Name': 'bepressarchivemiami',
 'Pref

In [29]:
###
 # Function to recurse over folders and list content as per rules we define
 ##

def explore_folder(folder_prefix):
    
    to_ignore = [
        'archive/scholarlyrepository.miami.edu/anthurium/',
        'archive/scholarlyrepository.miami.edu/collaborations/',
        'archive/scholarlyrepository.miami.edu/collaborations_slideshow/'
    ]
    
    if folder_prefix in to_ignore:
        return
    
    print (folder_prefix)
    objects = s3.list_objects(
        Bucket='bepressarchivemiami',
        Prefix=folder_prefix,
        Delimiter='/'
    )
    
    if 'CommonPrefixes' in objects: # there are more folders
        
        folders = objects['CommonPrefixes']
        if objects['IsTruncated']: # make sure to get all pages (max 1000 per page)
            kwargs = {
                'Bucket': 'bepressarchivemiami',
                'Prefix': folder_prefix,
                'Delimiter': '/',
                'Marker': objects['NextMarker']
            }
            while True:
                objects = s3.list_objects(**kwargs)
                for folder in objects['CommonPrefixes']:
                    folders.append(folder)
                    
                try:
                    kwargs['Marker'] = objects['NextMarker']
                except KeyError:
                    break
                    
        for folder in folders:
            explore_folder(folder['Prefix']) # recursive call to step throught folder structure
            
    else: # we are at the content level, print the list of files
        internal_files = s3.list_objects(Bucket='bepressarchivemiami',Prefix=folder_prefix)
        for file in internal_files['Contents']:
            # this next line os for ETDs and Theses, it will be different for other folders
            if not ('stamped' in file['Key'] or 'metadata' in file['Key']):
                path = file['Key'].split('/')
                filename = path[len(path)-1]
                extension = filename.split('.')[-1]
                foldernumber = path[len(path)-2]
                path = "/".join(path[:-2]) + "/"
                print (filename, "=>",path+foldernumber+"."+extension)
    
    print()

In [47]:
# call the function on some folder. Note: this could be run at the top level, but it will take a while ...
explore_folder('archive/scholarlyrepository.miami.edu/iccaspapers/2/')

archive/scholarlyrepository.miami.edu/iccaspapers/2/
scp00010000030001001.pdf => archive/scholarlyrepository.miami.edu/iccaspapers/2.pdf



## step through OAI data from bepress and restructure

In [230]:
import xml.etree.ElementTree as ET
with open('20190830_bepressOAI.xml', 'r') as xml_file:
    rootDocs = ET.parse(xml_file)

In [231]:
directories = {}
to_ignore = [
        'anthurium',
        'collaborations',
        'collaborations_slideshow'
    ]
documents = rootDocs.find('{http://www.openarchives.org/OAI/2.0/}ListRecords').findall('{http://www.openarchives.org/OAI/2.0/}record')
for document in documents:
    container = document.find('{http://www.openarchives.org/OAI/2.0/}metadata').find('document-export').find('documents').find('document')
    path = container.find('submission-path').text
    folders = path.split('/')
    if folders[0] in directories:
        directories[folders[0]]['count'] += 1
    else:
        if not folders[0] in to_ignore:
            directories[folders[0]] = {}
            directories[folders[0]]['count'] = 1
            directories[folders[0]]['items'] = []
    
    if not len(folders) == 2: # simple numeric file structure
        print (folders[0])
    else:
        item = {
            'title': ''.join(e for e in container.find('title').text.split() if e.isalnum()),
            'year': container.find('publication-date').text[0:4],
            'author': ''
        }
        try:
            item['author'] = container.find('authors').find('author').find('lname').text
        except:
            try:
                item['author'] = container.find('authors').find('author').find('organization').text
            except:     
                try:
                    item['author'] = container.find('authors').find('author').find('institution').text
                except:
                    print ('woof')
            
        directories[folders[0]]['items'].append(item)

In [232]:
directories

{'iccaspapers': {'count': 31,
  'items': [{'title': 'andtheEconomyTheDismalStateofCubanStudies',
    'year': '1996',
    'author': 'George'},
   {'title': 'EnvironmentalTechnologyTransferandForeignInvestmentFactorsImpactingEnvironmentalProtectioninaCuba',
    'year': '1999',
    'author': 'Leiva'},
   {'title': 'TheVisittoCubaanditsAftermath',
    'year': '1999',
    'author': 'Clark'},
   {'title': 'CubaintheMiddleEastABriefChronology',
    'year': '1999',
    'author': 'Amuchástegui'},
   {'title': 'PoliticalPilgrimageto', 'year': '1996', 'author': 'Horowitz'},
   {'title': 'InstituteforCubanandSecondAnnualReport',
    'year': '2001',
    'author': 'Institute for Cuban and Cuban-American Studies'},
   {'title': 'TheOrTotalitarian', 'year': '2005', 'author': 'Yánez'},
   {'title': 'Laalianzaoexpansióndel', 'year': '2005', 'author': 'Yánez'},
   {'title': 'ElEmbargodelosEstadosUnidosHaciaCuba',
    'year': '2002',
    'author': 'Suchlicki'},
   {'title': 'ElServiciodeInteligenciaCastri

## get the OAI feed

In [69]:
import requests
import xml.etree.ElementTree as ET
import datetime

page = 1
print ('page:',page)

ET.register_namespace('oai',"http://www.openarchives.org/OAI/2.0/")

url = "https://scholarlyrepository.miami.edu/do/oai/?verb=ListRecords"
response = requests.get(url+"&metadataPrefix=document-export&set=publication:um_research_publications")

allXML = ET.fromstring(response.content.decode('utf-8'))

documentlist = allXML.find('{http://www.openarchives.org/OAI/2.0/}ListRecords')
resume = documentlist.find('{http://www.openarchives.org/OAI/2.0/}resumptionToken')
documentlist.remove(resume)
resume = resume.text

while not resume == None:
    page += 1
    print ('page:',page)
    response = requests.get(url+'&resumptionToken='+resume)
    pageXML = ET.fromstring(response.content.decode('utf-8'))
    pagelist = pageXML.find('{http://www.openarchives.org/OAI/2.0/}ListRecords')
    resume = pagelist.find('{http://www.openarchives.org/OAI/2.0/}resumptionToken')
    pagelist.remove(resume)
    resume = resume.text
    documentlist.extend(pagelist)
    
root = ET.ElementTree(allXML)
today = datetime.datetime.now().isoformat()[0:10].replace('-', '')
root.write(today+'_bepressOAI.xml',encoding="utf-8",xml_declaration=True)

page: 1


AttributeError: 'NoneType' object has no attribute 'find'

In [70]:
documentlist

## get OAI and modify filenames as we go

In [25]:
import requests
import xml.etree.ElementTree as ET
import datetime

output_directory = "pages/"

def updateXML(doclist):
    for record in doclist.findall('{http://www.openarchives.org/OAI/2.0/}record'):
        document = record.find(
            '{http://www.openarchives.org/OAI/2.0/}metadata/document-export/documents/document'
        )
        path = 'archive/scholarlyrepository.miami.edu/' + document.find('submission-path').text + '/'
        internal_files = s3.list_objects(Bucket='bepressarchivemiami',Prefix=path)
        restricted = False
        doc_type = document.find('document-type').text
        pub_title = document.find('publication-title').text
        if doc_type == 'restricted': 
            restricted = True; 
        elif doc_type == 'withheld':
            got_date = False
            date = ""
            for field in document.findall('fields/field'):
                field_type = field.get('name')
                if field_type == "embargo_date":
                    date = field.find('value').text
                    if date > datetime.datetime.today().strftime('%Y-%m-%d'):
                        restricted = True
                        got_date = True
        elif pub_title == "Master of Fine Arts Creative Writing Theses" or \
            pub_title == "Internship Reports (Restricted)” is permanently UM campus only (RSMAS Internship Reports)" or \
            pub_title == "Scriptwriting Senior Projects (Restricted)" or \
            pub_title == "Archived Data Sets (Restricted)":
            restricted = True; 
        for file in internal_files['Contents']:
            # this next line os for ETDs and Theses, it may be different for other folders
            if not ('stamped' in file['Key'] or 'metadata' in file['Key']):
                filename = file['Key'].split('/')[-1]
                if filename == '':
                    print ('ERROR (no file) for ' + path)
                # as requested, only provide new element for restricted items
                if restricted:
                    s3_path = ET.SubElement(document,'fulltext-s3-path')
                    s3_path.text = path + filename

page = 1
print ('page:',page)

ET.register_namespace('oai',"http://www.openarchives.org/OAI/2.0/")

url = "https://scholarlyrepository.miami.edu/do/oai/?verb=ListRecords"
response = requests.get(url+"&metadataPrefix=document-export&set=publication:um_research_publications")

allXML = ET.fromstring(response.content.decode('utf-8'))

documentlist = allXML.find('{http://www.openarchives.org/OAI/2.0/}ListRecords')
resume = documentlist.find('{http://www.openarchives.org/OAI/2.0/}resumptionToken')
documentlist.remove(resume)
resume = resume.text

updateXML(documentlist)
root = ET.ElementTree(allXML)
today = datetime.datetime.now().isoformat()[0:10].replace('-', '')
root.write(output_directory+today+'_OAI_page'+str(page)+'.xml',encoding="utf-8",xml_declaration=True)

while not resume == None:
    page += 1
    print ('page:',page)
    response = requests.get(url+'&resumptionToken='+resume)
    pageXML = ET.fromstring(response.content.decode('utf-8'))
    pagelist = pageXML.find('{http://www.openarchives.org/OAI/2.0/}ListRecords')
    resume = pagelist.find('{http://www.openarchives.org/OAI/2.0/}resumptionToken')
    pagelist.remove(resume)
    resume = resume.text
    updateXML(pagelist)
    root = ET.ElementTree(pageXML)
    root.write(output_directory+today+'_OAI_page'+str(page)+'.xml',encoding="utf-8",xml_declaration=True)
    documentlist.extend(pagelist)
    
root = ET.ElementTree(allXML)
root.write(output_directory+today+'_OAI_bepress.xml',encoding="utf-8",xml_declaration=True)

page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
page: 13
page: 14
page: 15
page: 16
page: 17
page: 18
page: 19
page: 20
page: 21
page: 22
page: 23
page: 24
page: 25
page: 26
page: 27
page: 28
page: 29
page: 30
page: 31
page: 32
page: 33
page: 34
page: 35
page: 36
page: 37
page: 38
page: 39
page: 40
page: 41
page: 42
page: 43
page: 44
page: 45
page: 46
page: 47
page: 48


KeyError: 'Contents'

In [65]:
root.write(today+'_OAI_page'+str(page)+'.xml',encoding="utf-8")

In [9]:
pagelist

<Element '{http://www.openarchives.org/OAI/2.0/}ListRecords' at 0x1113b7ef0>

In [26]:
for record in pagelist.findall('{http://www.openarchives.org/OAI/2.0/}record'):
    document = record.find(
        '{http://www.openarchives.org/OAI/2.0/}metadata/document-export/documents/document'
    )
    path = 'archive/scholarlyrepository.miami.edu/' + document.find('submission-path').text + '/'
    internal_files = s3.list_objects(Bucket='bepressarchivemiami',Prefix=path)
    restricted = False
    doc_type = document.find('document-type').text
    pub_title = document.find('publication-title').text
    if doc_type == 'restricted': 
        restricted = True; 
    elif doc_type == 'withheld':
        got_date = False
        date = ""
        for field in document.findall('fields/field'):
            field_type = field.get('name')
            if field_type == "embargo_date":
                date = field.find('value').text
                if date > datetime.datetime.today().strftime('%Y-%m-%d'):
                    restricted = True
                    got_date = True
    elif pub_title == "Master of Fine Arts Creative Writing Theses" or \
        pub_title == "Internship Reports (Restricted)” is permanently UM campus only (RSMAS Internship Reports)" or \
        pub_title == "Scriptwriting Senior Projects (Restricted)" or \
        pub_title == "Archived Data Sets (Restricted)":
        restricted = True; 
    for file in internal_files['Contents']:
        # this next line os for ETDs and Theses, it may be different for other folders
        if not ('stamped' in file['Key'] or 'metadata' in file['Key']):
            filename = file['Key'].split('/')[-1]
            if filename == '':
                print ('ERROR (no file) for ' + path)
            # as requested, only provide new element for restricted items
            if restricted:
                s3_path = ET.SubElement(document,'fulltext-s3-path')
                s3_path.text = path + filename

KeyError: 'Contents'

In [27]:
internal_files

{'ResponseMetadata': {'RequestId': '84FF9CF627FDE4E1',
  'HostId': 'gH49zJ0quJezKPUzKaOFPs2eYsn+zDl2DiqGTSZXeuHNCxiG+hLApBaAj82JvMI0m07euEU0VvE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'gH49zJ0quJezKPUzKaOFPs2eYsn+zDl2DiqGTSZXeuHNCxiG+hLApBaAj82JvMI0m07euEU0VvE=',
   'x-amz-request-id': '84FF9CF627FDE4E1',
   'date': 'Fri, 20 Sep 2019 21:15:47 GMT',
   'x-amz-bucket-region': 'us-west-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'IsTruncated': False,
 'Marker': '',
 'Name': 'bepressarchivemiami',
 'Prefix': 'archive/scholarlyrepository.miami.edu/oa_dissertations/2329/',
 'MaxKeys': 1000,
 'EncodingType': 'url'}

In [28]:
path

'archive/scholarlyrepository.miami.edu/oa_dissertations/2329/'

In [33]:
explore_folder('archive/scholarlyrepository.miami.edu/oa_dissertations/2330/')

archive/scholarlyrepository.miami.edu/oa_dissertations/2330/
snm76S19.pdf => archive/scholarlyrepository.miami.edu/oa_dissertations/2330.pdf

