## several scripts to verify and manipulate file names in the S3 bepress dump  

NOTE: the boto3 module must be installed in your Python environment. You must:

- modify the third cell to include your values for the S3 bucket
- run the first three cells for all other scripts to work

### The main scripts in the notebook

- [Recursive function to explore folders in the S3 bucket](#explore)
- [Get OAI and add S3 filenames as &lt;fulltext-s3-path&gt; in xml](#rename) 
- [Restructure OAI-PMH xml as json](#xmltojson)

In [None]:
import boto3

In [None]:
 ##
 # Get credentials for the S3 bucket and login. Credentials are stored in a plain text file as 
 # access key and secret key both on one line separated by the '|' character. The file must
 # have a second blank line (two lines total)
##
# Read the first line of credentials.txt ("ACCESS_KEY|SECRET_KEY"). The
# context manager closes the file even when reading or parsing fails.
with open("credentials.txt", "r") as authfile:
    credentials_line = authfile.readline().strip()

# Exactly one '|' is expected; anything else raises ValueError on unpacking.
ACCESS_KEY, SECRET_KEY = credentials_line.split('|')

# S3 client shared by every other cell in this notebook.
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY
)

In [None]:
 ##
 # Set some variables that get reused a lot (parameters)
##

# name of the S3 bucket (replace with your own)
bucket = 'bepressarchivemiami'

# key prefix used for the listings in this notebook (replace with your own)
prefix = 'archive/scholarlyrepository.miami.edu/'

# full folder prefixes that the folder explorer skips
to_ignore = list((
    'archive/scholarlyrepository.miami.edu/anthurium/',
    'archive/scholarlyrepository.miami.edu/collaborations/',
    'archive/scholarlyrepository.miami.edu/collaborations_slideshow/',
))

In [None]:
 ##
 # Read through the top level of the bucket
##
# The '/' delimiter limits the listing to a single level below the prefix.
top_level_folders = s3.list_objects(Delimiter='/', Prefix=prefix, Bucket=bucket)

# bare expression so the notebook displays the raw response
top_level_folders

<a name="explore"></a>

## Function to explore folders on S3

In [None]:
###
# Function to recurse over folders and list content as per rules we define
##

def explore_folder(bucket, folder_prefix, to_ignore):
    """Recursively print the folders and files found under an S3 prefix.

    Sub-folders (S3 common prefixes delimited by '/') are explored depth
    first. When a prefix has no sub-folders, every object under it is
    printed, and keys containing neither 'stamped' nor 'metadata' are
    flagged with ' *'.

    Args:
        bucket: name of the S3 bucket to list.
        folder_prefix: key prefix to explore.
        to_ignore: collection of prefixes that are skipped entirely.

    Returns:
        None. Output is printed; the module-level ``s3`` client is used.
    """
    if folder_prefix in to_ignore:
        return

    print(folder_prefix)

    # The paginator follows the continuation markers for us, so listings
    # longer than one page (max 1000 entries) are read completely.
    paginator = s3.get_paginator('list_objects')

    folders = []
    for listing in paginator.paginate(
        Bucket=bucket, Prefix=folder_prefix, Delimiter='/'
    ):
        # a page may contain files only, so the key can be absent
        folders.extend(listing.get('CommonPrefixes', []))

    if folders:  # there are more folders
        for folder in folders:
            # recursive call to step through the folder structure
            explore_folder(bucket, folder['Prefix'], to_ignore)

    else:  # we are at the content level, print the list of files
        for listing in paginator.paginate(Bucket=bucket, Prefix=folder_prefix):
            # 'Contents' is missing when nothing matches the prefix
            for file in listing.get('Contents', []):
                key = file['Key']
                parts = key.split('/')
                filename = parts[-1]
                # the immediate parent folder is left out of the printed path
                parent = "/".join(parts[:-2]) + "/"
                # this next line is for ETDs and Theses, it will be different
                # for other folders; it simply highlights the main download
                is_main_download = not ('stamped' in key or 'metadata' in key)
                star = ' *' if is_main_download else ''

                print(parent + filename + star)

    print()

In [None]:
# Example run on a single folder. Starting at the top level also works,
# but it will take a while ...
explore_folder(
    bucket=bucket,
    folder_prefix='archive/scholarlyrepository.miami.edu/socomm_cinema_scripts/211',
    to_ignore=to_ignore,
)

In [None]:
# Check to see if we can download a file.
# The bucket and prefix come from the parameters cell, so this check follows
# whatever values were configured there instead of a hard-coded copy.
download_key = prefix + 'socomm_cinema_scripts/199/wcehn5pxu7a9l6a1y81e49wynl5tn3mn.pdf'

# Save under the object's own file name so the local copy is identifiable.
with open(download_key.split('/')[-1], 'wb') as f:
    s3.download_fileobj(bucket, download_key, f)

<a name="rename"></a>

## Get OAI and add S3 filenames as &lt;fulltext-s3-path&gt; in xml

This is the main tool in this notebook. It steps through the OAI-PMH feed from bepress and then finds the corresponding filenames on S3 for all restricted materials and adds the element &lt;fulltext-s3-path&gt; to the xml.

The output is a set of xml files, one per OAI page, written to the 'pages' folder, plus one combined xml file that holds all pages.
    
Other output notes:

- this script prints the current page number for each step
- the script will print an error message if no files were found in the S3 bucket
- the script creates an 'errors' array that holds a list of errors

In [None]:
# titles of the bepress series that are known to be restricted
known_restricted = [
    "Master of Fine Arts Creative Writing Theses",
    "Internship Reports (Restricted)",
    "Scriptwriting Senior Projects (Restricted)",
    "Archived Data Sets (Restricted)",
]

# bepress OAI endpoint (ListRecords verb) ...
url = "https://scholarlyrepository.miami.edu/do/oai/?verb=ListRecords"

# ... and the qualifiers appended to it for the first request
qualifier = "&metadataPrefix=document-export&set=publication:um_research_publications"

In [None]:
import datetime
import os
import xml.etree.ElementTree as ET

import requests

# error messages collected while the xml pages are processed
errors = list()

# directory that receives every xml file written by this cell
output_directory = "pages/"

###
# step through xml page and create <fulltext-s3-path> elements with the S3 key
# of the download file for any restricted documents on bepress
##
def _report(message):
    """Print *message* and remember it in the module-level ``errors`` list."""
    print(message)
    errors.append(message)


def _is_restricted(document):
    """Return True when the <document> element counts as restricted.

    A document is restricted when its document-type is 'restricted', when it
    is 'withheld' with an embargo_date later than today, or (for any other
    document-type) when its publication-title is in ``known_restricted``.
    """
    # NOTE(review): a 'withheld' document whose embargo has passed is not
    # checked against known_restricted - same precedence as before; confirm.
    doc_type = document.findtext('document-type')
    if doc_type == 'restricted':
        return True
    if doc_type == 'withheld':
        today = datetime.datetime.today().strftime('%Y-%m-%d')
        for field in document.findall('fields/field'):
            if field.get('name') != 'embargo_date':
                continue
            embargo_date = field.findtext('value')
            # plain string comparison against today's 'YYYY-MM-DD'
            if embargo_date and embargo_date > today:
                return True
        return False
    return document.findtext('publication-title') in known_restricted


def updateXML(doclist):
    """Add <fulltext-s3-path> elements to the restricted documents of a page.

    For every OAI record in *doclist* the S3 folder belonging to its
    <submission-path> is listed. Problems (no document metadata, no files,
    no download file) are printed and appended to ``errors``. For restricted
    documents one <fulltext-s3-path> element is added per download file.

    Args:
        doclist: the ListRecords element of one OAI page; modified in place.

    Returns:
        None.
    """
    oai = '{http://www.openarchives.org/OAI/2.0/}'
    for record in doclist.findall(oai + 'record'):

        # get document and some of the meta
        document = record.find(
            oai + 'metadata/document-export/documents/document'
        )
        if document is None:
            # records can come without metadata (no <document> to update)
            identifier = record.findtext(
                oai + 'header/' + oai + 'identifier', default='(unknown)'
            )
            _report('ERROR: no document metadata for record: ' + identifier)
            continue

        submission_path = document.findtext('submission-path')
        if not submission_path:
            _report('ERROR: no submission-path in document')
            continue

        path = prefix + submission_path + '/'
        internal_files = s3.list_objects(Bucket=bucket, Prefix=path)

        # print error if no file was found
        if 'Contents' not in internal_files:
            _report('ERROR: no files at all for: ' + path)
            continue

        # this next check is for ETDs and Theses, it may be different for
        # other folders. Keys ending in '/' are skipped because they carry
        # no file name.
        download_keys = [
            file['Key']
            for file in internal_files['Contents']
            if not ('stamped' in file['Key'] or 'metadata' in file['Key'])
            and file['Key'].split('/')[-1] != ''
        ]
        if not download_keys:
            _report('ERROR: no download file for: ' + path)
            continue

        # as requested, only provide the new element for restricted items
        if _is_restricted(document):
            for key in download_keys:
                s3_path = ET.SubElement(document, 'fulltext-s3-path')
                # the full key stays correct for files in sub-folders too
                s3_path.text = key

            
###
# Start of main script
##

OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'


def _fetch_oai_page(request_url, params=None):
    """Request one OAI page and split off its resumption token.

    Returns a tuple (page root element, ListRecords element, token). The
    token is None when the page has no <resumptionToken> element or the
    element is empty, which ends the harvest. The <resumptionToken> element
    is removed from ListRecords so it does not end up in the output files.
    """
    response = requests.get(request_url, params=params)
    response.raise_for_status()  # stop on HTTP errors instead of parsing them

    # parse the raw bytes so the encoding declared in the xml is honoured
    page_xml = ET.fromstring(response.content)
    records = page_xml.find(OAI_NS + 'ListRecords')
    if records is None:
        raise RuntimeError('no ListRecords element in response from ' + response.url)

    token = None
    token_element = records.find(OAI_NS + 'resumptionToken')
    if token_element is not None:
        records.remove(token_element)
        token = (token_element.text or '').strip() or None
    return page_xml, records, token


def _write_xml(xml_root, filename):
    """Write *xml_root* as a utf-8 xml file into ``output_directory``."""
    ET.ElementTree(xml_root).write(
        os.path.join(output_directory, filename),
        encoding="utf-8",
        xml_declaration=True
    )


ET.register_namespace('oai', "http://www.openarchives.org/OAI/2.0/")

# make sure the output folder exists before the first write
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

today = datetime.datetime.now().strftime('%Y%m%d')

page = 1
print('page:', page)

# make the first request; this page also becomes the combined document
allXML, documentlist, resume = _fetch_oai_page(url + qualifier)

# add <fulltext-s3-path> elements as needed
updateXML(documentlist)

# write the first page to a file
_write_xml(allXML, today + '_OAI_page' + str(page) + '.xml')

# loop through the rest of the pages
while resume is not None:
    page += 1
    print('page:', page)

    # passing the token via params lets requests URL-encode it
    pageXML, pagelist, resume = _fetch_oai_page(
        url, params={'resumptionToken': resume}
    )

    # add <fulltext-s3-path> elements as needed
    updateXML(pagelist)

    # write page to file
    _write_xml(pageXML, today + '_OAI_page' + str(page) + '.xml')

    # add page to entire xml document (no pages)
    documentlist.extend(pagelist)

# write entire xml document to file (no pages)
_write_xml(allXML, today + '_OAI_bepress.xml')

In [None]:
# bare expression so the notebook displays the collected error messages
errors

<a name="xmltojson"></a>

## step through OAI data from bepress and restructure

Just for exploration

In [None]:
import xml.etree.ElementTree as ET
# paste the path of your xml file here
# The path is handed straight to ElementTree, which reads the raw bytes and
# honours the encoding declared in the file; opening it in text mode would
# decode it with the platform's default encoding instead.
rootDocs = ET.parse('pages/20190925_OAI_bepress.xml')

In [None]:
# Group the OAI records by the first segment of their <submission-path>.
# Each entry holds a record count and, for paths of the form "folder/number",
# a list of items with title, year and author.
OAI = '{http://www.openarchives.org/OAI/2.0/}'

directories = {}
documents = rootDocs.find(OAI + 'ListRecords').findall(OAI + 'record')
for document in documents:
    container = document.find(OAI + 'metadata/document-export/documents/document')
    if container is None:
        # record without document metadata, nothing to restructure
        continue

    path = container.findtext('submission-path') or ''
    folders = path.split('/')
    series = folders[0]

    # to_ignore holds full S3 prefixes, so build the same form before
    # comparing; ignored folders are left out of the result entirely
    if prefix + series + '/' in to_ignore:
        continue

    entry = directories.setdefault(series, {'count': 0, 'items': []})
    entry['count'] += 1

    if len(folders) != 2:  # not the simple numeric file structure
        print(series)
        continue

    title = container.findtext('title') or ''
    item = {
        # NOTE(review): keeps only the words that are fully alphanumeric and
        # joins them without spaces - unchanged; confirm this is intended
        'title': ''.join(word for word in title.split() if word.isalnum()),
        'year': (container.findtext('publication-date') or '')[0:4],
        'author': ''
    }

    # use the first of lname / organization / institution that the first
    # author has
    author = container.find('authors/author')
    name_element = None
    if author is not None:
        for tag in ('lname', 'organization', 'institution'):
            name_element = author.find(tag)
            if name_element is not None:
                break

    if name_element is None:
        print ('woof')
    else:
        item['author'] = name_element.text

    entry['items'].append(item)

In [None]:
# bare expression so the notebook displays the restructured data
directories