From 1c51a6fa9fef24e14d5e24b66aeb88bd10ce2baf Mon Sep 17 00:00:00 2001
From: John Kranz
Date: Fri, 7 Jan 2022 08:48:50 -0700
Subject: [PATCH] RMS-6798: Add MD5 example to python SDK

---
 samples/puttingDataWithMD5.py | 105 ++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 samples/puttingDataWithMD5.py

diff --git a/samples/puttingDataWithMD5.py b/samples/puttingDataWithMD5.py
new file mode 100644
index 0000000..6751b25
--- /dev/null
+++ b/samples/puttingDataWithMD5.py
@@ -0,0 +1,105 @@
+# Copyright 2014-2022 Spectra Logic Corporation. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is located at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+from ds3 import ds3
+import os
+import time
+import base64
+import hashlib
+
+client = ds3.createClientFromEnv()
+
+bucketName = "books"
+
+# make sure the bucket that we will be sending objects to exists
+client.put_bucket(ds3.PutBucketRequest(bucketName))
+
+# create your list of objects that will be sent to DS3
+# this example assumes that these files exist on the file system
+
+fileList = ["beowulf.txt", "sherlock_holmes.txt", "tale_of_two_cities.txt", "ulysses.txt"]
+
+# this method maps a file path to a Ds3PutObject
+def fileNameToDs3PutObject(filePath, prefix=""):
+    size = os.stat(pathForResource(filePath)).st_size
+    return ds3.Ds3PutObject(prefix + filePath, size)
+
+# this method gets the OS-specific path for an object located in the resources folder
+def pathForResource(resourceName):
+    currentPath = os.path.dirname(str(__file__))
+    return os.path.join(currentPath, "resources", resourceName)
+
+# get the sizes for each file
+fileList = list(map(fileNameToDs3PutObject, fileList))
+
+# submit the put bulk request to DS3
+bulkResult = client.put_bulk_job_spectra_s3(ds3.PutBulkJobSpectraS3Request(bucketName, fileList))
+
+# the bulk request will split the files over several chunks if it needs to.
+# we then need to ask which chunks we can send, and send them while making
+# sure we don't resend the same chunks
+
+# create a set of the chunk ids which will be used to track
+# what chunks have not been sent
+chunkIds = set([x['ChunkId'] for x in bulkResult.result['ObjectsList']])
+
+# while we still have chunks to send
+while len(chunkIds) > 0:
+    # get a list of the available chunks that we can send
+    availableChunks = client.get_job_chunks_ready_for_client_processing_spectra_s3(
+        ds3.GetJobChunksReadyForClientProcessingSpectraS3Request(bulkResult.result['JobId']))
+
+    chunks = availableChunks.result['ObjectsList']
+
+    # check to make sure we got some chunks; if we did not,
+    # sleep and retry. An empty list could mean that the cache is full
+    if len(chunks) == 0:
+        time.sleep(60)
+        continue
+
+    # for each chunk that is available, check to make sure
+    # we have not sent it, and if not, send that object
+    for chunk in chunks:
+        if not chunk['ChunkId'] in chunkIds:
+            continue
+
+        chunkIds.remove(chunk['ChunkId'])
+        for obj in chunk['ObjectList']:
+            # if the program crashed and we are resending a chunk, some objects
+            # may already be in cache. Check that an object is not in cache
+            # before sending it to Spectra S3
+            if obj['InCache'] == 'false':
+                localFileName = "resources/" + obj['Name']
+                objectDataStream = open(localFileName, "rb")
+                objectDataStream.seek(int(obj['Offset']), 0)
+                objectChunk = objectDataStream.read(int(obj['Length']))
+                checksum = hashlib.md5(objectChunk)
+                encodedChecksum = base64.b64encode(checksum.digest()).decode('utf-8')
+                objectDataStream.seek(int(obj['Offset']), 0)
+                client.put_object(ds3.PutObjectRequest(bucketName,
+                                                       obj['Name'],
+                                                       obj['Length'],
+                                                       objectDataStream,
+                                                       offset=int(obj['Offset']),
+                                                       job=bulkResult.result['JobId'],
+                                                       headers={"Content-MD5": encodedChecksum}))
+
+# we now verify that all our objects have been sent to DS3
+bucketResponse = client.get_bucket(ds3.GetBucketRequest(bucketName))
+
+for obj in bucketResponse.result['ContentsList']:
+    print(obj['Key'])
+
+# delete the bucket by first deleting all the objects, and then deleting the bucket
+for obj in bucketResponse.result['ContentsList']:
+    client.delete_object(ds3.DeleteObjectRequest(bucketName, obj['Key']))
+
+client.delete_bucket(ds3.DeleteBucketRequest(bucketName))
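A note on the Content-MD5 header used above: its value is the base64 encoding
of the raw 16-byte MD5 digest of exactly the bytes being sent (not the hex
digest, and not the whole file when a chunk is sent at an offset). Below is a
minimal standalone sketch of the same computation; the file name, offset, and
length are placeholder values, not part of the patch:

    import base64
    import hashlib

    def content_md5_for_range(path, offset, length):
        # read only the byte range that will be sent, mirroring the
        # chunked put in the sample above
        with open(path, "rb") as stream:
            stream.seek(offset, 0)
            data = stream.read(length)
        # base64-encode the raw digest bytes, not hexdigest()
        return base64.b64encode(hashlib.md5(data).digest()).decode('utf-8')

    print(content_md5_for_range("resources/beowulf.txt", 0, 1024))

This also shows why the sample seeks back to obj['Offset'] after hashing:
reading the chunk to compute the digest advances the stream, so it must be
repositioned before put_object reads the same bytes again.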