## Objective
- To provide a file sync experience (and efficiency) similar to `repo sync`, or `awscli s3 sync`, etc.
- Allow files to be uploaded or downloaded **ONLY** if there is a difference between the file's local and remote copies
- Provide a pip package that wraps this sync access to s3 remote assets
- Provide this as a convenience package for using remote assets in notebooks

## Procedure
- allow downloading or uploading files to an s3 bucket.
- when asked to download a file from s3:
  - check if the file is already present in the local directory
  - if the file is present in the local directory:
    - check if its hash matches the hash of the s3 version of this file
    - if there is a mismatch in the hash of the file between the local and the s3 copy, it means that the contents of the file have changed
    - if hashes mismatch: proceed with download
- when asked to upload a file to s3:
  - repeat the same process as involved in downloading a file

## Available approaches
- Option 1: wrap the command line call to `awscli s3 sync` using python `subprocess` library
- Option 2: write the sync functionality from scratch ✅

## Dependencies
- boto3

In [33]:
import os
from botocore.exceptions import NoCredentialsError
import shutil
import boto3 as boto
import multiprocessing
import copy
import hashlib
import logging
from pathlib import Path
log = logging.getLogger(__name__)

class LocalObjectCache:
    """Local on-disk cache of an S3 bucket (or a prefix within it).

    Files are only transferred when the locally computed S3-style ETag differs
    from the ETag supplied for the remote object. The instance can be used as a
    context manager, in which case the local cache directory is deleted on exit.
    """
    _DEFAULT_PATH = '/tmp/local_object_store/'

    def __init__(self, bucket_name, prefix='', path=None):
        """Create the local cache directory and bind to the S3 bucket.

        :param bucket_name: str, the name of the S3 bucket
        :param prefix: str, the prefix up to which you want to sync
        :param path: (optional, str) a path to store the local files; defaults
            to ``_DEFAULT_PATH + bucket_name + '/'``
        """
        self.bucket_name = bucket_name
        self.prefix = prefix

        if not path:
            path = self._DEFAULT_PATH + self.bucket_name + '/'

        self.path = path
        os.makedirs(path, exist_ok=True)

        s3 = boto.resource('s3')
        self.bucket = s3.Bucket(self.bucket_name)

    def __enter__(self):
        """Enter the context manager; returns the cache itself without syncing."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context manager, deleting the local cache via :meth:`close`."""
        self.close()

    def __getstate__(self):
        """Return picklable state with the bucket handle dropped.

        Required so the instance can be sent to ``multiprocessing`` workers;
        the handle is rebuilt in :meth:`__setstate__`.
        """
        out = copy.copy(self.__dict__)
        out['bucket'] = None
        return out

    def __setstate__(self, d):
        """Restore pickled state, recreating the bucket handle from its name."""
        s3 = boto.resource('s3')
        d['bucket'] = s3.Bucket(d['bucket_name'])
        self.__dict__ = d

    def get_path(self, key):
        """Returns the local file storage path for a given file key.

        The result is ``path / prefix / key``, so ``key`` is joined after the
        configured prefix.
        """
        return os.path.join(self.path, self.prefix, key)

    @staticmethod
    def calculate_s3_etag(file, chunk_size=8 * 1024 * 1024):
        """Calculates the S3 custom e-tag (a specially formatted MD5 hash).

        :param file: binary file-like object, read from its current position
        :param chunk_size: int, number of bytes hashed per part
        :return: str, the quoted ETag: ``"<md5>"`` when the content fits in a
            single chunk (including empty content), otherwise
            ``"<md5 of part digests>-<part count>"``
        """
        md5s = []

        while True:
            data = file.read(chunk_size)
            if not data:
                break
            md5s.append(hashlib.md5(data))

        if not md5s:
            # Empty content: the ETag is the plain MD5 of zero bytes, not a
            # "-0" multipart tag, so that empty objects can match.
            return '"{}"'.format(hashlib.md5(b'').hexdigest())

        if len(md5s) == 1:
            return '"{}"'.format(md5s[0].hexdigest())

        digests = b''.join(m.digest() for m in md5s)
        digests_md5 = hashlib.md5(digests)
        return '"{}-{}"'.format(digests_md5.hexdigest(), len(md5s))

    def _matches_etag(self, path, tag):
        """Return True when a file exists at ``path`` and its computed ETag equals ``tag``."""
        try:
            # ``with`` guarantees the handle is closed even if hashing raises.
            with open(path, 'rb') as f:
                return tag == self.calculate_s3_etag(f)
        except FileNotFoundError:
            # No local copy yet: treated as a mismatch.
            return False

    def _get_obj(self, key, tag=None):
        """Downloads the object at ``key`` unless the local copy already matches ``tag``.

        :param key: str, the S3 object key; also the path relative to ``self.path``
        :param tag: (optional, str) the remote ETag; ``None`` never matches, so
            the object is always downloaded
        """
        path = os.path.join(self.path, key)
        os.makedirs(os.path.dirname(path), exist_ok=True)

        if self._matches_etag(path, tag):
            log.info('Cache Hit')
            return

        log.info('Cache Miss')
        self.bucket.download_file(key, path)

    def _set_obj(self, key, tag, path=None):
        """Uploads a file to S3 unless the cached copy already matches ``tag``.

        :param key: str, passed to ``upload_file`` as the local filename; the
            remote key is ``self.prefix`` + the base name of ``key``
        :param tag: str, the ETag compared against the file at
            ``os.path.join(self.path, key)``
        :param path: unused; kept so existing callers keep working
        """
        path_check = os.path.join(self.path, key)
        os.makedirs(os.path.dirname(path_check), exist_ok=True)

        if self._matches_etag(path_check, tag):
            log.info('cache Hit')
            return

        log.info('Cache Miss')
        self.bucket.upload_file(key, self.prefix + Path(key).name)

    def sync(self):
        """Syncs the local copy with the remote S3 objects under ``self.prefix``.

        Each listed object is checked/downloaded by :meth:`_get_obj` in a
        ``multiprocessing`` pool.
        """
        keys = [
            (obj.key, obj.e_tag)
            for obj in self.bucket.objects.filter(Prefix=self.prefix)
            # Keys ending in '/' are folder placeholders; they map to a local
            # directory, which cannot be opened or downloaded as a file.
            if not obj.key.endswith('/')
        ]
        if not keys:
            return

        # The context manager tears the worker processes down afterwards;
        # starmap blocks until every task has finished, so nothing is cut short.
        with multiprocessing.Pool() as pool:
            pool.starmap(self._get_obj, keys)

    def close(self):
        """Deletes all local files; does nothing if the cache directory is already gone."""
        try:
            shutil.rmtree(self.path)
        except FileNotFoundError:
            # Already removed (e.g. close() called twice): nothing left to do.
            pass
