In [1]:
import boto3
import tempfile

import pandas as pd

# Check if versioning is enabled

In [2]:
def bucket_versioning_status(bucket_name):
    """Return the boto3 ``BucketVersioning`` resource for a bucket.

    Args:
        bucket_name: Name of the S3 bucket to inspect.

    Returns:
        The object produced by
        ``boto3.resource('s3').BucketVersioning(bucket_name)``. Callers in
        this notebook read its ``status`` attribute.
    """
    s3_resource = boto3.resource('s3')
    return s3_resource.BucketVersioning(bucket_name)

In [3]:
def is_bucket_versioning_enabled(bucket_name):
    """Tell whether versioning is currently enabled on a bucket.

    Args:
        bucket_name: Name of the S3 bucket to inspect.

    Returns:
        The string ``'Enabled'`` when the bucket's versioning status is
        ``'Enabled'``; ``False`` for every other status (including ``None``,
        i.e. no status reported, and ``'Suspended'``).
    """
    status = bucket_versioning_status(bucket_name).status
    # Previously any non-None status was returned as-is, so a bucket whose
    # versioning was 'Suspended' produced a truthy answer. Only 'Enabled'
    # counts; the string is still returned for backward compatibility.
    return status if status == 'Enabled' else False

In [4]:
# Query the versioning state of 'verta-condacon'; the recorded output below is False.
is_bucket_versioning_enabled('verta-condacon')

False

In [5]:
# Same check on 'verta-versioned-bucket'; the recorded output below is 'Enabled'.
is_bucket_versioning_enabled('verta-versioned-bucket')

'Enabled'

# list versions of a key

In [6]:
def list_versions_of_keys(bucket_name, key):
    """List object versions and delete markers for keys starting with ``key``.

    ``key`` is passed to S3 as ``Prefix``, so every object key that begins
    with it is included; results are therefore grouped per object key.

    Args:
        bucket_name: Name of the S3 bucket to query.
        key: Object key (used as a prefix filter).

    Returns:
        A dict with a ``'bucket_name'`` entry plus one entry per object key
        found. Each per-key entry is a dict with two lists:
        ``'active_versions'`` (dicts with ``ETag``, ``Size``, ``Key``,
        ``VersionId``, ``IsLatest``, ``LastModified``) and
        ``'deleted_versions'`` (delete markers, dicts with ``Key``,
        ``VersionId``, ``IsLatest``, ``LastModified``). Both lists are always
        present, even when one of them is empty.

    Note:
        An object whose key is literally ``'bucket_name'`` would collide with
        the metadata entry of the returned dict.
    """
    # Use the paginator so listings longer than a single response page are
    # not silently truncated.
    paginator = boto3.client('s3').get_paginator('list_object_versions')
    ret = {'bucket_name': bucket_name}

    for page in paginator.paginate(Bucket=bucket_name, Prefix=key):
        # 'Versions' / 'DeleteMarkers' are absent from a page when there is
        # nothing to report, so fall back to an empty list.
        for version in page.get('Versions') or []:
            entry = ret.setdefault(
                version['Key'],
                {'active_versions': [], 'deleted_versions': []},
            )
            entry['active_versions'].append({
                'ETag': version['ETag'],
                'Size': version['Size'],
                'Key': version['Key'],
                'VersionId': version['VersionId'],
                'IsLatest': version['IsLatest'],
                'LastModified': version['LastModified'],
            })

        for dmarker in page.get('DeleteMarkers') or []:
            # Keys that only have delete markers still get both lists so the
            # per-key schema is uniform.
            entry = ret.setdefault(
                dmarker['Key'],
                {'active_versions': [], 'deleted_versions': []},
            )
            entry['deleted_versions'].append({
                'Key': dmarker['Key'],
                'VersionId': dmarker['VersionId'],
                'IsLatest': dmarker['IsLatest'],
                'LastModified': dmarker['LastModified'],
            })

    return ret

In [17]:
# Show every version and delete marker recorded under this key prefix.
list_versions_of_keys('verta-versioned-bucket','data/census-train.csv')

{'bucket_name': 'verta-versioned-bucket',
 'data/census-train.csv': {'active_versions': [{'ETag': '"0711e7f8423cefe8e23c9df261c6d40d"',
    'Size': 1639,
    'Key': 'data/census-train.csv',
    'VersionId': 'hv89wUgi0C2LFlI1amva_M4hmBj6jj9k',
    'IsLatest': False,
    'LastModified': datetime.datetime(2020, 4, 9, 6, 27, 44, tzinfo=tzutc())},
   {'ETag': '"ce6561e4349f9bebe665d5f3e08ac00b"',
    'Size': 1728,
    'Key': 'data/census-train.csv',
    'VersionId': 'wWJisVfhpLxcy7xQyGDTDfGaKqWZ2LTE',
    'IsLatest': False,
    'LastModified': datetime.datetime(2020, 4, 9, 6, 24, 25, tzinfo=tzutc())},
   {'ETag': '"64af2ff44dd04acceb277d024939b619"',
    'Size': 3271573,
    'Key': 'data/census-train.csv',
    'VersionId': 'mXtX.wsT.oOXRn7J9fDIMCllJ74azd7K',
    'IsLatest': False,
    'LastModified': datetime.datetime(2020, 4, 9, 5, 47, 8, tzinfo=tzutc())},
   {'ETag': '"64af2ff44dd04acceb277d024939b619"',
    'Size': 3271573,
    'Key': 'data/census-train.csv',
    'VersionId': '.fJavu3sHe

# Getting the latest version
## Imagine: a user calls this and stores the returned version in Verta

In [8]:
def get_details_latest_version_key(bucket_name, key):
    """Return details of the latest object version for keys starting with ``key``.

    ``key`` is passed to S3 as ``Prefix``, so every object key that begins
    with it is considered.

    Args:
        bucket_name: Name of the S3 bucket to query.
        key: Object key (used as a prefix filter).

    Returns:
        A dict with a ``'bucket_name'`` entry plus, for each object key that
        has a version flagged ``IsLatest``, a dict with ``ETag``, ``Size``,
        ``Key``, ``VersionId``, ``IsLatest`` and ``LastModified``. Keys with
        no such version (or no versions at all) are simply absent.
    """
    # Use the paginator so a latest version that falls beyond the first
    # response page is still found.
    paginator = boto3.client('s3').get_paginator('list_object_versions')
    ret = {'bucket_name': bucket_name}

    for page in paginator.paginate(Bucket=bucket_name, Prefix=key):
        # 'Versions' is absent when nothing matches; iterating None used to
        # raise TypeError here.
        for version in page.get('Versions') or []:
            if version['IsLatest']:
                ret[version['Key']] = {
                    'ETag': version['ETag'],
                    'Size': version['Size'],
                    'Key': version['Key'],
                    'VersionId': version['VersionId'],
                    'IsLatest': version['IsLatest'],
                    'LastModified': version['LastModified'],
                }
    return ret

In [9]:
# Look up the version currently flagged IsLatest for this key.
get_details_latest_version_key('verta-versioned-bucket','data/census-train.csv')

{'bucket_name': 'verta-versioned-bucket',
 'data/census-train.csv': {'ETag': '"0711e7f8423cefe8e23c9df261c6d40d"',
  'Size': 1639,
  'Key': 'data/census-train.csv',
  'VersionId': 'hv89wUgi0C2LFlI1amva_M4hmBj6jj9k',
  'IsLatest': True,
  'LastModified': datetime.datetime(2020, 4, 9, 6, 27, 44, tzinfo=tzutc())}}

# Using a previously logged version
## Imagine: a user reads the dataset information from Verta and uses it to download the file and train a model

In [11]:
def download_versioned_file(bucket_name, key, version_id):
    """Download one specific version of an S3 object into a temporary file.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        key: Object key to download.
        version_id: Version id of the object to fetch.

    Returns:
        The open ``tempfile.NamedTemporaryFile`` handle whose ``name`` is the
        download target path. It is created with the default settings, so
        keep a reference to it for as long as the file is needed.
    """
    tmp_file = tempfile.NamedTemporaryFile()
    extra_args = {'VersionId': version_id}
    boto3.client('s3').download_file(bucket_name, key, tmp_file.name, ExtraArgs=extra_args)
    return tmp_file

In [12]:
# Fetch the object by an explicit version id; `file` is the open temporary-file handle returned by the helper.
file = download_versioned_file('verta-versioned-bucket','data/census-train.csv','01ei62mnhNekpcK8PTKsRBxD.6ZbucWZ')

In [13]:
# Load the downloaded temporary file by path and preview the first rows.
df_train = pd.read_csv(file.name)
df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


# ... And you know the rest

# .
# .
# .
# .
# .
# .
# .
# .
# .
# .

# ...but what if the file was deleted?

In [14]:
# Same bucket, key, and version id as the earlier download cell, re-run to
# show the explicit version can still be fetched and read.
file = download_versioned_file('verta-versioned-bucket','data/census-train.csv','01ei62mnhNekpcK8PTKsRBxD.6ZbucWZ')
df_train = pd.read_csv(file.name)
df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


# actual restore

In [21]:
def restore_deleted_s3_version(bucket_name, key, version):
    """Delete one specific version of an object via the S3 resource API.

    Intended to be called with the version id of a delete marker (see the
    ``deleted_versions`` entries produced when listing versions).

    NOTE: nothing here checks that ``version`` refers to a delete marker;
    ``delete()`` is invoked on whatever version id is supplied.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        key: Object key.
        version: Version id to delete.

    Returns:
        None.
    """
    target_version = boto3.resource('s3').ObjectVersion(bucket_name, key, version)
    target_version.delete()

In [22]:
# Deletes the version with this hard-coded id (presumably the key's delete
# marker — TODO confirm the id before re-running this cell).
restore_deleted_s3_version('verta-versioned-bucket','data/census-train.csv','qB2ZN.4cvpBXj4w_1H2ZpDjz6g_K1yZX')