In [3]:
import boto3
import requests
import json
import os
import argparse
import pandas as pd
import time

import numpy as np
from datetime import datetime 
#from brtdevkit.ml import Workflow
import pytz

#from brtdevkit.ml import Workflow
from brtdevkit.ml.core import KubeflowPipeline, KubeflowPipelineRun

# Placeholder for a pipeline handle; it is only ever set to None in the cells shown here.
image_ingestion_pipeline = None
# NOTE(review): presumably a ROM/firmware version identifier; it is not referenced
# anywhere in the cells shown here -- confirm whether it is still needed.
isp_target_rom = '07090103'

# Shorthand for the pytz UTC timezone object.
utc = pytz.UTC

# Suppress every Python warning for the rest of the session.
from warnings import filterwarnings
filterwarnings("ignore")

In [4]:
# Check whether all files from an SSD have been uploaded to S3.

# `time` is already imported in the first cell; this repeat import is harmless.
import time
# Module-level S3 client. The helper functions in this cell create their own
# client, so this one is not used by them.
s3 = boto3.client('s3')
# Names of the image-ingestion experiment and pipeline; not referenced by the
# upload-check functions defined in this cell.
image_ingestion_experiment_name = 'shasta_image_ingest'
image_ingestion_pipeline_name = 'Shasta Image Data Processor'
# Reset of the pipeline placeholder (it is also set to None in the first cell).
image_ingestion_pipeline = None

def get_logfile_directories(bucket, prefix, just_fsize =False):
    """
    Locate the S3 "directories" that hold flatbuffer logfiles (keys ending in '.bfbs').

    :param bucket: S3 bucket to search in
    :param prefix: the prefix (pathname) underneath the bucket to be searched
    :param just_fsize: when True, return only the summed size of the matching objects
    :return: ``fsize`` alone if just_fsize is True, otherwise ``(dir_list, fsize)``.
             ``dir_list`` holds the parent path of every matching key (one entry per
             matching object, so a directory can appear more than once) and ``fsize``
             is the sum of the 'Size' fields of those objects.
    """
    client = boto3.client('s3')

    bfbs_dirs = []
    total_size = 0
    request = {'Bucket': bucket, 'Prefix': prefix}

    while True:
        page = client.list_objects_v2(MaxKeys =10000, **request)

        # A page without 'Contents' means there is nothing (more) to read.
        if 'Contents' not in page:
            break

        matches = [obj for obj in page['Contents'] if obj['Key'].endswith('.bfbs')]
        bfbs_dirs.extend(os.path.dirname(obj['Key']) for obj in matches)
        total_size += sum(obj['Size'] for obj in matches)

        # Pagination: keep requesting pages for as long as S3 hands back a token.
        if 'NextContinuationToken' not in page:
            break
        request['ContinuationToken'] = page['NextContinuationToken']

    return total_size if just_fsize == True else (bfbs_dirs, total_size)
    
def get_SSD_file_size(start_path):
    """
    Walk the tree rooted at start_path and add up the sizes of the files in it.

    Symbolic links are left out of the total.

    :param start_path: directory to walk (for example the mount point of an SSD)
    :return: combined size of all non-symlink files, as reported by os.path.getsize
    """
    return sum(
        os.path.getsize(path)
        for folder, _subdirs, names in os.walk(start_path)
        for path in (os.path.join(folder, name) for name in names)
        if not os.path.islink(path)
    )

def check_upload_complete(bucket, prefix, SSD_path, vpu):
    """
    Verify that everything on an SSD was in fact uploaded to S3.

    Every directory directly under SSD_path is treated as one logfile and is expected
    to exist in S3 as ``prefix + vpu + '/full/' + <directory name>``. Two checks are made:

    * every expected directory appears in the S3 listing returned by
      get_logfile_directories, and
    * the total size of the files on the SSD and the size reported for the matching
      S3 directories differ by less than 1000 MB.

    A one-line summary of the outcome is printed.

    :param bucket: S3 bucket to search in
    :param prefix: the prefix (pathname) underneath the bucket to be searched
    :param SSD_path: local path of the folder that holds one sub-directory per logfile
    :param vpu: VPU name, used to build the expected S3 directory names
    :return: None when every logfile directory was found in S3; otherwise a tuple
             ``(missing_dirs, s3_size)`` with the set of expected S3 directories that
             were not found and the S3 size in MB of the ones that were.
    """

    # Expected S3 directory for every logfile directory on the SSD. Plain files lying
    # next to the logfile directories are skipped so they cannot abort the check.
    SSD_dirs = [
        prefix + vpu + '/full/' + name
        for name in os.listdir(SSD_path)
        if os.path.isdir(os.path.join(SSD_path, name))
    ]

    # Retrieve all logfile directories in S3 (the size of the whole prefix is not needed).
    s3_dirs, _ = get_logfile_directories(bucket = bucket, prefix = prefix)
    s3_dirs = set(s3_dirs)

    print(f'There are {len(SSD_dirs)} logfiles on the SSD and {len(s3_dirs)} in s3.')

    expected_dirs = set(SSD_dirs)
    # A real set difference: set.discard() always returns None, so it cannot be used
    # to detect missing directories.
    missing_dirs = expected_dirs - s3_dirs

    # Sum the S3 size of the directories that are present on both sides. The trailing
    # '/' keeps a directory such as '.../log1' from also matching '.../log10/...'.
    s3_fsize = 0
    for log_dir in sorted(expected_dirs & s3_dirs):
        s3_fsize += get_logfile_directories(bucket = bucket, prefix = log_dir + '/', just_fsize=True)

    # Compare filesizes (converted to MB; anything under 1000 MB apart counts as equal)
    s3_size = s3_fsize/1000000
    SSD_fsize = get_SSD_file_size(SSD_path)/1000000
    equal_fsize = abs(SSD_fsize - s3_size) < 1000

    # Compare logs
    all_logs_in_s3 = not missing_dirs

    if all_logs_in_s3 and equal_fsize:
        print(f'File size of {s3_size:.0f} MB confirmed')
        print('All files on drive accounted for in S3.')
    elif all_logs_in_s3:
        print(f'Could not verify filesize. There is a discepancy of {SSD_fsize - s3_size} out of {SSD_fsize}. Check paths')
    else:
        print('Upload looks strange. Verify path.')
        return missing_dirs, s3_size


In [8]:
# Verify the db1 upload from SSD298. Change `vpu` to check a different unit's upload.
vpu = 'vpu0-4a'
bucket = 'brt-dcm-data'
prefix = 'db1/'
SSD_path = f'/media/williamroberts/SSD298/db1-sysbox1/{vpu}/full/'

check_upload_complete(bucket, prefix, SSD_path, vpu=vpu)

There are 21 logfiles on the SSD and 18158 in s3.
File size of 197416 MB confirmed
All files on drive accounted for in S3.


In [10]:
# Verify the db4 upload from SSD273. Change `vpu` to check a different unit's upload.
vpu = 'vpu0-4a'
bucket = 'brt-dcm-data'
prefix = 'db4/'
SSD_path = f'/media/williamroberts/SSD273/db4-sysbox1/{vpu}/full/'

check_upload_complete(bucket, prefix, SSD_path, vpu=vpu)

There are 7 logfiles on the SSD and 4320 in s3.
File size of 11911 MB confirmed
All files on drive accounted for in S3.


In [9]:
# Verify the dcm12 upload from SSD209. Change `vpu` to check a different unit's upload.
vpu = 'vpu0-0a'
bucket = 'brt-dcm-data'
prefix = 'dcm12/'
SSD_path = f'/media/williamroberts/SSD209/dcm12-sysbox1/{vpu}/full/'

check_upload_complete(bucket, prefix, SSD_path, vpu=vpu)

There are 219 logfiles on the SSD and 1021 in s3.
File size of 1727225 MB confirmed
All files on drive accounted for in S3.
