In [14]:
#in the project-dev-wm bucket, delete all files in optimised/datedim and optmised/locationdim

import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('my-config-bucket-dev')
bucket.objects.filter(Prefix='').delete()

[]

In [10]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("s3://project-dev-wm/optimised/date_dim/full/202305231149/")

df.printSchema()


                                                                                

root
 |-- date_key: long (nullable = true)
 |-- date: string (nullable = true)
 |-- day_of_week: long (nullable = true)
 |-- day_name: string (nullable = true)
 |-- day_of_month: long (nullable = true)
 |-- day_of_year: long (nullable = true)
 |-- week_of_year: long (nullable = true)
 |-- month: long (nullable = true)
 |-- month_name: string (nullable = true)
 |-- quarter: long (nullable = true)
 |-- year: long (nullable = true)
 |-- is_weekend: boolean (nullable = true)
 |-- is_weekday: boolean (nullable = true)
 |-- is_holiday: boolean (nullable = true)



In [2]:
from typing import Union, Iterable
from etl.addresses import Bucket

def get_timestamp_of_most_recently_created_file(*, bucket_name:Union[Bucket,str], stem:str) -> str:
    """Return the most recent date in a subset of a bucket's file paths.

    Args:
        bucket_name (Bucket): The name of the bucket to search.
        stem (str): The key of the file objects to search for.

    Returns:
        str: The most recent date in the bucket.

    Raises:
        ValueError: If no files are found in the bucket.
        ValueError: If no dates are found in the bucket.
    """

    bucket = boto3.resource('s3').Bucket(str(bucket_name))
    bucket_objects:Iterable = bucket.objects

    file_paths = [
        obj
        for obj
        in bucket.objects.filter(Prefix=stem)
    ]
    
    if not file_paths:
        raise ValueError("No files found in bucket.")

    dates = []
    for file in file_paths:
        date_string = file.key.split('/')[-1].split('.')[0]
        if date_string:
            dates.append(date_string)

    if not dates:
        raise ValueError("No recent dates found in bucket.")

    return dates[-1]

In [8]:
import boto3 
get_timestamp_of_most_recently_created_file(bucket_name=Bucket.RAW, stem='raw/claim_db/policyholder/full/')

'part-00000-05a2301a-b24c-4a16-b10b-192246a7c843-c000'

In [9]:
import boto3

def get_lexicographically_highest_subdirectory(bucket_name, prefix):
    """
    List all subdirectories under a prefix in an S3 bucket and return the 
    lexicographically highest.

    Parameters:
        bucket_name (str): The name of the S3 bucket.
        prefix (str): The prefix (i.e., "directory path") to search under.

    Returns:
        str: The lexicographically highest subdirectory under the prefix.        
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(str(bucket_name))
    
    subdirs = set()
    for obj in bucket.objects.filter(Prefix=prefix):
        subdir = '/'.join(obj.key.split('/')[-2:])
        subdirs.add(subdir)
    
    return max(subdirs) if subdirs else None

# Usage
bucket_name = "test-dev-wm"
prefix = "raw/claim_db/claim/full/"
latest_subdir = get_latest_subdirectory(bucket_name, prefix)
print(f"The latest subdirectory under {prefix} is {latest_subdir}")


NameError: name 'get_latest_subdirectory' is not defined

In [13]:
# create an s3 folder in target bucket with a given prefix.`
import boto3
from datetime import datetime

def create_s3_folder(bucket_name, prefix):
    """
    Create an S3 folder with the given prefix in the given bucket.

    Parameters:
        bucket_name (str): The name of the S3 bucket.
        prefix (str): The prefix (i.e., "directory path") to create.

    Returns:
        None
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(str(bucket_name))
    bucket.put_object(Key=(prefix + f"/{datetime.now().strftime('%Y%m%d%H%M')}"))
    
bucket_name = 'demo-bucket-wmaz'
prefix = '/scripts'

create_s3_folder(bucket_name, prefix)


In [1]:
3

3

In [14]:
import boto3

def create_folder(bucket_name: str, folder_name: str):
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, folder_name + '/').put()

create_folder('demo-bucket-wmaz', 'my_folder')


In [16]:
s3 = boto3.resource('s3')
s3.Object(
    'raw-dev-wm', 
    f"raw/claim_db/claim/full/{datetime.now().strftime('%Y%m%d%H%M')}"
).put(Body='')

# delete the object
s3.Object('raw-dev-wm', 'raw/claim_db/claim/full/202305211851').delete()

{'ResponseMetadata': {'RequestId': '264NJX0P3E6S30XC',
  'HostId': 'f2D299jfsY7m2Kux15Zj0CggcAXjh0Rnz3ATU/y13ILnpb/zeJnky/W5phnl+chRKSdX2EiGhyo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'f2D299jfsY7m2Kux15Zj0CggcAXjh0Rnz3ATU/y13ILnpb/zeJnky/W5phnl+chRKSdX2EiGhyo=',
   'x-amz-request-id': '264NJX0P3E6S30XC',
   'date': 'Mon, 22 May 2023 11:48:13 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ServerSideEncryption': 'AES256'}

In [18]:
extreme_future_time = (
        datetime(year=2999, month=1, day=1, hour=1, minute=1)
            .strftime("%Y%m%d%H%M")
    )

s3 = boto3.resource('s3')
bucket_name = 'raw-dev-wm'
object_key = f"raw/claim_db/claim/full/{extreme_future_time}/"

object_key = f"raw/claim_db/claim/full/{extreme_future_time}/"
s3.Object(bucket_name, object_key).put(Body='')

# Clean up the mock S3
# s3.Object(bucket_name, object_key).delete()

{'ResponseMetadata': {'RequestId': '4NTT4QTT5ZNDEH4G',
  'HostId': '+Re7UqSEY1NJMULgzxffzQeQkO05auSLathgcukr+1siIw82Ia+UlfVkJLU+T33o7kg9xPAcIMs=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+Re7UqSEY1NJMULgzxffzQeQkO05auSLathgcukr+1siIw82Ia+UlfVkJLU+T33o7kg9xPAcIMs=',
   'x-amz-request-id': '4NTT4QTT5ZNDEH4G',
   'date': 'Mon, 22 May 2023 12:28:22 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
 'ServerSideEncryption': 'AES256'}

In [19]:
def get_lexicographically_highest_subdirectory(bucket_name, prefix) -> str:
    """
    List all subdirectories under a prefix in an S3 bucket and return the 
    lexicographically highest.

    Parameters:
        bucket_name (str): The name of the S3 bucket.
        prefix (str): The prefix (i.e., "directory path") to search under.

    Returns:
        str: The lexicographically highest subdirectory under the prefix.
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(str(bucket_name))
    
    subdirs = set()
    for obj in bucket.objects.filter(Prefix=prefix):
        subdir = '/'.join(obj.key.split('/')[-1:])
        subdirs.add(subdir)
    
    return max(subdirs) + '/' if subdirs else 'did_not_work'

In [2]:
from pyspark.sql import SparkSession

from etl.jobs.access.provider import input_path, output_path

print(input_path)


ImportError: cannot import name 'output_path' from 'etl.jobs.access.provider' (/workspaces/ETL-TDD/etl/jobs/access/provider.py)

In [2]:
spark.read.parquet(input_path).show()

                                                                                

+-----------+--------------------+--------------------+---------------------+----------------------+--------------------+-----------------------+--------------------+--------------------+-----+--------+
|provider_id|       provider_name|             address|provider_phone_number|provider_email_address|       provider_type|provider_license_number|              street|              suburb|state|postcode|
+-----------+--------------------+--------------------+---------------------+----------------------+--------------------+-----------------------+--------------------+--------------------+-----+--------+
|     558433|        Cox and Sons|695 Ryan Nook, St...|      +61 2 4485 8099|  rmcdowell@example...|          Pharmacist|               82002543|       695 Ryan Nook|         St. Patrick|  VIC|    2697|
|     739426|        Dillon-Myers|606 Hamilton Circ...|      +61-3-9481-5525|    wmccoy@example.org|General Practitioner|               26265469| 606 Hamilton Circle|          Munozburgh| 

In [4]:
df.show()

                                                                                

+-----------+--------------------+--------------------+---------------------+----------------------+--------------------+-----------------------+
|provider_id|       provider_name|    provider_address|provider_phone_number|provider_email_address|       provider_type|provider_license_number|
+-----------+--------------------+--------------------+---------------------+----------------------+--------------------+-----------------------+
|     558433|        Cox and Sons|695 Ryan Nook, St...|      +61 2 4485 8099|  rmcdowell@example...|          Pharmacist|               82002543|
|     739426|        Dillon-Myers|606 Hamilton Circ...|      +61-3-9481-5525|    wmccoy@example.org|General Practitioner|               26265469|
|     849574|      Thompson-Silva|Apt. 486 646 Mich...|      +61-416-882-223|  jeremyfox@example...|          Specialist|               47912254|
|     631140|Fitzgerald, Moran...|Level 2 85 Rhonda...|         0446 612 209|  terrydeborah@exam...|     Physiotherapist|   

In [9]:
def arguments(hi, hello):
    a = 3
    print(locals())

In [10]:
arguments('hi', 'hello')

{'hi': 'hi', 'hello': 'hello', 'a': 3}


In [69]:
s3 = boto3.resource('s3')
bucket = s3.Bucket(str('raw-dev-wm'))
prefix = 'raw/claim_db/claim/full/'

candidate_dates = []
for obj in bucket.objects.filter(Prefix=prefix):
    
    subdirs = obj.key.split('/')
    leaf = subdirs[-2:][0]
    if leaf.isdigit():
        candidate_dates.append(leaf)
        
candidate_dates.append('39990101010')

sorted(candidate_dates)[-1]

'39990101010'

In [12]:
from etl.addresses import create_path
from etl.addresses import Bucket, Source, Tier, Table, Environment, Load

create_path(
            environment=Environment.PROD,
            bucket=Bucket.TEST,
            tier=Tier.LANDING,
            source=Source.CLAIM_DB,
            table=Table.CLAIM,
            load=Load.FULL,
            time_required='recent',
            file_extension='.csv'
        )

's3://test-dev-wm/landing/claim_db/claim/full/202306211851/.csv'

In [13]:
from etl.addresses import get_timestamp_of_most_recently_created_file


get_timestamp_of_most_recently_created_file(bucket_name=Bucket.TEST,stem='landing/claim_db/claim/full/')

'202306211851'