In [22]:
import boto3
import datetime
import json

In [8]:
# Create bucket

"""
    region = 'ap-southeast-1' 
"""
s3 = boto3.client('s3')

bucket_name = f"my-data-lake-{datetime.datetime.now().strftime('%Y-%m-%d')}"
s3.create_bucket(Bucket=bucket_name,
                 CreateBucketConfiguration={'LocationConstraint': 'ap-southeast-1'})
print(f"Bucket {bucket_name} created successfully.")

Bucket my-data-lake-2026-02-08 created successfully.


In [9]:
# Create versioning on the bucket
s3.put_bucket_versioning(
    Bucket=bucket_name,
    VersioningConfiguration={
        'Status': 'Enabled'
    }
)
print(f"Versioning enabled on bucket {bucket_name}.")

# Fungsi versioning: menyimpan beberapa versi dari objek yang sama di dalam bucket S3.

Versioning enabled on bucket my-data-lake-2026-02-08.


In [12]:
"""
    echo "test data v1" > test.txt
    aws s3 cp test.txt s3://my-data-lake-*/test.txt

    # 4. Overwrite file (creates new version)
    echo "test data v2" > test.txt
    aws s3 cp test.txt s3://my-data-lake-*/test.txt
"""
!echo "test data v1" > test.txt
!aws s3 cp test.txt s3://{bucket_name}/test.txt

upload: ./test.txt to s3://my-data-lake-2026-02-08/test.txt      


In [13]:
!echo "test data v2" > test.txt
!aws s3 cp test.txt s3://{bucket_name}/test.txt

upload: ./test.txt to s3://my-data-lake-2026-02-08/test.txt      


In [15]:
!aws s3api list-object-versions --bucket {bucket_name}

{
    "Versions": [
        {
            "ETag": "\"750e8f9c50d5ea3842aaca37a510d2cd\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 13,
            "StorageClass": "STANDARD",
            "Key": "test.txt",
            "VersionId": "wMZDvu2Vx63BPiOb7Y8vR2c2fjNShO0i",
            "IsLatest": true,
            "LastModified": "2026-02-08T11:40:47+00:00",
            "Owner": {
                "ID": "9ecb2983ed608beb3016b33508a99a71b42c786f228e1f4a77061b0ee92ee5f2"
            }
        },
        {
            "ETag": "\"650bcb00b97ecfc4752442c138d3b4c4\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 13,
            "StorageClass": "STANDARD",
            "Key": "test.txt",
            "VersionId": "0sZuzXFvKUJQizvt2hksVAAjo0tCsvHl",
            "IsLatest": false,
            "LastModified": "20

In [None]:
# Lifecycle policy

lifecycle_json = {
  "Rules": [
    {
      "ID": "TransitionOldData",
      "Filter": {"Prefix": ""},
      "Status": "Enabled",
      "Transitions": [
        {
          "Days": 30,
          "StorageClass": "STANDARD_IA"
        },
        {
          "Days": 90,
          "StorageClass": "GLACIER_IR"
        }
      ],
      "Expiration": {
        "Days": 365
      }
    }
  ]
}

s3.put_bucket_lifecycle_configuration(
    Bucket=bucket_name,
    LifecycleConfiguration=lifecycle_json
)

# Verify lifecycle policy
lifecycle = s3.get_bucket_lifecycle_configuration(Bucket=bucket_name)
print("Lifecycle configuration:")
for rule in lifecycle['Rules']:
    print(rule)

Lifecycle configuration:
{'Expiration': {'Days': 365}, 'ID': 'TransitionOldData', 'Filter': {'Prefix': ''}, 'Status': 'Enabled', 'Transitions': [{'Days': 30, 'StorageClass': 'STANDARD_IA'}, {'Days': 90, 'StorageClass': 'GLACIER_IR'}]}


In [24]:
bucket_policy_json = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Deny",
      "Principal": "*",
      "Action": "s3:GetObject",
      # PENTING: Nama bucket harus spesifik, tidak boleh pakai * di tengah nama bucket
      "Resource": [
          f"arn:aws:s3:::{bucket_name}",      # Akses ke bucket itu sendiri
          f"arn:aws:s3:::{bucket_name}/*"    # Akses ke objek di dalam bucket
      ],
      "Condition": {
        "StringNotLike": {
          "aws:userid": "*:my-lambda-role"
        }
      }
    }
  ]
}

s3.put_bucket_policy(
    Bucket=bucket_name,
    Policy=json.dumps(bucket_policy_json)
)
print(f"Bucket policy applied to bucket {bucket_name}.")

s3.put_public_access_block(
    Bucket=bucket_name,
    PublicAccessBlockConfiguration={
        'BlockPublicAcls': True,
        'IgnorePublicAcls': True,
        'BlockPublicPolicy': True,
        'RestrictPublicBuckets': True
    }
)
print(f"Public access block applied to bucket {bucket_name}.")


Bucket policy applied to bucket my-data-lake-2026-02-08.
Public access block applied to bucket my-data-lake-2026-02-08.


In [27]:
# Create folders (prefixes)
!aws s3api put-object --bucket {bucket_name} --key bronze/
!aws s3api put-object --bucket {bucket_name} --key silver/
!aws s3api put-object --bucket {bucket_name} --key gold/
print("Folders created successfully.")

# Create source subfolders
!aws s3api put-object --bucket {bucket_name} --key bronze/ecommerce/2024-01-15/
!aws s3api put-object --bucket {bucket_name} --key bronze/api-logs/2024-01-15/
print("Source subfolders created successfully.")

{
    "Expiration": "expiry-date=\"Tue, 09 Feb 2027 00:00:00 GMT\", rule-id=\"TransitionOldData\"",
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
    "ChecksumCRC64NVME": "AAAAAAAAAAA=",
    "ChecksumType": "FULL_OBJECT",
    "ServerSideEncryption": "AES256",
    "VersionId": "4Pnho7YCvbjSb.8423.VM.YzRBVZnEDv"
}
{
    "Expiration": "expiry-date=\"Tue, 09 Feb 2027 00:00:00 GMT\", rule-id=\"TransitionOldData\"",
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
    "ChecksumCRC64NVME": "AAAAAAAAAAA=",
    "ChecksumType": "FULL_OBJECT",
    "ServerSideEncryption": "AES256",
    "VersionId": "5_fo5HCxNWylfa7s6K8mqa1TJfArGaRG"
}
{
    "Expiration": "expiry-date=\"Tue, 09 Feb 2027 00:00:00 GMT\", rule-id=\"TransitionOldData\"",
    "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
    "ChecksumCRC64NVME": "AAAAAAAAAAA=",
    "ChecksumType": "FULL_OBJECT",
    "ServerSideEncryption": "AES256",
    "VersionId": "aWsptxoqH8DkSOCjLyXmvgX2KaDUYyEo"
}
Folders created successfully.
{
    "Exp

In [36]:
customers_csv_path = "..//data//raw//customers//2024-01-15//customers.csv"
orders_csv_path = "..//data//raw//ecommerce//2024-01-15//orders.csv"
# Upload sample data
!aws s3 cp {orders_csv_path} s3://{bucket_name}/bronze/ecommerce/2024-01-15/orders.csv
!aws s3 cp {customers_csv_path} s3://{bucket_name}/bronze/customers/2024-01-15/customers.csv
print("Sample data uploaded successfully.")

# Verify structure
!aws s3 ls s3://{bucket_name} --recursive

upload: ../data/raw/ecommerce/2024-01-15/orders.csv to s3://my-data-lake-2026-02-08/bronze/ecommerce/2024-01-15/orders.csv
upload: ../data/raw/customers/2024-01-15/customers.csv to s3://my-data-lake-2026-02-08/bronze/customers/2024-01-15/customers.csv
Sample data uploaded successfully.
2026-02-08 18:55:29          0 bronze/
2026-02-08 18:55:38          0 bronze/api-logs/2024-01-15/
2026-02-08 18:59:54       1374 bronze/customers/2024-01-15/customers.csv
2026-02-08 18:55:36          0 bronze/ecommerce/2024-01-15/
2026-02-08 18:59:51       1350 bronze/ecommerce/2024-01-15/orders.csv
2026-02-08 18:55:33          0 gold/
2026-02-08 18:55:32          0 silver/
2026-02-08 18:40:47         13 test.txt


# Partitioning Strategy

Bronze Layer:
s3://lake/bronze/{source}/{year}/{month}/{day}/data.csv

Silver Layer:
s3://lake/silver/{entity}/year={YYYY}/month={MM}/day={DD}/data.parquet

Gold Layer:
s3://lake/gold/fact_{entity}/year={YYYY}/month={MM}/day={DD}/facts.parquet

Benefits:
- Bronze: Easy source identification, full history
- Silver: Hive-partitioned (Athena optimized)
- Gold: Optimized for analytical queries
- Cost: Query only partitions needed (scan less data)

Example Query:
SELECT * FROM gold_fact_orders 
WHERE year=2024 AND month=01 
â†’ Only scans gold/fact_orders/year=2024/month=01/ (fast!)

In [40]:
# IAM create user
iam = boto3.client('iam')

iam.create_user(
    UserName='john_data_engineer'
)
print("IAM user 'john_data_engineer' created successfully.")

iam.create_access_key(
    UserName='john_data_engineer'
)
print("Access key for 'john_data_engineer' created successfully.")

IAM user 'john_data_engineer' created successfully.
Access key for 'john_data_engineer' created successfully.


In [41]:
data_engineer_policy = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "ListDataLakeBucket",
      "Effect": "Allow",
      "Action": "s3:ListBucket",
      "Resource": "arn:aws:s3:::my-data-lake-*"
    },
    {
      "Sid": "ReadBronzeSilver",
      "Effect": "Allow",
      "Action": "s3:GetObject",
      "Resource": [
        "arn:aws:s3:::my-data-lake-*/bronze/*",
        "arn:aws:s3:::my-data-lake-*/silver/*"
      ]
    },
    {
      "Sid": "WriteSilver",
      "Effect": "Allow",
      "Action": "s3:PutObject",
      "Resource": "arn:aws:s3:::my-data-lake-*/silver/*"
    }
  ]
}

iam.put_user_policy(
    UserName='john_data_engineer',
    PolicyName='DataEngineerS3AccessPolicy',
    PolicyDocument=json.dumps(data_engineer_policy)
)
print("Inline policy 'DataEngineerS3AccessPolicy' attached to user 'john_data_engineer'.")

Inline policy 'DataEngineerS3AccessPolicy' attached to user 'john_data_engineer'.
