In [None]:
# Partition Projection

In [None]:
import getpass
from datetime import datetime
import pandas as pd
import awswrangler as wr

In [None]:
# Ask for the target S3 bucket without echoing it into the notebook output.
# An explicit prompt is passed because getpass's default prompt is "Password: ",
# which does not tell the reader that a bucket name is expected.
bucket = getpass.getpass("S3 bucket name: ")

In [None]:
# Integer projection

In [None]:
# Sample frame for the integer example: a value column plus integer
# year/month/day columns.
df = pd.DataFrame(
    {
        "value": [1, 2, 3],
        "year": [2019, 2020, 2021],
        "month": [10, 11, 12],
        "day": [25, 26, 27],
    }
)
df

In [None]:
# Projection settings for the three partition columns: each one is declared as
# an "integer" projection with a "min,max" range string.
integer_projection_settings = {
    "projection_types": {"year": "integer", "month": "integer", "day": "integer"},
    "projection_ranges": {"year": "2000,2025", "month": "1,12", "day": "1,31"},
}

# Write the frame as a Parquet dataset partitioned by year/month/day and
# pass the table name, database and projection settings along with it.
wr.s3.to_parquet(
    df=df,
    path=f"s3://{bucket}/table_integer/",
    dataset=True,
    partition_cols=["year", "month", "day"],
    database="default",
    table="table_integer",
    athena_partition_projection_settings=integer_projection_settings,
)

In [None]:
# Read the whole integer-projected table back through Athena.
wr.athena.read_sql_query(
    sql="SELECT * FROM table_integer",
    database="default",
)

In [None]:
# Enum projection

In [None]:
# Sample frame for the enum example, built row by row: a value and the city
# it belongs to.
city_rows = [
    {"value": 1, "city": "São Paulo"},
    {"value": 2, "city": "Tokio"},
    {"value": 3, "city": "Seattle"},
]
df = pd.DataFrame(city_rows)
df

In [None]:
# Projection settings for the single partition column: "city" is declared as
# an "enum" projection whose allowed values are given as a comma-separated string.
enum_projection_settings = {
    "projection_types": {"city": "enum"},
    "projection_values": {"city": "São Paulo,Tokio,Seattle"},
}

# Write the frame as a Parquet dataset partitioned by city and pass the
# table name, database and projection settings along with it.
wr.s3.to_parquet(
    df=df,
    path=f"s3://{bucket}/table_enum/",
    dataset=True,
    partition_cols=["city"],
    database="default",
    table="table_enum",
    athena_partition_projection_settings=enum_projection_settings,
)

In [None]:
# Read the whole enum-projected table back through Athena.
wr.athena.read_sql_query(
    sql="SELECT * FROM table_enum",
    database="default",
)

In [None]:
# Date projection

In [None]:
def ts(x):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a ``datetime``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(x, timestamp_format)


def dt(x):
    """Parse a ``YYYY-MM-DD`` string and return only its ``date`` part."""
    date_format = "%Y-%m-%d"
    parsed = datetime.strptime(x, date_format)
    return parsed.date()


# Sample frame for the date example: "dt" holds date objects built with dt()
# and "ts" holds datetime objects built with ts().
df = pd.DataFrame(
    {
        "value": [1, 2, 3],
        "dt": [dt(day) for day in ("2020-01-01", "2020-01-02", "2020-01-03")],
        "ts": [
            ts(stamp)
            for stamp in ("2020-01-01 00:00:00", "2020-01-01 00:00:01", "2020-01-01 00:00:02")
        ],
    }
)

# Display the frame, as the other sample-data cells in this notebook do.
df

In [None]:
# Projection settings for the two partition columns: both are declared as
# "date" projections, each with a "start,end" range string.
date_projection_settings = {
    "projection_types": {"dt": "date", "ts": "date"},
    "projection_ranges": {
        "dt": "2020-01-01,2020-01-03",
        "ts": "2020-01-01 00:00:00,2020-01-01 00:00:02",
    },
}

# Write the frame as a Parquet dataset partitioned by dt/ts and pass the
# table name, database and projection settings along with it.
wr.s3.to_parquet(
    df=df,
    path=f"s3://{bucket}/table_date/",
    dataset=True,
    partition_cols=["dt", "ts"],
    database="default",
    table="table_date",
    athena_partition_projection_settings=date_projection_settings,
)

In [None]:
# Read the whole date-projected table back through Athena.
wr.athena.read_sql_query(
    sql="SELECT * FROM table_date",
    database="default",
)

In [None]:
# Injected projection

In [None]:
# Sample frame for the injected example: each value is paired with a UUID string.
uuids = [
    "761e2488-a078-11ea-bb37-0242ac130002",
    "b89ed095-8179-4635-9537-88592c0f6bc3",
    "87adc586-ce88-4f0a-b1c8-bf8e00d32249",
]
df = pd.DataFrame({"value": [1, 2, 3], "uuid": uuids})

df

In [None]:
# Projection settings for the single partition column: "uuid" is declared as
# an "injected" projection, so no ranges or values are listed here.
injected_projection_settings = {
    "projection_types": {"uuid": "injected"},
}

# Write the frame as a Parquet dataset partitioned by uuid and pass the
# table name, database and projection settings along with it.
wr.s3.to_parquet(
    df=df,
    path=f"s3://{bucket}/table_injected/",
    dataset=True,
    partition_cols=["uuid"],
    database="default",
    table="table_injected",
    athena_partition_projection_settings=injected_projection_settings,
)

In [None]:
# Query the injected-projection table, filtering on one specific uuid value.
injected_query = "SELECT * FROM table_injected WHERE uuid='b89ed095-8179-4635-9537-88592c0f6bc3'"
wr.athena.read_sql_query(sql=injected_query, database="default")

In [None]:
# Cleaning Up

In [None]:
# Delete the objects under each example table's S3 prefix, in the order the
# tables were created.
for table_name in ("table_integer", "table_enum", "table_date", "table_injected"):
    wr.s3.delete_objects(f"s3://{bucket}/{table_name}/")

In [None]:
# Drop the four example tables from the "default" database, if they exist.
wr.catalog.delete_table_if_exists(table="table_integer", database="default")
wr.catalog.delete_table_if_exists(table="table_enum", database="default")
wr.catalog.delete_table_if_exists(table="table_date", database="default")
wr.catalog.delete_table_if_exists(table="table_injected", database="default")