In [None]:
# awswrangler has 3 different write modes to store CSV Datasets on Amazon S3.

# append (Default)

# Only adds new files, without deleting anything.

# overwrite

# Deletes everything in the target directory and then adds new files.

# overwrite_partitions (Partition Upsert)

# Deletes only the paths of the partitions that should be updated and then writes the new partition files. It’s like a “partition upsert”.

In [None]:
from datetime import date
import pandas as pd
import awswrangler as wr

In [None]:
# Enter your bucket name

In [None]:
import getpass

# The bucket name is read interactively so it is never hardcoded in the notebook.
# An explicit prompt replaces getpass's default "Password: " text, which is
# misleading when the value being requested is a bucket name.
bucket = getpass.getpass(prompt="S3 bucket name: ").strip()
if not bucket:
    # An empty name would otherwise produce the invalid path "s3:///dataset/".
    raise ValueError("An S3 bucket name is required to build the dataset path.")

# S3 prefix used as the `path` argument by the write cells below.
path = f"s3://{bucket}/dataset/"

In [None]:
# Checking/Creating Glue Catalog Databases

In [None]:
# Create the Glue database used by this notebook when it is not listed yet.
# Only the "Database" column is inspected: checking the whole frame would also
# match a database whose description happens to equal "awswrangler_test".
if "awswrangler_test" not in wr.catalog.databases()["Database"].values:
    wr.catalog.create_database("awswrangler_test")

In [None]:
# Creating the Dataset

In [None]:
# Seed data: two rows, one per calendar day.
df = pd.DataFrame(
    data={
        "id": [1, 2],
        "value": ["foo", "boo"],
        "date": [date(2020, 1, 1), date(2020, 1, 2)],
    }
)

# Write the frame as a CSV dataset under `path` in "overwrite" mode, so the
# target prefix starts from a known state. `database`/`table` name the
# catalog table that the Athena read below queries.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    mode="overwrite",
    index=False,
    database="awswrangler_test",
    table="csv_dataset",
)

# Last expression of the cell: the table content read back through Athena.
wr.athena.read_sql_table(table="csv_dataset", database="awswrangler_test")

In [None]:
# Appending

In [None]:
# A single additional row to add on top of the existing dataset.
df = pd.DataFrame(
    data={
        "id": [3],
        "value": ["bar"],
        "date": [date(2020, 1, 3)],
    }
)

# "append" mode: new files are added under `path`; nothing is deleted.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    mode="append",
    index=False,
    database="awswrangler_test",
    table="csv_dataset",
)

# Last expression of the cell: the table content read back through Athena.
wr.athena.read_sql_table(table="csv_dataset", database="awswrangler_test")

In [None]:
# Overwriting

In [None]:
# No new frame is built here: `df` is whatever the previous cell left in the
# kernel. "overwrite" mode deletes everything under `path` before writing it.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    mode="overwrite",
    index=False,
    database="awswrangler_test",
    table="csv_dataset",
)

# Last expression of the cell: the table content read back through Athena.
wr.athena.read_sql_table(table="csv_dataset", database="awswrangler_test")

In [None]:
# Creating a Partitioned Dataset

In [None]:
# Two rows with distinct dates, so the write below produces two partitions.
df = pd.DataFrame(
    data={
        "id": [1, 2],
        "value": ["foo", "boo"],
        "date": [date(2020, 1, 1), date(2020, 1, 2)],
    }
)

# Rebuild the dataset from scratch ("overwrite"), this time partitioned by
# the "date" column.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    partition_cols=["date"],
    mode="overwrite",
    index=False,
    database="awswrangler_test",
    table="csv_dataset",
)

# Last expression of the cell: the table content read back through Athena.
wr.athena.read_sql_table(table="csv_dataset", database="awswrangler_test")

In [None]:
# Upserting partitions

In [None]:
# One row for a date written by the previous cell (2020-01-02) and one row
# for a new date (2020-01-03).
df = pd.DataFrame(
    data={
        "id": [2, 3],
        "value": ["xoo", "bar"],
        "date": [date(2020, 1, 2), date(2020, 1, 3)],
    }
)

# "overwrite_partitions" mode: only the partition paths present in `df` are
# deleted and rewritten.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    partition_cols=["date"],
    mode="overwrite_partitions",
    index=False,
    database="awswrangler_test",
    table="csv_dataset",
)

# Last expression of the cell: the table content read back through Athena.
wr.athena.read_sql_table(table="csv_dataset", database="awswrangler_test")

In [None]:
# BONUS - Glue/Athena integration

In [None]:
# Sample frame for the Glue/Athena integration example.
df = pd.DataFrame(
    data={
        "id": [1, 2],
        "value": ["foo", "boo"],
        "date": [date(2020, 1, 1), date(2020, 1, 2)],
    }
)

# Write gzip-compressed CSV files and register them as table "my_table".
# The database is "awswrangler_test" because it is the only database this
# notebook checks for and creates; targeting any other database would make
# the cell depend on state set up outside the notebook.
# NOTE: this reuses the same `path` in "overwrite" mode, so files written by
# the earlier cells under that prefix are replaced.
wr.s3.to_csv(
    df=df,
    path=path,
    dataset=True,
    index=False,
    mode="overwrite",
    database="awswrangler_test",
    table="my_table",
    compression="gzip",
)

# Query the new table with SQL through Athena; as the last expression, the
# resulting frame is the cell output.
wr.athena.read_sql_query("SELECT * FROM my_table", database="awswrangler_test")