In [None]:
# Parquet Crawler

In [None]:
# awswrangler can extract just the metadata from Parquet files and their partitions and then register it in the Glue Catalog.

In [None]:
import awswrangler as wr

In [None]:
import getpass

# getpass does not echo the typed value, so the bucket name is kept out of the
# saved notebook output. An explicit prompt replaces the default "Password: ".
bucket = getpass.getpass("S3 bucket name: ").strip()
if not bucket:
    # An empty answer would otherwise yield the unusable path "s3:///data/".
    raise ValueError("An S3 bucket name is required to build the dataset path.")

# Every later cell writes to, crawls and finally deletes this prefix.
path = f"s3://{bucket}/data/"

In [None]:
# Creating a Parquet Table from NOAA’s CSV files

In [None]:
# Column names applied to the NOAA CSV files when they are read.
cols = ["id", "dt", "element", "value", "m_flag", "q_flag", "s_flag", "obs_time"]

# Read 10 files from the 1890 decade (~1GB); "dt" and "obs_time" are handed to
# the date parser.
df = wr.s3.read_csv(
    path="s3://noaa-ghcn-pds/csv/by_year/189",
    names=cols,
    parse_dates=["dt", "obs_time"],
)
df

In [None]:
# The dataset is partitioned by "year", but that column is not among the ones
# read from the CSV files, so derive it from "dt" before writing.
# Assumes "dt" was parsed to datetimes by read_csv(parse_dates=...) - TODO confirm.
# Re-running this cell just recomputes the same column.
df["year"] = df["dt"].dt.year

# Write the frame as a Parquet dataset under `path`, replacing whatever is
# already there, with one partition per distinct year.
res = wr.s3.to_parquet(
    df=df,
    path=path,
    dataset=True,
    mode="overwrite",
    partition_cols=["year"],
)

In [None]:
# Crawling!

In [None]:
%%time

# Register the Parquet dataset stored under `path` as table "crawler" in the
# "awswrangler_test" database, overwriting any previous definition. The "year"
# partition column is explicitly typed as int.
res = wr.s3.store_parquet_metadata(
    path=path,
    database="awswrangler_test",
    table="crawler",
    dataset=True,
    mode="overwrite",
    dtype={"year": "int"},
)

In [None]:
# Checking

In [None]:
# Display the catalog entry that was just created for the table.
wr.catalog.table(
    database="awswrangler_test",
    table="crawler",
)

In [None]:
%%time

# Query one year of the crawled table through Athena to confirm it is usable.
wr.athena.read_sql_query(
    sql="SELECT * FROM crawler WHERE year=1890",
    database="awswrangler_test",
)

In [None]:
# Cleaning Up S3

In [None]:
# Remove the objects written under the tutorial's S3 prefix.
wr.s3.delete_objects(path=path)

In [None]:
# Cleaning Up the Database

In [None]:
# Drop every table listed in the "awswrangler_test" database, not only
# "crawler". Each listed entry is indexed by its "Name" key.
for table in wr.catalog.get_tables(database="awswrangler_test"):
    table_name = table["Name"]
    wr.catalog.delete_table_if_exists(
        database="awswrangler_test",
        table=table_name,
    )