# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the SparkContext that is already active, or create one if none exists.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; the cells below use it for catalog
# reads (create_dynamic_frame.from_catalog) and for the S3 sink (getSink).
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to this context.
# NOTE(review): `job` is not referenced again in the cells visible here.
job = Job(glueContext)

#### Create a DynamicFrame from an AWS Glue Data Catalog table that you have been granted access to through Lake Formation, and display its schema


In [None]:
# Read the household-demographics table from the Glue Data Catalog into a
# DynamicFrame; `dyf` is consumed by the conversion cell that follows.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database="tpc",
    table_name="dl_tpc_household_demographics",
)

# Print the schema of the frame that was just loaded.
dyf.printSchema()

#### Convert the DynamicFrame to a Spark DataFrame and display the rows whose `hd_buy_potential` is `'>10000'`

In [None]:
# Convert the DynamicFrame to a Spark DataFrame, project the five columns of
# interest, and keep only the rows whose buy potential is '>10000'.
#
# DataFrame transformations return a NEW DataFrame instead of modifying the
# receiver, so the result has to be assigned: a bare `df.select(...)` statement
# is a no-op, and a `where(...)` that is only shown never reaches later cells.
# `df` is the frame the write cell below persists into the
# "..._above10000" table, so it must hold the filtered rows rather than the
# whole source table.
df = (
    dyf.toDF()
    .select(
        'hd_buy_potential',
        'hd_income_band_sk',
        'hd_demo_sk',
        'hd_dep_count',
        'hd_vehicle_count',
    )
    .where("hd_buy_potential = '>10000'")
)

# Display the filtered rows (show() prints only the leading rows of the frame).
df.show()

#### Write the data in the DynamicFrame to a location in Amazon S3 and create a table for it in the AWS Glue Data Catalog

In [None]:
# Wrap the Spark DataFrame in a DynamicFrame so it can be handed to a Glue sink.
DyF1 = DynamicFrame.fromDF(df, glueContext, "DyF1")

# Build an S3 sink that is configured to update the Data Catalog
# (enableUpdateCatalog / updateBehavior) as part of the write.
s3output = glueContext.getSink(
    connection_type="s3",
    path="s3://${BUCKET_NAME}/gluenotebook/curated/",                     # <------- PLEASE REPLACE ONLY THE ${BUCKET_NAME} HERE (Keep the "s3://" and the final "/" part)
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],
    compression="snappy",
    transformation_ctx="s3output",
)

# Catalog database and table that the sink should register the output under.
s3output.setCatalogInfo(
    catalogDatabase="tpc",
    catalogTableName="dl_tpc_household_demographics_above10000",
)

# Select the Glue Parquet writer, then write the frame through the sink.
s3output.setFormat("glueparquet")
s3output.writeFrame(DyF1)

#### Create a DynamicFrame from an AWS Glue Data Catalog table that you have not been granted access to through Lake Formation, and display its schema

In [None]:
# Attempt to read the customer-address table from the Glue Data Catalog into a
# second DynamicFrame.
dyf2 = glueContext.create_dynamic_frame.from_catalog(
    database="tpc",
    table_name="dl_tpc_customer_address",
)

# Print the schema of the frame that was just loaded.
dyf2.printSchema()