In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Obtain the SparkContext for this session (creating one only if none exists)
# and wrap it in the GlueContext that the rest of the notebook uses.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.35 
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::087763889191:role/service-role/AWSGlueServiceRole-imba-nining
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: bbe23e31-eb6f-4608-9ffe-e330843f2efe
Applying the following default arguments:
--glue_kernel_version 0.35
--enable-glue-datacatalog true
Waiting for session bbe23e31-eb6f-4608-9ffe-e330843f2efe to get into ready status...
Session bbe23e31-eb6f-4608-9ffe-e330843f2efe has been created




In [2]:
# The source data is expected to have been crawled into the Glue Data Catalog
# already; this is the catalog database that holds the raw tables.
RAW_DATABASE = "raw data zone"


def load_catalog_table(table_name, database=RAW_DATABASE, show_type=False):
    """Load one Data Catalog table as a DynamicFrame and print a short summary.

    Args:
        table_name: Name of the table inside ``database``.
        database: Data Catalog database to read from.
        show_type: When true, also print the Python type of the loaded frame.

    Returns:
        The frame returned by ``glueContext.create_dynamic_frame.from_catalog``.
    """
    frame = glueContext.create_dynamic_frame.from_catalog(
        database=database, table_name=table_name
    )
    print("Count: " + str(frame.count()))
    if show_type:
        print(type(frame))
    frame.printSchema()
    return frame


orders = load_catalog_table("orders", show_type=True)
products = load_catalog_table("products")
order_products = load_catalog_table("order_products")

Count: 39
<class 'awsglue.dynamicframe.DynamicFrame'>
root
|-- order_id: long
|-- user_id: long
|-- eval_set: string
|-- order_number: long
|-- order_dow: long
|-- order_hour_of_day: long
|-- days_since_prior_order: long

Count: 49688
root
|-- product_id: long
|-- product_name: string
|-- aisle_id: long
|-- department_id: long

Count: 384
root
|-- order_id: long
|-- product_id: long
|-- add_to_cart_order: long
|-- reordered: long
|-- partition_0: string


In [4]:
# Convert each loaded DynamicFrame to a Spark DataFrame so the DataFrame API
# (show, distinct, join, ...) can be used in the following cells.
df_orders = orders.toDF()
df_products = products.toDF()
df_order_products = order_products.toDF()

# Preview the first five rows of every table, in the order they were loaded.
df_orders.show(n=5)
df_products.show(n=5)
df_order_products.show(n=5)

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                    15|
|  473747|      1|   prior|           3|        3|               12|                    21|
| 2254736|      1|   prior|           4|        4|                7|                    29|
|  431534|      1|   prior|           5|        4|               15|                    28|
+--------+-------+--------+------------+---------+-----------------+----------------------+
only showing top 5 rows

+----------+--------------------+--------+-------------+
|product_id|        product_name|aisle_id|department_id|
+----------+--------------------+

In [6]:
# Data cleaning -- only deduplication is implemented here to demonstrate the cleaning step; add other cleaning methods as needed.
# Namespace mapping captured where this cell runs; used to look up the name
# under which an object is bound.
loc = locals()


def get_variable_name(variable):
    """Return the first name in ``loc`` bound to exactly this object.

    The comparison is by identity (``is``), not equality. When several names
    refer to the same object, the first one in the mapping's iteration order
    wins. Returns ``None`` when no name is bound to ``variable``.
    """
    return next(
        (name for name, bound in loc.items() if bound is variable),
        None,
    )


# def getString_name(df_name):
#      return list(dict(df_name = df_name).keys())[0]#,type(list(dict(df_name = df_name).values())[0])

def deduplicate_df(df_name):
    """Remove exact duplicate rows from a DataFrame and report what was found.

    Args:
        df_name: A frame exposing ``count()``, ``distinct()`` and
            ``dropDuplicates()`` (the Spark DataFrames built in this notebook).

    Returns:
        ``df_name.dropDuplicates()`` when duplicate rows exist, otherwise the
        input frame itself, unchanged.
    """
    # Resolve the display name first, while df_name is still the caller's object.
    label = str(get_variable_name(df_name))

    # Each count() is a separate Spark action, so evaluate both counts once
    # and reuse them instead of recomputing them for the message.
    total_rows = df_name.count()
    distinct_rows = df_name.distinct().count()

    if total_rows != distinct_rows:
        print(label + ' have duplicated row:', total_rows - distinct_rows)
        df_name = df_name.dropDuplicates()
        print('duplicated data has been cleared')
    else:
        print(label + ' hasn\'t duplication, the totle distinct row', total_rows)
    return df_name

# Deduplicate the orders table and preview the first five cleaned rows.
df_orders_clean = deduplicate_df(df_orders)
df_orders_clean.show(n=5)

# Deduplicate the two remaining tables (report only, no preview).
df_products_clean = deduplicate_df(df_products)
df_order_products_clean = deduplicate_df(df_order_products)

df_orders hasn't duplication, the totle distinct row 39
+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                    15|
|  473747|      1|   prior|           3|        3|               12|                    21|
| 2254736|      1|   prior|           4|        4|                7|                    29|
|  431534|      1|   prior|           5|        4|               15|                    28|
+--------+-------+--------+------------+---------+-----------------+----------------------+
only showing top 5 rows

df_products hasn't duplication, the totle distinct row 49688
df_order_products hasn't dupli

In [None]:
# Write the cleaned products table to the data lake as Parquet.
# repartition(1) collapses the frame to a single partition before writing.
df_products_clean = df_products_clean.repartition(1)
# "overwrite" lets this cell be re-run: the default save mode raises an error
# when the target path already exists.
df_products_clean.write.mode("overwrite").parquet('s3://project-imba/data-lake/raw-data/products_clean')

In [10]:
# Build the wide table: one row per (order, product) line, enriched with the
# product's aisle and department, using the deduplicated frames.
wide_table_for_ML = (
    df_orders_clean.select("order_id", "user_id", "order_number")
    # Joining on the column *name* yields a single key column; for a left join
    # it carries the left-side value, so an order without any matching product
    # row keeps its order_id instead of getting a null from the right side.
    .join(df_order_products_clean, on="order_id", how="left")
    .join(df_products_clean, on="product_id", how="left")
    # The explicit select fixes the column order and drops everything else
    # (e.g. reordered, product_name).
    .select(
        "user_id",
        "order_id",
        "order_number",
        "department_id",
        "aisle_id",
        "product_id",
        "add_to_cart_order",
        "partition_0",
    )
    .orderBy("user_id", "order_number", "add_to_cart_order")
)






In [12]:
# Print up to 503 rows of the wide table; long cell values are truncated.
wide_table_for_ML.show(n=503, truncate=True)

+-------+--------+------------+-------------+--------+----------+-----------------+--------------------+
|user_id|order_id|order_number|department_id|aisle_id|product_id|add_to_cart_order|         partition_0|
+-------+--------+------------+-------------+--------+----------+-----------------+--------------------+
|      1| 2539329|           1|            7|      77|       196|                1|order_products_prior|
|      1| 2539329|           1|           16|      91|     14084|                2|order_products_prior|
|      1| 2539329|           1|           19|      23|     12427|                3|order_products_prior|
|      1| 2539329|           1|           19|      23|     26088|                4|order_products_prior|
|      1| 2539329|           1|           17|      54|     26405|                5|order_products_prior|
|      1| 2398795|           2|            7|      77|       196|                1|order_products_prior|
|      1| 2398795|           2|           19|     117| 