In [0]:
# File location and type
file_location = "s3://pipeline-demo-csv/orders.csv"
file_type = "CSV"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_order = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)
df_order.printSchema()
df_order.show(10)
#display(df_order)

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                  15.0|
|  473747|      1|   prior|           3|        3|               12|                  21.0|
| 2254736|      1|   prior|           4|        4|                7|                  29.0|
|  431534|      1|   prior|           

In [0]:
# Create a view or table

temp_table_name = "orders"

df1_order = df_order.createOrReplaceTempView(temp_table_name)

df_order = df1_order

In [0]:
%sql
SELECT * from orders

order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2539329,1,prior,1,2,8,
2398795,1,prior,2,3,7,15.0
473747,1,prior,3,3,12,21.0
2254736,1,prior,4,4,7,29.0
431534,1,prior,5,4,15,28.0
3367565,1,prior,6,2,7,19.0
550135,1,prior,7,1,9,20.0
3108588,1,prior,8,1,14,14.0
2295261,1,prior,9,1,16,0.0
2550362,1,prior,10,4,8,30.0


In [0]:
# File location and type
file_location = "s3://pipeline-demo-csv/products.csv"
file_type = "CSV"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_products = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df_products.printSchema()
display(df_products)

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)



product_id,product_name,aisle_id,department_id
1,Chocolate Sandwich Cookies,61,19
2,All-Seasons Salt,104,13
3,Robust Golden Unsweetened Oolong Tea,94,7
4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,1
5,Green Chile Anytime Sauce,5,13
6,Dry Nose Oil,11,11
7,Pure Coconut Water With Orange,98,7
8,Cut Russet Potatoes Steam N' Mash,116,1
9,Light Strawberry Blueberry Yogurt,120,16
10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [0]:
temp_table_name = "products"

df_products.createOrReplaceTempView(temp_table_name)



In [0]:
%sql

SELECT * FROM products

product_id,product_name,aisle_id,department_id
1,Chocolate Sandwich Cookies,61,19
2,All-Seasons Salt,104,13
3,Robust Golden Unsweetened Oolong Tea,94,7
4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,1
5,Green Chile Anytime Sauce,5,13
6,Dry Nose Oil,11,11
7,Pure Coconut Water With Orange,98,7
8,Cut Russet Potatoes Steam N' Mash,116,1
9,Light Strawberry Blueberry Yogurt,120,16
10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [0]:
file_location = "s3://pipeline-demo-csv/departments.csv"
file_type = "CSV"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_departments = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df_departments.printSchema()
display(df_departments)

root
 |-- department_id: integer (nullable = true)
 |-- department: string (nullable = true)



department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [0]:
temp_table_name = "departments"

df_departments.createOrReplaceTempView(temp_table_name)

In [0]:
%sql
SELECT * FROM departments

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [0]:
file_location = "s3://pipeline-demo-csv/aisles.csv"
file_type = "CSV"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_aisles = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df_aisles.printSchema()
display(df_aisles)

root
 |-- aisle_id: integer (nullable = true)
 |-- aisle: string (nullable = true)



aisle_id,aisle
1,prepared soups salads
2,specialty cheeses
3,energy granola bars
4,instant foods
5,marinades meat preparation
6,other
7,packaged meat
8,bakery desserts
9,pasta sauce
10,kitchen supplies


In [0]:
temp_table_name = "aisles"

df_aisles.createOrReplaceTempView(temp_table_name)

In [0]:
%sql
SELECT * FROM aisles

aisle_id,aisle
1,prepared soups salads
2,specialty cheeses
3,energy granola bars
4,instant foods
5,marinades meat preparation
6,other
7,packaged meat
8,bakery desserts
9,pasta sauce
10,kitchen supplies


In [0]:
%sql 
CREATE TABLE allproductsinfo_table AS
(SELECT p.product_id, p.product_name, p.aisle_id, a.aisle, p.department_id
FROM products_table p
LEFT JOIN aisles_table a ON p.aisle_id = a.aisle_id
ORDER BY p.product_id
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-1981344441789327>[0m in [0;36m<cell line: 1>[0;34m()[0m
[1;32m      5[0m     [0mdisplay[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      6[0m     [0;32mreturn[0m [0mdf[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 7[0;31m   [0m_sqldf[0m [0;34m=[0m [0m____databricks_percent_sql[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      8[0m [0;32mfinally[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      9[0m   [0;32mdel[0m [0m____databricks_percent_sql[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-1981344441789327>[0m in [0;36m____databricks_percent_sql[0;34m()[0m
[1;32m      2[0m   [0;32mdef[0m [0m____databricks_percent_sql[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m     [0;32mimport

In [0]:
%sql
SELECT* FROM allproductsinfo_table

product_id,product_name,aisle_id,aisle,department_id
1,Chocolate Sandwich Cookies,61,cookies cakes,19
2,All-Seasons Salt,104,spices seasonings,13
3,Robust Golden Unsweetened Oolong Tea,94,tea,7
4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,frozen meals,1
5,Green Chile Anytime Sauce,5,marinades meat preparation,13
6,Dry Nose Oil,11,cold flu allergy,11
7,Pure Coconut Water With Orange,98,juice nectars,7
8,Cut Russet Potatoes Steam N' Mash,116,frozen produce,1
9,Light Strawberry Blueberry Yogurt,120,yogurt,16
10,Sparkling Orange Juice & Prickly Pear Beverage,115,water seltzer sparkling water,7


In [0]:
%sql
CREATE TABLE pad_table AS(
SELECT t.product_id, t.product_name, t.aisle_id, t.aisle, t.department_id, d.department
FROM allproductsinfo_table t
LEFT JOIN departments_table d on t.department_id = d.department_id
ORDER BY t.product_id
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
 SELECT* FROM pad_table

product_id,product_name,aisle_id,aisle,department_id,department
1,Chocolate Sandwich Cookies,61,cookies cakes,19,snacks
2,All-Seasons Salt,104,spices seasonings,13,pantry
3,Robust Golden Unsweetened Oolong Tea,94,tea,7,beverages
4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,frozen meals,1,frozen
5,Green Chile Anytime Sauce,5,marinades meat preparation,13,pantry
6,Dry Nose Oil,11,cold flu allergy,11,personal care
7,Pure Coconut Water With Orange,98,juice nectars,7,beverages
8,Cut Russet Potatoes Steam N' Mash,116,frozen produce,1,frozen
9,Light Strawberry Blueberry Yogurt,120,yogurt,16,dairy eggs
10,Sparkling Orange Juice & Prickly Pear Beverage,115,water seltzer sparkling water,7,beverages


In [0]:
df_pad = spark.sql("select * from pad_table")
df_pad.show(10)

+----------+--------------------+--------+--------------------+-------------+-------------+
|product_id|        product_name|aisle_id|               aisle|department_id|   department|
+----------+--------------------+--------+--------------------+-------------+-------------+
|         1|Chocolate Sandwic...|      61|       cookies cakes|           19|       snacks|
|         2|    All-Seasons Salt|     104|   spices seasonings|           13|       pantry|
|         3|Robust Golden Uns...|      94|                 tea|            7|    beverages|
|         4|Smart Ones Classi...|      38|        frozen meals|            1|       frozen|
|         5|Green Chile Anyti...|       5|marinades meat pr...|           13|       pantry|
|         6|        Dry Nose Oil|      11|    cold flu allergy|           11|personal care|
|         7|Pure Coconut Wate...|      98|       juice nectars|            7|    beverages|
|         8|Cut Russet Potato...|     116|      frozen produce|            1|   

In [0]:
df_pad1 = df_pad.withColumnRenamed("aisle", "aisle_name")
df_pad = df_pad1
df_pad.show(10)

+----------+--------------------+--------+--------------------+-------------+-------------+
|product_id|        product_name|aisle_id|          aisle_name|department_id|   department|
+----------+--------------------+--------+--------------------+-------------+-------------+
|         1|Chocolate Sandwic...|      61|       cookies cakes|           19|       snacks|
|         2|    All-Seasons Salt|     104|   spices seasonings|           13|       pantry|
|         3|Robust Golden Uns...|      94|                 tea|            7|    beverages|
|         4|Smart Ones Classi...|      38|        frozen meals|            1|       frozen|
|         5|Green Chile Anyti...|       5|marinades meat pr...|           13|       pantry|
|         6|        Dry Nose Oil|      11|    cold flu allergy|           11|personal care|
|         7|Pure Coconut Wate...|      98|       juice nectars|            7|    beverages|
|         8|Cut Russet Potato...|     116|      frozen produce|            1|   

In [0]:
df_pad2 = df_pad.withColumnRenamed("department", "department_name")
df_pad = df_pad2
df_pad.show(10)

+----------+--------------------+--------+--------------------+-------------+---------------+
|product_id|        product_name|aisle_id|          aisle_name|department_id|department_name|
+----------+--------------------+--------+--------------------+-------------+---------------+
|         1|Chocolate Sandwic...|      61|       cookies cakes|           19|         snacks|
|         2|    All-Seasons Salt|     104|   spices seasonings|           13|         pantry|
|         3|Robust Golden Uns...|      94|                 tea|            7|      beverages|
|         4|Smart Ones Classi...|      38|        frozen meals|            1|         frozen|
|         5|Green Chile Anyti...|       5|marinades meat pr...|           13|         pantry|
|         6|        Dry Nose Oil|      11|    cold flu allergy|           11|  personal care|
|         7|Pure Coconut Wate...|      98|       juice nectars|            7|      beverages|
|         8|Cut Russet Potato...|     116|      frozen produ