In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of whoever runs the notebook; used to build a per-user
# warehouse path instead of hardcoding a single account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    # Previously an f-string with no placeholder that pinned the path to one
    # fixed account, leaving `username` unused.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Read every file under orders_wh as CSV: the first row supplies the column
# names and Spark infers the column types from the data.
orders_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/public/trendytech/orders_wh/*")
)

In [3]:
# Preview orders_df as a text table; in this default form long values
# (e.g. order_date) are printed truncated.
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [4]:
# Shortcut form of the CSV read: format-specific reader method with the
# options passed as keyword arguments. The DataFrame is not assigned; it is
# the cell's last expression, so the notebook renders it.
spark.read.csv(
    "/public/trendytech/orders_wh/*",
    header="true",
    inferSchema="true",
)

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


In [5]:
# Read the orders dataset from its JSON copy. The DataFrame is not assigned;
# it is the cell's last expression, so the notebook renders it.
spark.read.json("/public/trendytech/datasets/orders.json")

customer_id,order_date,order_id,order_status
11599,2013-07-25 00:00:...,1,CLOSED
256,2013-07-25 00:00:...,2,PENDING_PAYMENT
12111,2013-07-25 00:00:...,3,COMPLETE
8827,2013-07-25 00:00:...,4,CLOSED
11318,2013-07-25 00:00:...,5,COMPLETE
7130,2013-07-25 00:00:...,6,COMPLETE
4530,2013-07-25 00:00:...,7,COMPLETE
2911,2013-07-25 00:00:...,8,PROCESSING
5657,2013-07-25 00:00:...,9,PENDING_PAYMENT
5648,2013-07-25 00:00:...,10,PENDING_PAYMENT


In [6]:
# Print each column's name, data type and nullability for orders_df.
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [7]:
# Load the orders dataset from its Parquet copy.
# NOTE: this rebinds orders_df, which an earlier cell bound to the CSV read;
# every later use of orders_df refers to the Parquet-backed DataFrame.
orders_df = spark.read.parquet("/public/trendytech/datasets/ordersparquet")

In [8]:
# Print only the first 5 rows of orders_df.
orders_df.show(5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



In [10]:
# Keep only the orders whose customer_id is 11599 (SQL-style string predicate).
filtered_df = orders_df.where("customer_id = 11599")

In [11]:
# truncate = False prints full column values, so the complete order_date
# timestamp is visible instead of being cut off.
filtered_df.show(truncate = False)

+-----------+---------------------+--------+------------+
|customer_id|order_date           |order_id|order_status|
+-----------+---------------------+--------+------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED      |
|11599      |2013-10-03 00:00:00.0|11397   |COMPLETE    |
|11599      |2013-12-20 00:00:00.0|23908   |COMPLETE    |
|11599      |2014-06-27 00:00:00.0|53545   |PENDING     |
|11599      |2013-10-17 00:00:00.0|59911   |PROCESSING  |
+-----------+---------------------+--------+------------+



In [12]:
# Same rows again with the default display, where long values are truncated.
filtered_df.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|      11599|2013-10-03 00:00:...|   11397|    COMPLETE|
|      11599|2013-12-20 00:00:...|   23908|    COMPLETE|
|      11599|2014-06-27 00:00:...|   53545|     PENDING|
|      11599|2013-10-17 00:00:...|   59911|  PROCESSING|
+-----------+--------------------+--------+------------+



In [14]:
# Rebuild filtered_df: the orders whose customer_id is 11599.
filtered_df = orders_df.where("customer_id = 11599")

In [15]:
# Print the filtered orders (default display, long values truncated).
filtered_df.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|      11599|2013-10-03 00:00:...|   11397|    COMPLETE|
|      11599|2013-12-20 00:00:...|   23908|    COMPLETE|
|      11599|2014-06-27 00:00:...|   53545|     PENDING|
|      11599|2013-10-17 00:00:...|   59911|  PROCESSING|
+-----------+--------------------+--------+------------+



In [19]:
# Register orders_df as a temporary view named "orders" so it can be queried
# by name through spark.sql.
orders_df.createOrReplaceTempView("orders")

In [18]:
# Select the CLOSED orders with SQL against "orders".
# NOTE: this rebinds filtered_df, which earlier held the customer_id filter.
filtered_df = spark.sql("select * from orders where order_status = 'CLOSED'")

In [19]:
# Print the result of the SQL query (default display, long values truncated).
filtered_df.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|       8827|2013-07-25 00:00:...|       4|      CLOSED|
|       1837|2013-07-25 00:00:...|      12|      CLOSED|
|       1205|2013-07-25 00:00:...|      18|      CLOSED|
|      11441|2013-07-25 00:00:...|      24|      CLOSED|
|       9503|2013-07-25 00:00:...|      25|      CLOSED|
|       5863|2013-07-25 00:00:...|      37|      CLOSED|
|      12271|2013-07-25 00:00:...|      51|      CLOSED|
|       7073|2013-07-25 00:00:...|      57|      CLOSED|
|       4791|2013-07-25 00:00:...|      61|      CLOSED|
|       9111|2013-07-25 00:00:...|      62|      CLOSED|
|       3065|2013-07-25 00:00:...|      87|      CLOSED|
|       9131|2013-07-25 00:00:...|      90|      CLOSED|
|       5116|2013-07-25 00:00:...|     101|      CLOSED|
|       8763|2013-07-26 00:00:.

In [20]:
# Load the table/view named "orders" back into a DataFrame.
# NOTE(review): named ordersdf (no underscore), distinct from orders_df.
ordersdf = spark.read.table("orders")

In [21]:
# Preview the DataFrame obtained from the "orders" table/view.
ordersdf.show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      

In [4]:
# List the databases visible to this session; show() prints only the first
# 20 rows of the result.
spark.sql("show databases").show()

+--------------------+
|           namespace|
+--------------------+
|00000assign7_itv0...|
| 0000_cache_spark111|
|   0001_av_ivy_tesco|
|          001_retail|
|        003402_hive1|
|    005198_ivy_tesco|
|    005212_ivy_tesco|
| 005222_ivy_practice|
| 005260_ivy_database|
|005876_week5_assi...|
|       005933_retail|
|006018_trendytech...|
|        006539_week5|
|      006539_week5_1|
|     006586_database|
|     006608_database|
|006866_week5_assi...|
|        007384_w5_db|
| 008057_bucketing_db|
|        008211_week5|
+--------------------+
only showing top 20 rows



In [5]:
# Look for databases whose name starts with the user id.
# The SQL `%` wildcard is only honoured by LIKE; with `=` the pattern was
# compared literally (including the `%`), so it could never match a name.
spark.sql("show databases").filter("namespace like 'itv009538%'")

namespace


In [8]:
# List tables as (database, tableName, isTemporary) rows; show() prints only
# the first rows of the result.
spark.sql("show tables").show()

+--------+-----------------+-----------+
|database|        tableName|isTemporary|
+--------+-----------------+-----------+
| default|            1htab|      false|
| default|   41group_movies|      false|
| default|    4group_movies|      false|
| default|             4tab|      false|
| default|    6_flags_simon|      false|
| default|                a|      false|
| default|               aa|      false|
| default|             acid|      false|
| default|            acid1|      false|
| default|     acid_example|      false|
| default|    acid_example1|      false|
| default|    acid_example2|      false|
| default|           adata1|      false|
| default|        adata_ell|      false|
| default|         adata_vr|      false|
| default|    ad_earthquake|      false|
| default|ad_earthquake_par|      false|
| default|           adelta|      false|
| default|       adeltapart|      false|
| default|   adeltapartbuck|      false|
+--------+-----------------+-----------+
only showing top

In [11]:
# Databases whose name contains "retail" anywhere (LIKE with % on both sides).
# No .show() here: the DataFrame is the cell's last expression and is left to
# the notebook to render.
spark.sql("show databases").where("namespace like '%retail%'")

namespace
001_retail
005933_retail
008368_retaildb
008368_retaildbnew
008858_retaildb
008858_retaildb1
009490_retail
00itv8696_retail
07172021_retail
1540retail_db


In [13]:
# Databases whose name contains "itv009538" anywhere (LIKE with % on both sides).
spark.sql("show databases").filter("namespace like '%itv009538%'")

namespace


In [15]:
# Same table listing, returned as a DataFrame and rendered by the notebook
# rather than printed with .show().
spark.sql("show tables")

database,tableName,isTemporary
default,1htab,False
default,41group_movies,False
default,4group_movies,False
default,4tab,False
default,6_flags_simon,False
default,a,False
default,aa,False
default,acid,False
default,acid1,False
default,acid_example,False


In [24]:
# Print the column list and extended metadata of default.acid_example;
# truncate = False keeps long values from being cut off.
spark.sql("describe extended default.acid_example").show(truncate = False)

+----------------------------+-------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                    |comment|
+----------------------------+-------------------------------------------------------------------------------------------------------------+-------+
|id                          |int                                                                                                          |null   |
|name                        |string                                                                                                       |null   |
|check1                      |string                                                                                                       |null   |
|country                     |string                                                                      