In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create (or reuse) the session for this notebook: standalone master, a single
# one-core executor, 512M for driver and executor, and automatic broadcast
# joins switched off (threshold -1) so join strategies can be chosen explicitly.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step5-Parquet")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config('spark.driver.memory', '512M')
    .config('spark.executor.memory', '512M')
    .config('spark.executor.instances', '1')
    .config('spark.executor.cores', '1')
    .config("spark.cores.max", "1")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/06 02:39:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
! ls -lh /spark-data

total 4.2M
-rwxrwxrwx 1 root root   23K Aug 20  2021 cars.csv
-rwxrwxrwx 1 root root   12K Aug 19  2021 Case.csv
drwxrwxrwx 1 root root  4.0K Oct  5 18:07 datasets
-rwxrwxrwx 1 root root  2.2K Aug 19  2021 deniro.csv
drwxrwxrwx 1 root root  4.0K Oct  5 18:07 graph
drwxrwxrwx 1 root root  4.0K Oct  5 18:07 logs
-rwxrwxrwx 1 root root  1.6M Sep 12  2021 MTA-Bus-Time_.2014-08-01.txt
-rwxrwxrwx 1 root root  227K Aug 19  2021 news.txt
-rwxrwxrwx 1 root root  1.3M Sep 11  2021 products.parquet
-rwxrwxrwx 1 root root   19K Aug 19  2021 Region.csv
-rwxrwxrwx 1 root root 1023K Sep 11  2021 sales.parquet
-rwxrwxrwx 1 root root  4.7K Sep 11  2021 sellers.parquet
-rwxrwxrwx 1 root root   90K Aug 19  2021 TimeProvince.csv
drwxrwxrwx 1 root root  4.0K Oct  5 18:07 trades
-rwxrwxrwx 1 root root  1.2K Oct  4  2021 user.json


In [4]:
# Read the three parquet datasets in one pass; the order of the paths matches
# the order of the names on the left-hand side.
products_table, sales_table, sellers_table = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "/spark-data/products.parquet",
        "/spark-data/sales.parquet",
        "/spark-data/sellers.parquet",
    )
)

                                                                                

In [6]:
products_table.show(10)

[Stage 4:>                                                          (0 + 1) / 1]

+----------+------------+-----+--------------------+
|product_id|product_name|price|              labels|
+----------+------------+-----+--------------------+
|         1|    p_000001|   30|         speak|model|
|         2|    p_000002|   37|piece|peace|per|p...|
|         3|    p_000003|  128|             on|fast|
|         4|    p_000004|  145|                hard|
|         5|    p_000005|   44|around|game|stand...|
|         6|    p_000006|   53|                NULL|
|         7|    p_000007|  104|  north|nature|great|
|         8|    p_000008|   24|budget|clear|audi...|
|         9|    p_000009|   14|             manager|
|        10|    p_000010|   72|                NULL|
+----------+------------+-----+--------------------+
only showing top 10 rows



                                                                                

In [8]:
sellers_table.show(10)

+---------+------------------+------------+
|seller_id|       seller_name|daily_target|
+---------+------------------+------------+
|        0|      Chad Osborne|      250000|
|        1|      Donald Lewis|      900423|
|        2|      Jason Brewer|      540176|
|        3|       Joshua Hall|     1738987|
|        4|     Anthony Clark|     1801017|
|        5|  Christopher Cole|      205667|
|        6|     Crystal Kelly|      128394|
|        7|Christopher Turner|      300273|
|        8|         Jill Vega|      627388|
|        9|         Susan Ray|     1947028|
+---------+------------------+------------+
only showing top 10 rows



In [9]:
sales_table.show(10)

                                                                                

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|             comment|
+--------+----------+---------+----------+---------------+--------------------+
|       1|       891|      543|2023-05-10|             82|Ball learn east b...|
|       2|       487|      827|2023-09-07|             54|Send animal arm m...|
|       3|       729|       67|2023-08-20|             51|Five skill nothin...|
|       4|       833|      484|2023-03-12|              4|Minute itself lif...|
|       5|       325|      371|2023-10-12|             13|Certainly draw ha...|
|       6|       127|       48|2023-06-15|             54|White itself busi...|
|       7|       937|      303|2023-03-15|             78|Fund scene benefi...|
|       8|       913|      424|2023-01-26|             30|Party forget six ...|
|       9|       517|      837|2023-02-04|             36|Yourself series m...|
|      10|       830|      534|2023-05-3

In [10]:
products_table.describe().show()

[Stage 7:>                                                          (0 + 1) / 1]

+-------+-----------------+--------------+-----------------+
|summary|       product_id|  product_name|            price|
+-------+-----------------+--------------+-----------------+
|  count|           100000|        100000|           100000|
|   mean|          50000.5|          NULL|         75.43343|
| stddev|28867.65779668774|          NULL|43.41501971754077|
|    min|                1|product_000001|                1|
|    max|           100000|product_100000|              150|
+-------+-----------------+--------------+-----------------+



                                                                                

In [10]:
sales_table.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- seller_id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- num_pieces_sold: long (nullable = true)
 |-- comment: string (nullable = true)



In [10]:
#   Count the rows of each table first, then report them
order_total = sales_table.count()
seller_total = sellers_table.count()
product_total = products_table.count()

print("Number of Orders: {}".format(order_total))
print("Number of sellers: {}".format(seller_total))
print("Number of products: {}".format(product_total))

Number of Orders: 2000
Number of sellers: 101
Number of products: 100000


In [12]:
# Mark the three tables for caching so that later queries read them from the
# cache instead of re-scanning the parquet files.
# cache() returns the DataFrame it was called on, so the last call is also what
# the notebook displays as this cell's result.
products_table.cache()
sales_table.cache()
sellers_table.cache()

23/10/06 08:16:57 WARN CacheManager: Asked to cache already cached data.
23/10/06 08:16:57 WARN CacheManager: Asked to cache already cached data.
23/10/06 08:16:57 WARN CacheManager: Asked to cache already cached data.


DataFrame[seller_id: bigint, seller_name: string, daily_target: bigint]

In [13]:
products_table.select(products_table.product_id).show(5)

[Stage 8:>                                                          (0 + 1) / 1]

+----------+
|product_id|
+----------+
|         1|
|         2|
|         3|
|         4|
|         5|
+----------+
only showing top 5 rows



                                                                                

In [13]:
products_table.select(col('product_id')).show()

+----------+
|product_id|
+----------+
|         1|
|         2|
|         3|
|         4|
|         5|
|         6|
|         7|
|         8|
|         9|
|        10|
|        11|
|        12|
|        13|
|        14|
|        15|
|        16|
|        17|
|        18|
|        19|
|        20|
+----------+
only showing top 20 rows



In [14]:
#   How many distinct products appear in at least one sale
print("Number of products sold at least once")
distinct_sold = sales_table.agg(countDistinct("product_id"))
distinct_sold.show()


Number of products sold at least once


[Stage 13:>                                                         (0 + 1) / 1]

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                      1000|
+--------------------------+



                                                                                

In [15]:
#   Ten products that appear in the largest number of orders
print("Product present in more orders")
orders_per_product = sales_table.groupBy("product_id").agg(count("*").alias("cnt"))
orders_per_product.orderBy(desc("cnt")).limit(10).show()

Product present in more orders


[Stage 17:>                                                         (0 + 1) / 2]

+----------+---+
|product_id|cnt|
+----------+---+
|       138|134|
|       997|133|
|       107|126|
|         2|126|
|       817|125|
|       548|125|
|       668|125|
|       325|124|
|       872|124|
|       971|124|
+----------+---+



                                                                                

In [16]:
#   Same "top 10 products by order count" query as the previous cell, but
#   printing its physical plan instead of its rows
print("Product present in more orders")
(
    sales_table
    .groupBy("product_id")
    .agg(count("*").alias("cnt"))
    .orderBy(desc("cnt"))
    .limit(10)
    .explain()
)

Product present in more orders
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=10, orderBy=[cnt#617L DESC NULLS LAST], output=[product_id#9L,cnt#617L])
   +- HashAggregate(keys=[product_id#9L], functions=[count(1)])
      +- Exchange hashpartitioning(product_id#9L, 200), ENSURE_REQUIREMENTS, [plan_id=318]
         +- HashAggregate(keys=[product_id#9L], functions=[partial_count(1)])
            +- InMemoryTableScan [product_id#9L]
                  +- InMemoryRelation [order_id#8L, product_id#9L, seller_id#10L, date#11, num_pieces_sold#12L, comment#13], StorageLevel(disk, memory, deserialized, 1 replicas)
                        +- *(1) ColumnarToRow
                           +- FileScan parquet [order_id#8L,product_id#9L,seller_id#10L,date#11,num_pieces_sold#12L,comment#13] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/spark-data/sales.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: str

In [17]:
# Expose the DataFrames to Spark SQL under the table names used by the
# spark.sql(...) queries in the cells below.
products_table.createOrReplaceTempView("products")
sales_table.createOrReplaceTempView("sales")
sellers_table.createOrReplaceTempView("sellers")

In [18]:
spark.sql("select count(distinct product_id) as `product sold`  from sales").show()


+------------+
|product sold|
+------------+
|        1000|
+------------+



In [20]:
print("Product present in more orders")

# SQL version of the DataFrame query above: count sales rows per product_id and
# keep the 10 largest counts. "group by 1" / "order by 2" refer to the first
# and second items of the select list (product_id and cnt).
spark.sql("""select product_id, count(*) as cnt
            from sales
            group by 1
            order by 2 desc 
            limit 10""").show()

Product present in more orders
+----------+---+
|product_id|cnt|
+----------+---+
|       376| 67|
|       600| 71|
|       566| 75|
|       516| 76|
|       587| 76|
|       472| 76|
|       723| 76|
|       181| 77|
|       157| 77|
|       299| 77|
+----------+---+



In [22]:
print("Product present in more orders")

# Inner-join sales to products on product_id, count the sales rows per
# product_name, keep only names with more than one row, and sort by that count
# in descending order ("group by 1" / "order by 2" are select-list positions).
df10=spark.sql("""select product_name, count(*) as cnt
            from sales s inner join products p on p.product_id = s.product_id
            group by 1 
            having cnt >1
            order by 2 desc 
            """)
df10.show()

Product present in more orders


[Stage 39:>                                                         (0 + 1) / 1]

+------------+---+
|product_name|cnt|
+------------+---+
|    p_000138|134|
|    p_000997|133|
|    p_000002|126|
|    p_000107|126|
|    p_000548|125|
|    p_000668|125|
|    p_000817|125|
|    p_000325|124|
|    p_000971|124|
|    p_000872|124|
|    p_000850|123|
|    p_000089|123|
|    p_000593|123|
|    p_000202|123|
|    p_000222|123|
|    p_000652|123|
|    p_000391|122|
|    p_000679|122|
|    p_000499|122|
|    p_000875|122|
+------------+---+
only showing top 20 rows



                                                                                

In [23]:
df10.count()

                                                                                

1000

In [26]:
# Row count per product_name over a RIGHT OUTER join, so every product is kept
# even when it has no matching sale.
# NOTE(review): count(*) also counts the null-extended row of a product without
# sales, so such a product is reported as 1 rather than 0; count(s.order_id)
# would report 0 — confirm which is intended.
df20=spark.sql("""select product_name, count(*)
            from sales s right outer join products p on p.product_id = s.product_id
            group by 1 
            """)
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line to the output).
df20.show(10)

+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000007|     111|
|    p_000480|      79|
|    p_000914|     104|
|    p_000057|     101|
|    p_000493|      87|
|    p_000674|      79|
|    p_000981|      99|
|    p_000181|      77|
|    p_000435|      96|
|    p_000551|      79|
+------------+--------+
only showing top 10 rows

None


In [27]:
print("Product present in more orders")

# Same right-outer-join count as the previous cell, now sorted by the count in
# descending order.
# NOTE(review): count(*) reports 1 (not 0) for a product without any sale,
# because the null-extended row is counted — confirm this is intended.
df20=spark.sql("""select product_name, count(*)
            from sales s right outer join products p on p.product_id = s.product_id
            group by 1 
            order by 2 desc 
            """)
# Cache the result because it is evaluated twice below (count, then show).
df20.cache()
print(df20.count())
# show() prints the table and returns None; do not wrap it in print().
df20.show()

Product present in more orders
1000
+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000138|     134|
|    p_000997|     133|
|    p_000002|     126|
|    p_000107|     126|
|    p_000548|     125|
|    p_000668|     125|
|    p_000817|     125|
|    p_000325|     124|
|    p_000971|     124|
|    p_000872|     124|
|    p_000089|     123|
|    p_000850|     123|
|    p_000593|     123|
|    p_000202|     123|
|    p_000222|     123|
|    p_000652|     123|
|    p_000875|     122|
|    p_000679|     122|
|    p_000391|     122|
|    p_000499|     122|
+------------+--------+
only showing top 20 rows

None


In [23]:
# Number of result rows, followed by the first 20 of them.
print(df20.count())
# show() prints the table and returns None; wrapping it in print() only added
# a stray "None" line to the output.
df20.show()

100000
+--------------+--------+
|  product_name|count(1)|
+--------------+--------+
|product_000227|       1|
|product_000328|       1|
|product_000530|       1|
|product_000718|       1|
|product_000900|       1|
|product_001021|       1|
|product_001070|       1|
|product_001172|       1|
|product_001362|       1|
|product_001368|       1|
|product_001724|       1|
|product_001796|       1|
|product_002124|       1|
|product_002257|       1|
|product_002386|       1|
|product_002574|       1|
|product_002693|       1|
|product_003396|       1|
|product_003571|       1|
|product_003634|       1|
+--------------+--------+
only showing top 20 rows

None


In [28]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold","20m")

In [29]:

# Inner join of sales and products on product_id, counting sales rows per
# product_name, smallest counts first.
# NOTE(review): this follows the cell that sets autoBroadcastJoinThreshold to
# "20m"; presumably the point is to let Spark choose a broadcast join here —
# confirm with .explain().
df30=spark.sql("""select product_name, count(*)
            from sales s join products p on p.product_id = s.product_id
            group by 1 
            order by 2 asc 
            """)
df30.show()

+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000376|      67|
|    p_000600|      71|
|    p_000566|      75|
|    p_000516|      76|
|    p_000472|      76|
|    p_000587|      76|
|    p_000723|      76|
|    p_000181|      77|
|    p_000299|      77|
|    p_000676|      77|
|    p_000157|      77|
|    p_000998|      78|
|    p_000549|      78|
|    p_000720|      78|
|    p_000830|      78|
|    p_000633|      78|
|    p_000480|      79|
|    p_000674|      79|
|    p_000551|      79|
|    p_000902|      79|
+------------+--------+
only showing top 20 rows



In [30]:
# Set the automatic broadcast threshold back to -1 (the value used when the
# session was built), then run the same join/count query as df30 for comparison.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold","-1")
df40=spark.sql("""select product_name, count(*)
            from sales s join products p on p.product_id = s.product_id
            group by 1 
            order by 2 asc 
            """)
df40.show()

+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000376|      67|
|    p_000600|      71|
|    p_000566|      75|
|    p_000472|      76|
|    p_000516|      76|
|    p_000587|      76|
|    p_000723|      76|
|    p_000181|      77|
|    p_000299|      77|
|    p_000676|      77|
|    p_000157|      77|
|    p_000998|      78|
|    p_000549|      78|
|    p_000720|      78|
|    p_000830|      78|
|    p_000633|      78|
|    p_000480|      79|
|    p_000674|      79|
|    p_000551|      79|
|    p_000902|      79|
+------------+--------+
only showing top 20 rows



In Apache Spark, `BROADCASTJOIN`, `SHUFFLE_HASH`, and `SHUFFLE_MERGE` are join hints: each one asks the optimizer to use a particular strategy when joining DataFrames or datasets. The three strategies are described below.

### 1. **BROADCASTJOIN:**
When one of the DataFrames is small enough to fit in the memory of each worker node, you can use a broadcast join. In this type of join, the smaller DataFrame is broadcasted to all worker nodes, and the join operation is performed locally on each node.



### 2. **SHUFFLE_HASH:**
Shuffle hash join shuffles both DataFrames across the cluster by the join key, so that rows with the same key end up in the same partition. Within each partition, Spark builds an in-memory hash table from the smaller side and probes it with the rows of the other side. It is used when one side is too large to broadcast but still small enough, per partition, for its hash table to fit in memory; unlike a sort-merge join, no sorting is needed.


### 3. **SHUFFLE_MERGE:**
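Shuffle merge join (usually called sort-merge join) shuffles both DataFrames by the join key, sorts each partition on that key, and then merges the two sorted sides. It is Spark's default strategy for joining two large DataFrames on an equality condition, because sorted data can spill to disk instead of having to fit in memory. The strategy itself predates Spark 3.0; what Spark 3.0 added is the `SHUFFLE_MERGE` hint (together with `SHUFFLE_HASH`), whereas earlier versions supported only the broadcast hint. When both DataFrames are already partitioned and sorted on the join key (for example, bucketed tables), the shuffle and sort can be skipped and the partitions are merged directly.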


In the cells that follow, `product_id` is the column on which the DataFrames are joined. The appropriate join strategy depends on the size of the DataFrames, the distribution of join keys, and the available memory in the cluster. Choosing the right join strategy is crucial for optimizing the performance of Spark applications.

The choice of join strategy in Apache Spark depends on various factors such as the size of the DataFrames, the distribution of the join keys, the available memory in the cluster, and the overall cluster configuration. Here's a guideline on when to use each join strategy:

### 1. **BROADCASTJOIN:**

- **Use Case:** Broadcast join is ideal when one of the DataFrames is small enough to fit in the memory of each worker node.
  
- **When to Use:**
  - When one DataFrame is significantly smaller than the other.
  - When the smaller DataFrame can fit in the memory of each worker node.
  - When network bandwidth is a concern, and minimizing data shuffling is important.
  
- **Example:** Joining a large DataFrame with a small lookup table.

### 2. **SHUFFLE_HASH:**

- **Use Case:** Shuffle hash join is suitable when neither DataFrame is small enough to broadcast, but one side is clearly smaller than the other, so that each shuffled partition of it fits in memory as a hash table.
  
- **When to Use:**
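  - When one side is much smaller than the other but still too large to broadcast.
  - When the join keys are spread evenly enough that no single partition's hash table outgrows executor memory (heavy skew can cause out-of-memory errors).
  - When you want to avoid the sorting cost of a sort-merge join.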
  
- **Example:** Joining a large DataFrame with a medium-sized DataFrame that exceeds the broadcast threshold.

### 3. **SHUFFLE_MERGE:**

- **Use Case:** Shuffle merge (sort-merge) join is the general-purpose strategy for two large DataFrames: both sides are shuffled and sorted on the join key and then merged, and the shuffle or sort is skipped for a side that is already partitioned and sorted on that key.
  
- **When to Use:**
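  - When both DataFrames are large and neither fits in memory.
  - When the join keys are sortable and reasonably evenly distributed.
  - When both DataFrames are already partitioned (for example, bucketed) on the join key, which lets Spark skip the shuffle.
  - The `SHUFFLE_MERGE` hint requires Spark 3.0 or higher; the sort-merge strategy itself is available in earlier versions.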
  
- **Example:** Joining two large DataFrames that are already hash-partitioned on the join key.

In summary, consider the size of your DataFrames, the distribution of your join keys, and the available memory in your cluster when choosing a join strategy. Always profile your data and perform benchmarking to determine the most efficient join strategy for your specific use case. Additionally, staying up-to-date with the latest optimizations and features in newer versions of Apache Spark can also help you make better decisions regarding join strategies.

In [34]:
# Hint syntax: SELECT /*+ BROADCASTJOIN(<table or alias to broadcast>) */ ...
# Here the hint names alias `s` (the sales view) as the side to broadcast.

df50=spark.sql("""select /*+  BROADCASTJOIN(s) */   product_name, count(*)  
            from products p join sales s on p.product_id = s.product_id
            group by 1 
            order by 2 asc 
            """)
# df50 is not cached, so count() and show() each run the query.
print(df50.count())
df50.show()

1000
+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000376|      67|
|    p_000600|      71|
|    p_000566|      75|
|    p_000472|      76|
|    p_000516|      76|
|    p_000587|      76|
|    p_000723|      76|
|    p_000181|      77|
|    p_000299|      77|
|    p_000676|      77|
|    p_000157|      77|
|    p_000998|      78|
|    p_000549|      78|
|    p_000720|      78|
|    p_000830|      78|
|    p_000633|      78|
|    p_000480|      79|
|    p_000674|      79|
|    p_000551|      79|
|    p_000902|      79|
+------------+--------+
only showing top 20 rows



In [32]:
# Same join/count query, this time with a SHUFFLE_HASH hint on alias `s`
# (the sales view) to request a shuffle hash join.
df50=spark.sql("""select /*+  SHUFFLE_HASH(s) */   product_name, count(*)  
            from products p join sales s on p.product_id = s.product_id
            group by 1 
            order by 2 asc 
            """)
# df50 is not cached, so count() and show() each run the query.
print(df50.count())
df50.show()

                                                                                

1000
+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000376|      67|
|    p_000600|      71|
|    p_000566|      75|
|    p_000516|      76|
|    p_000472|      76|
|    p_000587|      76|
|    p_000723|      76|
|    p_000181|      77|
|    p_000299|      77|
|    p_000676|      77|
|    p_000157|      77|
|    p_000998|      78|
|    p_000549|      78|
|    p_000720|      78|
|    p_000830|      78|
|    p_000633|      78|
|    p_000480|      79|
|    p_000674|      79|
|    p_000551|      79|
|    p_000902|      79|
+------------+--------+
only showing top 20 rows



In [33]:
# Same join/count query, this time with a SHUFFLE_MERGE hint on alias `s`
# (the sales view) to request a shuffle sort-merge join.
df50=spark.sql("""select /*+  SHUFFLE_MERGE(s) */   product_name, count(*)  
            from products p join sales s on p.product_id = s.product_id
            group by 1 
            order by 2 asc 
            """)
# df50 is not cached, so count() and show() each run the query.
print(df50.count())
df50.show()

1000
+------------+--------+
|product_name|count(1)|
+------------+--------+
|    p_000376|      67|
|    p_000600|      71|
|    p_000566|      75|
|    p_000472|      76|
|    p_000516|      76|
|    p_000587|      76|
|    p_000723|      76|
|    p_000181|      77|
|    p_000299|      77|
|    p_000676|      77|
|    p_000157|      77|
|    p_000998|      78|
|    p_000549|      78|
|    p_000720|      78|
|    p_000830|      78|
|    p_000633|      78|
|    p_000480|      79|
|    p_000674|      79|
|    p_000551|      79|
|    p_000902|      79|
+------------+--------+
only showing top 20 rows



In [29]:
####  Dataframe Joins
# For every seller, the average over their sales of
# num_pieces_sold / daily_target. The sellers table is explicitly broadcast
# for the inner join on seller_id.
# show() prints the table and returns None, so it is not wrapped in print()
# (that only added a stray "None" line to the output).
(
    sales_table
    .join(
        broadcast(sellers_table),
        sales_table["seller_id"] == sellers_table["seller_id"],
        "inner",
    )
    .withColumn(
        "ratio", sales_table["num_pieces_sold"] / sellers_table["daily_target"]
    )
    .groupBy(sales_table["seller_id"])
    .agg(avg("ratio"))
    .show()
)

+---------+--------------------+
|seller_id|          avg(ratio)|
+---------+--------------------+
|       29|4.644988939825174...|
|       26|2.319554387605397...|
|       65|2.931429678805068...|
|       54|2.389661293535090...|
|       19|2.858258703574406...|
|       22|3.708259605141521E-5|
|        7|1.555967948441298...|
|       77|2.981032682717429...|
|       34|3.477284846496053...|
|       50|2.870018587854760...|
|       94|4.674046551971184E-5|
|       57|3.405221429066354E-5|
|       32|8.634325709528986E-5|
|       43|2.882534893949249...|
|       84|4.044736739416897...|
|       31|3.006195752651825...|
|       98|6.342958088068233E-5|
|       39|5.703705240055282...|
|       25|3.625800692872154...|
|       95|2.924769017292419...|
+---------+--------------------+
only showing top 20 rows

None


In [30]:
from pyspark.sql import Row, Window
from pyspark.sql.types import IntegerType
import hashlib

#   Define the UDF function
def algo(order_id, bill_text):
    """Hash a bill's text with a scheme chosen by the parity of ``order_id``.

    * Even ``order_id``: MD5 is applied once per capital ``"A"`` found in
      ``bill_text``; every round hashes the UTF-8 bytes of the previous
      round's hex digest. When the text contains no ``"A"`` it is returned
      unchanged.
    * Odd ``order_id``: a single SHA-256 hex digest of the UTF-8 text.

    Args:
        order_id: Order identifier; any value accepted by ``int()``.
        bill_text: Text to hash.

    Returns:
        The resulting string, or ``None`` when either argument is ``None``
        (a NULL column value reaches a Python UDF as ``None``; previously
        this raised instead of yielding NULL).
    """
    if order_id is None or bill_text is None:
        return None

    encoded = bill_text.encode("utf-8")

    # Odd order ids: one SHA-256 pass.
    if int(order_id) % 2 != 0:
        return hashlib.sha256(encoded).hexdigest()

    # Even order ids: one MD5 round per 'A' in the original text.
    for _ in range(bill_text.count("A")):
        encoded = hashlib.md5(encoded).hexdigest().encode("utf-8")
    return encoded.decode("utf-8")

#   Register the UDF function under the SQL name "algo"; the object returned by
#   register() is used below to call it on DataFrame columns.
algo_udf = spark.udf.register("algo", algo)

# Add a "hashed_bill" column computed from order_id and bill_raw_text.
# NOTE(review): the sales schema printed earlier in this notebook has a
# `comment` column and no `bill_raw_text`; this cell's saved output appears to
# come from a different dataset — confirm the column name before re-running.
# NOTE(review): this rebinds `sales_table`, so cells run afterwards see the
# extra column.
sales_table = sales_table.withColumn("hashed_bill", algo_udf(col("order_id"), col("bill_raw_text")))

sales_table.show()


[Stage 158:>                                                        (0 + 1) / 1]

+--------+----------+---------+----------+---------------+--------------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|         hashed_bill|
+--------+----------+---------+----------+---------------+--------------------+--------------------+
|       1|     57164|       69|2021-10-04|             20|tvuerrnvlBwhejtyz...|76c669d6d3e9c8ddd...|
|       2|     17466|        4|2021-10-03|             80|ezoexpsddxnahgkxr...|ezoexpsddxnahgkxr...|
|       3|     25747|       81|2021-10-08|             24|rrzavxtphkxhyiicu...|11d2ead66af66ec03...|
|       4|     53745|       59|2021-10-02|             92|jklcqmgjbfqxtsugt...|jklcqmgjbfqxtsugt...|
|       5|     31885|        5|2021-10-06|             10|azdujdzlhddjebywm...|4ac76798d905e2ba4...|
|       6|     36014|       52|2021-10-08|             71|jpjwwfckwolwphpkq...|63f8773c95665bb5e...|
|       7|      1709|       59|2021-10-05|              3|gcjrymmjlcmkhoyxq...|96a5d3cfb3d8

23/10/04 22:05:21 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
23/10/04 22:05:30 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [29]:
spark.stop()