In [1]:
# https://towardsdatascience.com/pyspark-and-sparksql-basics-6cb4bf967e53

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName('Prac-2').getOrCreate()

In [4]:
json_file_path = 'file:///home/boom/Documents/programming/pyspark/data_files/nyt2.json'

In [5]:
df = spark.read.format('json').load(json_file_path)

In [6]:
df.show(5,truncate=False)

+--------------------------+-----------------------------------------------------------------------------------------+-----------------+-----------------+----------------------------------------------------------------------------------------------------------------------------+--------+-----------------+-------------+----+--------------+------------------------+-------------+
|_id                       |amazon_product_url                                                                       |author           |bestsellers_date |description                                                                                                                 |price   |published_date   |publisher    |rank|rank_last_week|title                   |weeks_on_list|
+--------------------------+-----------------------------------------------------------------------------------------+-----------------+-----------------+--------------------------------------------------------------------------------------

In [7]:
df.count()

10195

In [8]:
uniq_df = df.dropDuplicates()
uniq_df.count()

10195

In [9]:
df.select('author').show(10)

+--------------------+
|              author|
+--------------------+
|       Dean R Koontz|
|     Stephenie Meyer|
|        Emily Giffin|
|   Patricia Cornwell|
|     Chuck Palahniuk|
|James Patterson a...|
|       John Sandford|
|       Jimmy Buffett|
|    Elizabeth George|
|      David Baldacci|
+--------------------+
only showing top 10 rows



In [10]:
df.columns

['_id',
 'amazon_product_url',
 'author',
 'bestsellers_date',
 'description',
 'price',
 'published_date',
 'publisher',
 'rank',
 'rank_last_week',
 'title',
 'weeks_on_list']

In [11]:
df.select('author', 'title', 'publisher', 'price', 'rank').show(10, truncate=False)

+----------------------------------------+------------------------+-------------+--------+----+
|author                                  |title                   |publisher    |price   |rank|
+----------------------------------------+------------------------+-------------+--------+----+
|Dean R Koontz                           |ODD HOURS               |Bantam       |[, 27]  |[1] |
|Stephenie Meyer                         |THE HOST                |Little, Brown|[25.99,]|[2] |
|Emily Giffin                            |LOVE THE ONE YOU'RE WITH|St. Martin's |[24.95,]|[3] |
|Patricia Cornwell                       |THE FRONT               |Putnam       |[22.95,]|[4] |
|Chuck Palahniuk                         |SNUFF                   |Doubleday    |[24.95,]|[5] |
|James Patterson and Gabrielle Charbonnet|SUNDAYS AT TIFFANY’S    |Little, Brown|[24.99,]|[6] |
|John Sandford                           |PHANTOM PREY            |Putnam       |[26.95,]|[7] |
|Jimmy Buffett                          

In [12]:
from pyspark.sql.functions import *
df.select('title',when(df.title != 'ODD HOURS', 1).otherwise(0)).show()

+--------------------+-----------------------------------------------------+
|               title|CASE WHEN (NOT (title = ODD HOURS)) THEN 1 ELSE 0 END|
+--------------------+-----------------------------------------------------+
|           ODD HOURS|                                                    0|
|            THE HOST|                                                    1|
|LOVE THE ONE YOU'...|                                                    1|
|           THE FRONT|                                                    1|
|               SNUFF|                                                    1|
|SUNDAYS AT TIFFANY’S|                                                    1|
|        PHANTOM PREY|                                                    1|
|          SWINE NOT?|                                                    1|
|     CARELESS IN RED|                                                    1|
|     THE WHOLE TRUTH|                                                    1|

In [13]:
df[df.author.isin("John Sandford", "David Baldacci")].show()

+--------------------+--------------------+--------------+-----------------+--------------------+--------+-----------------+-------------+----+--------------+---------------+-------------+
|                 _id|  amazon_product_url|        author| bestsellers_date|         description|   price|   published_date|    publisher|rank|rank_last_week|          title|weeks_on_list|
+--------------------+--------------------+--------------+-----------------+--------------------+--------+-----------------+-------------+----+--------------+---------------+-------------+
|[5b4aa4ead3089013...|http://www.amazon...| John Sandford|[[1211587200000]]|The Minneapolis d...|[26.95,]|[[1212883200000]]|       Putnam| [7]|           [4]|   PHANTOM PREY|          [3]|
|[5b4aa4ead3089013...|http://www.amazon...|David Baldacci|[[1211587200000]]|An intelligence a...|[26.99,]|[[1212883200000]]|Grand Central|[10]|           [7]|THE WHOLE TRUTH|          [5]|
|[5b4aa4ead3089013...|http://www.amazon...| John Sandfo

In [14]:
df[df.author.isin("John Sandford", "David Baldacci")].select('author', 'title', 'publisher', 'price', 'rank').show()

+--------------+---------------+-------------+--------+----+
|        author|          title|    publisher|   price|rank|
+--------------+---------------+-------------+--------+----+
| John Sandford|   PHANTOM PREY|       Putnam|[26.95,]| [7]|
|David Baldacci|THE WHOLE TRUTH|Grand Central|[26.99,]|[10]|
| John Sandford|   PHANTOM PREY|       Putnam|[26.95,]| [9]|
|David Baldacci|THE WHOLE TRUTH|Grand Central|[26.99,]|[13]|
| John Sandford|   PHANTOM PREY|       Putnam|[26.95,]|[12]|
| John Sandford|   PHANTOM PREY|       Putnam|   [, 0]|[20]|
| John Sandford| HEAT LIGHTNING|       Putnam|[26.95,]| [2]|
| John Sandford| HEAT LIGHTNING|       Putnam|[26.95,]| [4]|
| John Sandford| HEAT LIGHTNING|       Putnam|[26.95,]| [6]|
| John Sandford| HEAT LIGHTNING|       Putnam|[26.95,]| [7]|
| John Sandford| HEAT LIGHTNING|       Putnam|[26.95,]|[13]|
| John Sandford| HEAT LIGHTNING|       Putnam|   [, 0]|[20]|
|David Baldacci| DIVINE JUSTICE|Grand Central|  [, 27]| [1]|
|David Baldacci| DIVINE 

In [15]:
df.select('author', 'title', 'price', 'rank',df.author.isin("John Sandford", "David Baldacci")).show()

+--------------------+--------------------+--------+----+-------------------------------------------+
|              author|               title|   price|rank|(author IN (John Sandford, David Baldacci))|
+--------------------+--------------------+--------+----+-------------------------------------------+
|       Dean R Koontz|           ODD HOURS|  [, 27]| [1]|                                      false|
|     Stephenie Meyer|            THE HOST|[25.99,]| [2]|                                      false|
|        Emily Giffin|LOVE THE ONE YOU'...|[24.95,]| [3]|                                      false|
|   Patricia Cornwell|           THE FRONT|[22.95,]| [4]|                                      false|
|     Chuck Palahniuk|               SNUFF|[24.95,]| [5]|                                      false|
|James Patterson a...|SUNDAYS AT TIFFANY’S|[24.99,]| [6]|                                      false|
|       John Sandford|        PHANTOM PREY|[26.95,]| [7]|                         

In [16]:
df.select('author', 'title', 'publisher',df.title.like("THE %")).show(15,truncate=False)

+----------------------------------------+-----------------------------+------------------+----------------+
|author                                  |title                        |publisher         |title LIKE THE %|
+----------------------------------------+-----------------------------+------------------+----------------+
|Dean R Koontz                           |ODD HOURS                    |Bantam            |false           |
|Stephenie Meyer                         |THE HOST                     |Little, Brown     |true            |
|Emily Giffin                            |LOVE THE ONE YOU'RE WITH     |St. Martin's      |false           |
|Patricia Cornwell                       |THE FRONT                    |Putnam            |true            |
|Chuck Palahniuk                         |SNUFF                        |Doubleday         |false           |
|James Patterson and Gabrielle Charbonnet|SUNDAYS AT TIFFANY’S         |Little, Brown     |false           |
|John Sandford     

In [17]:
df[df.title.like("% THE %")].select('author', 'title').show(15,truncate=False)

+-------------+--------------------------------------------------+
|author       |title                                             |
+-------------+--------------------------------------------------+
|Emily Giffin |LOVE THE ONE YOU'RE WITH                          |
|Garth Stein  |THE ART OF RACING IN THE RAIN                     |
|James Rollins|INDIANA JONES AND THE KINGDOM OF THE CRYSTAL SKULL|
|Emily Giffin |LOVE THE ONE YOU'RE WITH                          |
|Garth Stein  |THE ART OF RACING IN THE RAIN                     |
|Emily Giffin |LOVE THE ONE YOU'RE WITH                          |
|Garth Stein  |THE ART OF RACING IN THE RAIN                     |
|Emily Giffin |LOVE THE ONE YOU'RE WITH                          |
|Garth Stein  |THE ART OF RACING IN THE RAIN                     |
|Emily Giffin |LOVE THE ONE YOU'RE WITH                          |
|Garth Stein  |THE ART OF RACING IN THE RAIN                     |
|Emily Giffin |LOVE THE ONE YOU'RE WITH                       

In [18]:
df[df.title.startswith("THE")].select('author','title','publisher').show(5)

+-----------------+--------------------+-------------+
|           author|               title|    publisher|
+-----------------+--------------------+-------------+
|  Stephenie Meyer|            THE HOST|Little, Brown|
|Patricia Cornwell|           THE FRONT|       Putnam|
|   David Baldacci|     THE WHOLE TRUTH|Grand Central|
|      Garth Stein|THE ART OF RACING...|       Harper|
|      Jeff Shaara|      THE STEEL WAVE|   Ballantine|
+-----------------+--------------------+-------------+
only showing top 5 rows



In [19]:
df[df.title.endswith("NT")].select('author','title','publisher').show(5)

+-----------------+------------------+---------+
|           author|             title|publisher|
+-----------------+------------------+---------+
|Patricia Cornwell|         THE FRONT|   Putnam|
|Patricia Cornwell|         THE FRONT|   Putnam|
|Patricia Cornwell|         THE FRONT|   Putnam|
|     Daniel Silva|THE SECRET SERVANT|   Putnam|
|     Daniel Silva|THE SECRET SERVANT|   Putnam|
+-----------------+------------------+---------+
only showing top 5 rows



In [20]:
df[df.title.endswith("NT")].select('author','title','publisher').dropDuplicates().show(5)

+-----------------+-------------------+--------------+
|           author|              title|     publisher|
+-----------------+-------------------+--------------+
|       Dale Brown|   EXECUTIVE INTENT|William Morrow|
|        P D James|THE PRIVATE PATIENT|         Knopf|
|   Kazuo Ishiguro|   THE BURIED GIANT|         Knopf|
|   Danielle Steel|      THE APARTMENT|     Delacorte|
|Patricia Cornwell|          THE FRONT|        Putnam|
+-----------------+-------------------+--------------+
only showing top 5 rows



In [21]:
df.select(df.author.substr(1,3).alias('title')).show(5)

+-----+
|title|
+-----+
|  Dea|
|  Ste|
|  Emi|
|  Pat|
|  Chu|
+-----+
only showing top 5 rows



In [22]:
# Adding cols
new_df = df[df.title.endswith("NT")].select('author','title','publisher').dropDuplicates()
updated_new_df = new_df.withColumn('new_col', lit('This is a new column'))
updated_new_df.show()

+--------------------+--------------------+-------------------+--------------------+
|              author|               title|          publisher|             new_col|
+--------------------+--------------------+-------------------+--------------------+
|          Dale Brown|    EXECUTIVE INTENT|     William Morrow|This is a new column|
|           P D James| THE PRIVATE PATIENT|              Knopf|This is a new column|
|      Kazuo Ishiguro|    THE BURIED GIANT|              Knopf|This is a new column|
|      Danielle Steel|       THE APARTMENT|          Delacorte|This is a new column|
|   Patricia Cornwell|           THE FRONT|             Putnam|This is a new column|
|James Patterson a...|    THE 9TH JUDGMENT|      Little, Brown|This is a new column|
|           J A Jance|        CRUEL INTENT|         Touchstone|This is a new column|
|      Kristin Hannah|          HOME FRONT|       St. Martin’s|This is a new column|
|       Alex Berenson|THE COUNTERFEIT A...|             Putnam|Th

In [23]:
# updating col
updated_new_df = updated_new_df.withColumnRenamed('new_col', 'added_col')
updated_new_df.show()

+--------------------+--------------------+-------------------+--------------------+
|              author|               title|          publisher|           added_col|
+--------------------+--------------------+-------------------+--------------------+
|          Dale Brown|    EXECUTIVE INTENT|     William Morrow|This is a new column|
|           P D James| THE PRIVATE PATIENT|              Knopf|This is a new column|
|      Kazuo Ishiguro|    THE BURIED GIANT|              Knopf|This is a new column|
|      Danielle Steel|       THE APARTMENT|          Delacorte|This is a new column|
|   Patricia Cornwell|           THE FRONT|             Putnam|This is a new column|
|James Patterson a...|    THE 9TH JUDGMENT|      Little, Brown|This is a new column|
|           J A Jance|        CRUEL INTENT|         Touchstone|This is a new column|
|      Kristin Hannah|          HOME FRONT|       St. Martin’s|This is a new column|
|       Alex Berenson|THE COUNTERFEIT A...|             Putnam|Th

In [24]:
# Dropping cols
df_remove = updated_new_df.drop('added_col')
df_remove.show()

+--------------------+--------------------+-------------------+
|              author|               title|          publisher|
+--------------------+--------------------+-------------------+
|          Dale Brown|    EXECUTIVE INTENT|     William Morrow|
|           P D James| THE PRIVATE PATIENT|              Knopf|
|      Kazuo Ishiguro|    THE BURIED GIANT|              Knopf|
|      Danielle Steel|       THE APARTMENT|          Delacorte|
|   Patricia Cornwell|           THE FRONT|             Putnam|
|James Patterson a...|    THE 9TH JUDGMENT|      Little, Brown|
|           J A Jance|        CRUEL INTENT|         Touchstone|
|      Kristin Hannah|          HOME FRONT|       St. Martin’s|
|       Alex Berenson|THE COUNTERFEIT A...|             Putnam|
|             C J Box|      BREAKING POINT|             Putnam|
|             Amy Tan|THE VALLEY OF AMA...| Ecco/HarperCollins|
|        Tim Johnston|             DESCENT|          Algonquin|
|         Scott Turow|            INNOCE

In [25]:
# Inspecting data

In [26]:
df.dtypes

[('_id', 'struct<$oid:string>'),
 ('amazon_product_url', 'string'),
 ('author', 'string'),
 ('bestsellers_date', 'struct<$date:struct<$numberLong:string>>'),
 ('description', 'string'),
 ('price', 'struct<$numberDouble:string,$numberInt:string>'),
 ('published_date', 'struct<$date:struct<$numberLong:string>>'),
 ('publisher', 'string'),
 ('rank', 'struct<$numberInt:string>'),
 ('rank_last_week', 'struct<$numberInt:string>'),
 ('title', 'string'),
 ('weeks_on_list', 'struct<$numberInt:string>')]

In [27]:
df.first()

Row(_id=Row($oid='5b4aa4ead3089013507db18b'), amazon_product_url='http://www.amazon.com/Odd-Hours-Dean-Koontz/dp/0553807056?tag=NYTBS-20', author='Dean R Koontz', bestsellers_date=Row($date=Row($numberLong='1211587200000')), description='Odd Thomas, who can communicate with the dead, confronts evil forces in a California coastal town.', price=Row($numberDouble=None, $numberInt='27'), published_date=Row($date=Row($numberLong='1212883200000')), publisher='Bantam', rank=Row($numberInt='1'), rank_last_week=Row($numberInt='0'), title='ODD HOURS', weeks_on_list=Row($numberInt='1'))

In [28]:
df.first()['amazon_product_url']

'http://www.amazon.com/Odd-Hours-Dean-Koontz/dp/0553807056?tag=NYTBS-20'

In [29]:
df.first()['author']

'Dean R Koontz'

In [30]:
df.take(5)

[Row(_id=Row($oid='5b4aa4ead3089013507db18b'), amazon_product_url='http://www.amazon.com/Odd-Hours-Dean-Koontz/dp/0553807056?tag=NYTBS-20', author='Dean R Koontz', bestsellers_date=Row($date=Row($numberLong='1211587200000')), description='Odd Thomas, who can communicate with the dead, confronts evil forces in a California coastal town.', price=Row($numberDouble=None, $numberInt='27'), published_date=Row($date=Row($numberLong='1212883200000')), publisher='Bantam', rank=Row($numberInt='1'), rank_last_week=Row($numberInt='0'), title='ODD HOURS', weeks_on_list=Row($numberInt='1')),
 Row(_id=Row($oid='5b4aa4ead3089013507db18c'), amazon_product_url='http://www.amazon.com/The-Host-Novel-Stephenie-Meyer/dp/0316218502?tag=NYTBS-20', author='Stephenie Meyer', bestsellers_date=Row($date=Row($numberLong='1211587200000')), description='Aliens have taken control of the minds and bodies of most humans, but one woman won’t surrender.', price=Row($numberDouble='25.99', $numberInt=None), published_date=

In [31]:
df.describe().show()

+-------+--------------------+---------------+--------------------+---------+------------------+
|summary|  amazon_product_url|         author|         description|publisher|             title|
+-------+--------------------+---------------+--------------------+---------+------------------+
|  count|               10195|          10195|               10195|    10195|             10195|
|   mean|                null|           null|                null|     null|1877.7142857142858|
| stddev|                null|           null|                null|     null| 370.9760613506458|
|    min|http://www.amazon...|        AJ Finn|                    |      ACE|  10TH ANNIVERSARY|
|    max|https://www.amazo...|various authors|’Tis for the Rebe...|allantine|               ZOO|
+-------+--------------------+---------------+--------------------+---------+------------------+



In [32]:
df.distinct().count()

10195

In [33]:
# GROUP BY
df.groupBy('author').count().show()

+--------------------+-----+
|              author|count|
+--------------------+-----+
|          James Frey|    2|
|    Elin Hilderbrand|   58|
|   Sharon Kay Penman|    2|
|         Lisa Genova|    7|
|        Will Allison|    1|
|   Patricia Cornwell|   64|
|       Laurie R King|    6|
|          Tea Obreht|    6|
|        Sarah Dunant|    1|
|        Tim Johnston|    1|
|          Sara Gruen|   13|
|Tom Clancy with P...|    9|
|         Andre Dubus|    1|
|        Terry Brooks|   19|
|        Lisa Wingate|   42|
|        Daniel Silva|   69|
|Karen White, Beat...|    1|
|      Rachel Kushner|    3|
|      Jackie Collins|   14|
|          Pat Conroy|   11|
+--------------------+-----+
only showing top 20 rows



In [34]:
# Filter Operation
df.filter(df['title'] == 'THE HOST').show(5)

+--------------------+--------------------+---------------+-----------------+--------------------+--------+-----------------+-------------+----+--------------+--------+-------------+
|                 _id|  amazon_product_url|         author| bestsellers_date|         description|   price|   published_date|    publisher|rank|rank_last_week|   title|weeks_on_list|
+--------------------+--------------------+---------------+-----------------+--------------------+--------+-----------------+-------------+----+--------------+--------+-------------+
|[5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|[[1211587200000]]|Aliens have taken...|[25.99,]|[[1212883200000]]|Little, Brown| [2]|           [1]|THE HOST|          [3]|
|[5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|[[1212192000000]]|Aliens have taken...|[25.99,]|[[1213488000000]]|Little, Brown| [2]|           [2]|THE HOST|          [4]|
|[5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|[[1212796800000]]|Aliens h

In [35]:
# Missing and Replacing Values
df.na.fill()
# OR
df.fillna()

TypeError: fill() missing 1 required positional argument: 'value'

In [36]:
# Repartitioning
df.rdd.getNumPartitions()

2

In [37]:
df.repartition(10).rdd.getNumPartitions()

10

In [38]:
df.coalesce(2).rdd.getNumPartitions()

2

In [None]:
# CONVERSION
# Converting dataframe into an RDD
rdd_convert = df.rdd
# Converting dataframe into a RDD of string 
df.toJSON().first()
# Obtaining contents of df as Pandas 
df.toPandas()

In [None]:
# Write & Save File in .parquet format
dataframe.select("author", "title", "rank", "description") \
.write \
.save("Rankings_Descriptions.parquet")


# Write & Save File in .json format
dataframe.select("author", "title") \
.write \
.save("Authors_Titles.json",format="json")

In [39]:
spark.stop()

In [40]:
df.show()

Py4JJavaError: An error occurred while calling o31.showString.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:748)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:111)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1471)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.buildReader(JsonFileFormat.scala:101)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:398)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:389)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:47)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:434)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3625)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2902)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
	at sun.reflect.GeneratedMethodAccessor74.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
