In [89]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
import sys

In [90]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

#### Criar sessão PySpark

In [91]:
spark = (
  SparkSession.builder
  .master('local')
  .appName('PySpark_01')
  .getOrCreate()
)

#### Criar DF / ler arquivo

In [92]:
df = spark.read.csv('./netflix_titles.csv', header=True, inferSchema=True)

#### Exibir DF

In [93]:
df.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         NULL|Septem

#### Verificar tipos de colunas

In [94]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



#### Verificando dados nulos

In [95]:
for coluna in df.columns:
  print(coluna, df.filter(df[coluna].isNull()).count())

show_id 0
type 1
title 2
director 2636
cast 826
country 832
date_added 13
release_year 2
rating 6
duration 5
listed_in 3
description 3


#### Renomeando Colunas

In [96]:
portDf = df.withColumnRenamed('show_id', 'id')\
.withColumnRenamed('type', 'tipo')\
.withColumnRenamed('title', 'titulo')\
.withColumnRenamed('director', 'diretor')\
.withColumnRenamed('cast' ,'atores_principais')\
.withColumnRenamed('country', 'pais')\
.withColumnRenamed('date_added', 'data_dicionado')\
.withColumnRenamed('release_year', 'ano_lancamento')\
.withColumnRenamed('rating', 'classificacao')\
.withColumnRenamed('duration', 'duracao')\
.withColumnRenamed('listed_in', 'categoria')\
.withColumnRenamed('description', 'descricao').show(5)

+---+-------+--------------------+---------------+--------------------+-------------+------------------+--------------+-------------+---------+--------------------+--------------------+
| id|   tipo|              titulo|        diretor|   atores_principais|         pais|    data_dicionado|ano_lancamento|classificacao|  duracao|           categoria|           descricao|
+---+-------+--------------------+---------------+--------------------+-------------+------------------+--------------+-------------+---------+--------------------+--------------------+
| s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|          2020|        PG-13|   90 min|       Documentaries|As her father nea...|
| s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|          2021|        TV-MA|2 Seasons|International TV ...|After crossing pa...|
| s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...

#### Selecionar Colunas

In [97]:
df.select('title', 'description').show(5)

+--------------------+--------------------+
|               title|         description|
+--------------------+--------------------+
|Dick Johnson Is Dead|As her father nea...|
|       Blood & Water|After crossing pa...|
|           Ganglands|To protect his fa...|
|Jailbirds New Orl...|Feuds, flirtation...|
|        Kota Factory|In a city of coac...|
+--------------------+--------------------+
only showing top 5 rows



In [98]:
df.select(col('title'), col('description')).show(5)

+--------------------+--------------------+
|               title|         description|
+--------------------+--------------------+
|Dick Johnson Is Dead|As her father nea...|
|       Blood & Water|After crossing pa...|
|           Ganglands|To protect his fa...|
|Jailbirds New Orl...|Feuds, flirtation...|
|        Kota Factory|In a city of coac...|
+--------------------+--------------------+
only showing top 5 rows



In [99]:
df.select(df['title']).show(5)

+--------------------+
|               title|
+--------------------+
|Dick Johnson Is Dead|
|       Blood & Water|
|           Ganglands|
|Jailbirds New Orl...|
|        Kota Factory|
+--------------------+
only showing top 5 rows



#### Selecionar Colunas com ALIAS
**Pelo primeiro método de select, o alias não funciona por se tratar de uma string**

In [100]:
df.select(col('title').alias('titulo')).show(5)

+--------------------+
|              titulo|
+--------------------+
|Dick Johnson Is Dead|
|       Blood & Water|
|           Ganglands|
|Jailbirds New Orl...|
|        Kota Factory|
+--------------------+
only showing top 5 rows



In [101]:
df.select('title director cast'.split()).show(5)

+--------------------+---------------+--------------------+
|               title|       director|                cast|
+--------------------+---------------+--------------------+
|Dick Johnson Is Dead|Kirsten Johnson|                NULL|
|       Blood & Water|           NULL|Ama Qamata, Khosi...|
|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|
|Jailbirds New Orl...|           NULL|                NULL|
|        Kota Factory|           NULL|Mayur More, Jiten...|
+--------------------+---------------+--------------------+
only showing top 5 rows



#### Organizar Select

In [102]:
df.select('title', 'type', 'country').show(5)

+--------------------+-------+-------------+
|               title|   type|      country|
+--------------------+-------+-------------+
|Dick Johnson Is Dead|  Movie|United States|
|       Blood & Water|TV Show| South Africa|
|           Ganglands|TV Show|         NULL|
|Jailbirds New Orl...|TV Show|         NULL|
|        Kota Factory|TV Show|        India|
+--------------------+-------+-------------+
only showing top 5 rows



#### Filtrar DF
**Se tiver espaços, você deve utilizar a função col no nome da coluna**

In [103]:
# df.filter('country = "Brazil"').show()
df.filter(col('cast') == 'Afonso Padilha').show()

+-------+-----+--------------------+--------------------+--------------+-------+-----------------+------------+------+--------+---------------+--------------------+
|show_id| type|               title|            director|          cast|country|       date_added|release_year|rating|duration|      listed_in|         description|
+-------+-----+--------------------+--------------------+--------------+-------+-----------------+------------+------+--------+---------------+--------------------+
|  s2051|Movie|Afonso Padilha: C...|Junior Carelli, R...|Afonso Padilha| Brazil|September 3, 2020|        2020| TV-MA|  63 min|Stand-Up Comedy|Brazilian comedia...|
+-------+-----+--------------------+--------------------+--------------+-------+-----------------+------------+------+--------+---------------+--------------------+



#### Filtrar com duas condições (AND/&)

In [104]:
df.filter((col('country') == 'Brazil') & (col('type') == 'Movie') & (col('release_year') == '2021')).show()

+-------+-----+-------------+-------------+--------------------+-------+--------------+------------+------+--------+--------------------+--------------------+
|show_id| type|        title|     director|                cast|country|    date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+-------------+-------------+--------------------+-------+--------------+------------+------+--------+--------------------+--------------------+
|   s784|Movie|     Carnaval| Leandro Neri|Giovana Cordeiro,...| Brazil|  June 2, 2021|        2021| TV-MA|  95 min|Comedies, Interna...|After a breakup, ...|
|   s967|Movie|Get the Grift|Pedro Antonio|Marcus Majella, S...| Brazil|April 28, 2021|        2021| TV-MA|  95 min|Comedies, Interna...|After a botched s...|
|  s1189|Movie| Get the Goat| Vitor Brandt|Matheus Nachterga...| Brazil|March 18, 2021|        2021| TV-14|  98 min|Action & Adventur...|Two hapless cops ...|
+-------+-----+-------------+-------------+---

#### Filtrar com 2 condições (OR/|)

In [105]:
df.filter((col('director') == 'Theodore Melfi') | (col('director') == 'Andy Devonshire')).show()

+-------+-------+--------------------+---------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|       country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|     s9|TV Show|The Great British...|Andy Devonshire|Mel Giedroyc, Sue...|United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|
|    s10|  Movie|        The Starling| Theodore Melfi|Melissa McCarthy,...| United States|September 24, 2021|        2021| PG-13|  104 min|    Comedies, Dramas|A woman adjusting...|
+-------+-------+--------------------+---------------+--------------------+--------------+

#### Filtrar combinando & e |

In [106]:
df.filter((col('director') == 'Theodore Melfi') & (col('type') == 'Movie') | (col('country') == 'South Africa') & (col('type') == 'Movie')).show(5)

+-------+-----+--------------------+--------------+--------------------+-------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|      director|                cast|      country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------+--------------------+-------------+------------------+------------+------+--------+--------------------+--------------------+
|    s10|Movie|        The Starling|Theodore Melfi|Melissa McCarthy,...|United States|September 24, 2021|        2021| PG-13| 104 min|    Comedies, Dramas|A woman adjusting...|
|   s294|Movie|                Slay|     Adze Ugah|Enhle Mbali, Rams...| South Africa|    August 8, 2021|        2021| TV-MA|  86 min|Comedies, Interna...|In pursuit of bot...|
|   s765|Movie|Trippin' with the...| Jayan Moodley|Jailoshini Naidoo...| South Africa|      June 4, 2021|        20

#### Criando novas colunas (usando a função lit)

In [107]:
df.withColumn('streaming', lit('Netflix')).show(5)
# Também é possível passar funções dentro do lit
df.withColumn('higher_2020', lit(col('release_year') > 2019)).show(20)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+---------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|streaming|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+---------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|  Netflix|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|  Netflix|
|     s3|TV Show|           Ganglands|Julien 

#### Criar coluna condicional (usando função substring)

In [114]:
df.withColumn('id_without_s', substring('show_id', 2, 5)).show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|id_without_s|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|           1|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|           2|
|     s3|TV Show|           Ga

#### Criar coluna condicional (concat/concat_ws)

In [109]:
df.withColumn('release_duration', concat('duration', 'release_year')).select('release_duration').show(5) #Como podemos reparar, sem espaços

+----------------+
|release_duration|
+----------------+
|      90 min2020|
|   2 Seasons2021|
|    1 Season2021|
|    1 Season2021|
|   2 Seasons2021|
+----------------+
only showing top 5 rows



In [110]:
df.withColumn('release_duration', concat_ws(' - ', 'duration', 'release_year')).select('release_duration').show(5)
#Como vemos, com o separador determinado

+----------------+
|release_duration|
+----------------+
|   90 min - 2020|
|2 Seasons - 2021|
| 1 Season - 2021|
| 1 Season - 2021|
|2 Seasons - 2021|
+----------------+
only showing top 5 rows



#### Alterar tipo de coluna

In [111]:
df = df.withColumn('release_year', col('release_year').cast(IntegerType()))

In [112]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



#### Usando funções lambdas

In [113]:
onlyDateAdded = udf(lambda date: date.split(',')[0] if date else None)

df.withColumn('day', onlyDateAdded('date_added')).show(5)
# Esse arquivo tem dados nulos e infelizmente nao to conseguindo utilizar a função lambda para separar as datas

Py4JJavaError: An error occurred while calling o789.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 152.0 failed 1 times, most recent failure: Lost task 0.0 in stage 152.0 (TID 128) (host.docker.internal executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(Unknown Source)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(Unknown Source)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more
