In [2]:
# Importar las librerías necesarias
from pyspark.sql import SparkSession

# Inicializar la SparkSession
spark = SparkSession.builder \
    .appName("Flight CSV Reader") \
    .getOrCreate()

# Leer el archivo CSV
df = spark.read.csv("flights.csv", header=True, inferSchema=True)

# Mostrar las primeras filas del DataFrame
df.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|    1|  1|       1|       96|     235|       70|     AS| N508AS|   145|   PDX| ANC|     194|    1542|   0|     1|
|2014|    1|  1|       4|       -6|     738|      -23|     US| N195UW|  1830|   SEA| CLT|     252|    2279|   0|     4|
|2014|    1|  1|       8|       13|     548|       -4|     UA| N37422|  1609|   PDX| IAH|     201|    1825|   0|     8|
|2014|    1|  1|      28|       -2|     800|      -23|     US| N547UW|   466|   PDX| CLT|     251|    2282|   0|    28|
|2014|    1|  1|      34|       44|     325|       43|     AS| N762AS|   121|   SEA| ANC|     201|    1448|   0|    34|
|2014|    1|  1|      37|       82|     

In [15]:
from pyspark.ml.feature import VectorAssembler

# Lista todas las columnas del DataFrame original
todas_las_columnas = df.columns


# Filtrar para obtener solo las columnas que no son de tipo string
column_data_types = df.dtypes
columnas_ensamblar = [name for name, dtype in column_data_types if dtype != 'string' and name != 'arr_delay']
columnas_ensamblar = [
    'monthIndexed' if col == 'month' else
    'carrierIndexed' if col == 'carrier' else
    col
    for col in columnas_ensamblar
]

print(columnas_ensamblar)

['year', 'monthIndexed', 'day', 'flight', 'distance']


In [17]:
vectorAssembler = VectorAssembler(
    inputCols=columnas_ensamblar,
    outputCol="features"
)

#df_assembled = vectorAssembler.transform(df)

In [18]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="carrierIndexed", outputCol="carrier")

In [19]:
a =StringIndexer().setInputCol("monthIndexed ").setOutputCol("month")

In [20]:
b=StringIndexer().setInputCol("carrierIndexed ").setOutputCol("carrier")

In [24]:
# Filtrar los vuelos que llegan con retraso positivo
delayed_flights = df.where("arr_delay > 20")

# Mostrar el resultado
delayed_flights.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|    1|  1|       1|       96|     235|       70|     AS| N508AS|   145|   PDX| ANC|     194|    1542|   0|     1|
|2014|    1|  1|      34|       44|     325|       43|     AS| N762AS|   121|   SEA| ANC|     201|    1448|   0|    34|
|2014|    1|  1|      37|       82|     747|       88|     DL| N806DN|  1823|   SEA| DTW|     224|    1927|   0|    37|
|2014|    1|  1|     346|      227|     936|      219|     UA| N14219|  1481|   SEA| ORD|     202|    1721|   3|    46|
|2014|    1|  1|     527|        7|     917|       24|     UA| N75433|  1576|   SEA| DEN|     136|    1024|   5|    27|
|2014|    1|  1|     650|       90|    1

In [32]:
#myDF = spark.read.format(<formato>)
#         .load("/path/to/hdfs/file")  # spark es el objeto SparkSession
#   # <formato> puede ser “parquet” | “json” | “csv” | “orc” | “avro”

In [28]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

fileSchema = StructType([
             StructField("dest_country_name", StringType(), True),
             StructField("origin_country_name", StringType(), True),
             StructField("count", LongType(), False)
])

In [33]:
#myDF = spark.read.format("json")
#       .schema(fileSchema)
#       .load(“/path/to/file.json”) # spark es el objeto SparkSession

In [None]:
#df.write.format(“csv”)
#         .mode(“overwrite”)
#         .option(“sep”, “\t”)
#         .option(“header”, “true”)
#         .save("path/to/hdfs/directory")

In [35]:
# Empezamos obteniendo la SparkSession.
# Como, en este caso, no vamos a usar RDD, no es necesario
# obtener el SparkContext:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("ejemplo_DF")\
  .getOrCreate()

# Lo siguiente es cargar los datos de fichero
# en un DataFrame, usando las funciones de lectura
# que vimos anteriormente.
# Concretamente, usamos la opción header, que nos
# permite no tener que hacer código auxiliar para no cargar
# la cabecera como datos, como pasaba con RDD:
flightsDF = spark.read\
           .option("header", "true")\
           .csv("flights.csv")

# Cuando usamos DataFrames, tenemos dos opciones para obtener
# registros de ellos:
# Usar la función take de RDD:
print(flightsDF.take(5))

# Usar la función show de DataFrames:
print(flightsDF.show(5))

[Row(year='2014', month='1', day='1', dep_time='1', dep_delay='96', arr_time='235', arr_delay='70', carrier='AS', tailnum='N508AS', flight='145', origin='PDX', dest='ANC', air_time='194', distance='1542', hour='0', minute='1'), Row(year='2014', month='1', day='1', dep_time='4', dep_delay='-6', arr_time='738', arr_delay='-23', carrier='US', tailnum='N195UW', flight='1830', origin='SEA', dest='CLT', air_time='252', distance='2279', hour='0', minute='4'), Row(year='2014', month='1', day='1', dep_time='8', dep_delay='13', arr_time='548', arr_delay='-4', carrier='UA', tailnum='N37422', flight='1609', origin='PDX', dest='IAH', air_time='201', distance='1825', hour='0', minute='8'), Row(year='2014', month='1', day='1', dep_time='28', dep_delay='-2', arr_time='800', arr_delay='-23', carrier='US', tailnum='N547UW', flight='466', origin='PDX', dest='CLT', air_time='251', distance='2282', hour='0', minute='28'), Row(year='2014', month='1', day='1', dep_time='34', dep_delay='44', arr_time='325', a

In [37]:
# Otro método útil que nos proporciona la API estructurada
# es printSchema, para comprobar el tipo de datos de cada
# columna del DataFrame:
flightsDF.printSchema()

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: string (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)



In [41]:
# Esto nos da pie a convertir el tipo de las columnas
# año, mes y día, y ver así cómo se hace.
flightsDF = flightsDF\
           .withColumn('year', F.col('year').cast(IntegerType()))\
           .withColumn('month', F.col('month').cast(IntegerType()))\
           .withColumn('day', F.col('day').cast(IntegerType()))

NameError: name 'IntegerType' is not defined

In [40]:
# Ahora, para contar cuantos vuelos llegan a cada aeropuerto de
# destino, basta con agrupar por destino y contar registros por grupo:
flights_destDF = flightsDF.groupBy('dest').count()

# Le cambiamos el nombre a la columna count para ver cómo se hace:
flights_destDF = flights_destDF\
                  .withColumnRenamed('count','dest_count')

# Ordenamos el DataFrame por número de vuelos en orden
# descendente y mostramos los 5 primeros:
flights_destDF.orderBy(F.desc('dest_count')).show(5)

+----+----------+
|dest|dest_count|
+----+----------+
| SFO|     12809|
| LAX|     10456|
| DEN|      9518|
| PHX|      8660|
| LAS|      8214|
+----+----------+
only showing top 5 rows



In [43]:
# Para mostrar también el nombre de los aeropuertos,
# vamos a cargar el fichero correspondiente en otro DataFrame:
airportsDF = spark.read\
            .option("header", "true")\
            .csv("airport-codes.csv")

# Y solo queda hacer el join. Del resultado del join, nos quedamos
# con el nombre del aeropuerto (name) y el número de vuelos,
# y ordenamos por número de vuelos en orden descendente:
flights_airports = flights_destDF\
         .join(airportsDF, flights_destDF.dest == airportsDF.iata_code)\
         .select(F.col('name'), F.col('dest_count'))\
         .orderBy(F.desc('dest_count'))
flights_airports.show(5)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/Pablo/airport-codes.csv.

In [None]:
# Por último, para demostrar el uso, escribimos el resultado en ficheros:
flights_airports.write\
  .format('csv')\
  .save("hdfs://192.168.240.4:9000/user/data/flights_dest_airports")

In [44]:
# Supongamos que tenemos la SparkSession ya cargada.
# Empezamos por cargar los datos:
flightsDF = spark.read\
            .option("header", "true")\
            .csv("hdfs://<ip:port>/user/data/flights.csv")

# Para el uso de la API estructurada, es primordial registrar
# todos los DataFrames que queramos usar como tablas o vistas:
flightsDF.createOrReplaceTempView('flights')

# Calculamos los vuelos que llegan a cada aeropuerto
# de destino usando una sentencia SQL:
flights_dest_count = spark.sql('SELECT dest, COUNT(dest) AS dest_count FROM flights GROUP BY dest ORDER BY dest_count DESC')

# De nuevo, para trabajar posteriormente con el resultado
# de esta consulta, necesitamos registrarla como vista:
flights_dest_count.createOrReplaceTempView('flights_dest_count')
flights_dest_count.show(5)

Py4JJavaError: An error occurred while calling o199.csv.
: java.io.IOException: Incomplete HDFS URI, no host: hdfs://%3Cip:port%3E/user/data/flights.csv
	at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:184)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3469)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:724)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:551)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:404)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:538)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)


In [None]:
# Cargamos la información sobre los aeropuertos:
airportsDF = spark.read\
            .option("header", "true")\
            .csv("hdfs://192.168.240.4:9000/user/data/airport-codes.csv")
# Registramos el DataFrame como vista:
airportsDF.createOrReplaceTempView('airports')

# Llegados a este punto, solo resta hacer el join:
flights_dest_airports = spark.sql('SELECT a.name, f.dest_count FROM flights_dest_count f JOIN airports a ON f.dest=a.iata_code ORDER BY dest_count DESC')
flights_dest_airports.show(5)

In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Crear la sesión de Spark
spark = SparkSession.builder.appName("Ejemplo").getOrCreate()
# Crear el DataFrame
data = [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.51]), 1.0)]

# Definir los nombres de las columnas
columnas = ["id", "hour", "mobile", "userFeatures", "clicked"]
# Crear el DataFrame
dataset = spark.createDataFrame(data, columnas)

# Crear el VectorAssembler
assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features"
)

# Transformar el DataFrame
output = assembler.transform(dataset)
print("Assembled hour, mobile, userFeatures to column 'features'")
output.select("features", "clicked").show(truncate=False)

Py4JJavaError: An error occurred while calling o101.transform.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (DESKTOP-3AMGHHK executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, El sistema no puede encontrar el archivo especificado
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.IOException: CreateProcess error=2, El sistema no puede encontrar el archivo especificado
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:499)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:158)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
	... 32 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4218)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3202)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4208)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4206)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4206)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3202)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3209)
	at org.apache.spark.sql.Dataset.first(Dataset.scala:3216)
	at org.apache.spark.ml.feature.VectorAssembler$.getVectorLengthsFromFirstRow(VectorAssembler.scala:205)
	at org.apache.spark.ml.feature.VectorAssembler$.getLengths(VectorAssembler.scala:231)
	at org.apache.spark.ml.feature.VectorAssembler.transform(VectorAssembler.scala:95)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.IOException: Cannot run program "python3": CreateProcess error=2, El sistema no puede encontrar el archivo especificado
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1143)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1073)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: java.io.IOException: CreateProcess error=2, El sistema no puede encontrar el archivo especificado
	at java.base/java.lang.ProcessImpl.create(Native Method)
	at java.base/java.lang.ProcessImpl.<init>(ProcessImpl.java:499)
	at java.base/java.lang.ProcessImpl.start(ProcessImpl.java:158)
	at java.base/java.lang.ProcessBuilder.start(ProcessBuilder.java:1110)
	... 32 more


In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

