In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("AnalisisTransacciones")
    .getOrCreate()
)


In [2]:
# Load the transactions file; the "multiline" option lets a single JSON
# record span several lines instead of requiring one record per line.
df = (
    spark.read
    .option("multiline", "true")
    .json("transacciones.json")
)
df.show()

+------------+-------+---+------+
|   categoria|cliente| id| monto|
+------------+-------+---+------+
|     Compras|    Ana|  1|150.75|
|      Viajes| Carlos|  2| 230.5|
|Alimentación|  Elena|  3|  85.3|
| Electrónica| Miguel|  4| 320.0|
+------------+-------+---+------+



In [3]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- categoria: string (nullable = true)
 |-- cliente: string (nullable = true)
 |-- id: long (nullable = true)
 |-- monto: double (nullable = true)



In [4]:
# Expose the DataFrame to Spark SQL under the name "transacciones"
# (replacing any view previously registered with that name).
df.createOrReplaceTempView("transacciones")

# Keep only client and amount for transactions whose amount exceeds 100.
consulta = spark.sql(
    "SELECT cliente, monto FROM transacciones WHERE monto > 100"
)
consulta.show()

+-------+------+
|cliente| monto|
+-------+------+
|    Ana|150.75|
| Carlos| 230.5|
| Miguel| 320.0|
+-------+------+



In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def clasificar_monto(monto):
    """Classify a transaction amount as "Alta" or "Baja".

    Args:
        monto: Numeric transaction amount, or None when the value is
            missing (the "monto" column is nullable, and Spark hands SQL
            NULLs to Python UDFs as None).

    Returns:
        "Alta" when monto is strictly greater than 200, "Baja" for any
        other number, and None when monto is None so that a missing
        amount yields a null label instead of a TypeError.
    """
    # Comparing None with an int raises TypeError in Python 3, which would
    # abort the whole Spark job on the first null amount.
    if monto is None:
        return None
    return "Alta" if monto > 200 else "Baja"

# Wrap the Python classifier as a Spark UDF that produces string values.
udf_clasificar = udf(clasificar_monto, returnType=StringType())

# Derive the "categoria_monto" label column from "monto" and display it.
df3 = df.withColumn(
    "categoria_monto",
    udf_clasificar(df["monto"]),
)
df3.show()


+------------+-------+---+------+---------------+
|   categoria|cliente| id| monto|categoria_monto|
+------------+-------+---+------+---------------+
|     Compras|    Ana|  1|150.75|           Baja|
|      Viajes| Carlos|  2| 230.5|           Alta|
|Alimentación|  Elena|  3|  85.3|
| Electrónica| Miguel|  4| 320.0|
+------------+-------+---+------+---------------+



In [6]:
# Persist the labelled transactions as Parquet. The save mode is set to
# "overwrite" because the default mode raises an error when the target path
# already exists, which would make this cell fail on every re-run.
df3.write.mode("overwrite").parquet("transacciones.parquet")

In [7]:
# Collapse to a single partition before writing so the output is produced
# by one task, replacing any existing data at the target path.
(
    df3.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("transacciones2.parquet")
)