# Chapter 4

## Creamos una vista temporal

In [0]:
schema = "`date` STRING, `delay` INT, `distance` INT, `origin` STRING, `destination` STRING"

In [0]:
# In Python
from pyspark.sql import SparkSession 
# Create a SparkSession
spark = (SparkSession
 .builder
 .appName("SparkSQLExampleApp")
 .getOrCreate())
# Path to data set
csv_file = "/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Read and create a temporary view
# Infer schema (note that for larger files you 
# may want to specify the schema)
df = (spark.read.schema(schema).format("csv")
 .option("header", "true")
 .load(csv_file))
df.createOrReplaceTempView("us_delay_flights_tbl")

In [0]:
df.columns

Out[3]: ['date', 'delay', 'distance', 'origin', 'destination']

In [0]:
df.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



**Ejercicio: Convertir la columna fecha a un formato legible**

In [0]:
from pyspark.sql.functions import *
df_new = (df
         .withColumn("DateAndHour", to_timestamp(col("date"), "mm-dd hh:mm")))

In [0]:
df_new.show()

+--------+-----+--------+------+-----------+-----------+
|    date|delay|distance|origin|destination|DateAndHour|
+--------+-----+--------+------+-----------+-----------+
|01011245|    6|     602|   ABE|        ATL|       null|
|01020600|   -8|     369|   ABE|        DTW|       null|
|01021245|   -2|     602|   ABE|        ATL|       null|
|01020605|   -4|     602|   ABE|        ATL|       null|
|01031245|   -4|     602|   ABE|        ATL|       null|
|01030605|    0|     602|   ABE|        ATL|       null|
|01041243|   10|     602|   ABE|        ATL|       null|
|01040605|   28|     602|   ABE|        ATL|       null|
|01051245|   88|     602|   ABE|        ATL|       null|
|01050605|    9|     602|   ABE|        ATL|       null|
|01061215|   -6|     602|   ABE|        ATL|       null|
|01061725|   69|     602|   ABE|        ATL|       null|
|01061230|    0|     369|   ABE|        DTW|       null|
|01060625|   -3|     602|   ABE|        ATL|       null|
|01070600|    0|     369|   ABE

## Consultas SQL

Buscamos todos los vuelos cuya distancia sea mayor a 1,000 millas

In [0]:
spark.sql("""SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC""").show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



Todos los vuelos entre San Francisco (SFO) y Chicago (ORD) con al menos dos horas de retraso

In [0]:
spark.sql("""SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC""").show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



Etiquetar todos los vuelos de EE. UU., independientemente de su origen y destino, con una indicación de los retrasos que experimentaron: retrasos muy largos (> 6 horas), retrasos largos (2 a 6 horas), etc. Agregaremos estas etiquetas legibles por humanos en una nueva columna llamada Flight_Delays

In [0]:
spark.sql("""SELECT delay, origin, destination,
 CASE 
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
 WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
 WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)
#Supongo que CASE será para crear nuevas columnas

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



Las consultas anteriores se pueden expresar como consultas API. Por ejemplo la primera:

In [0]:
from pyspark.sql.functions import col, desc
(df.select("distance", "origin", "destination")
 .where(col("distance") > 1000)
 .orderBy(desc("distance"))).show(10)
# Or
(df.select("distance", "origin", "destination")
 .where("distance > 1000")
 .orderBy("distance", ascending=False).show(10))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



**Ejercicio: Hacer las otras 2 consultas usando la API de DataFrame**

La segunda

In [0]:
#spark.sql("""SELECT date, delay, origin, destination 
#FROM us_delay_flights_tbl 
#WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
#ORDER by delay DESC""").show(10)

(df
 .select(df.date, df.delay, df.origin, df.destination)
 .where((df.delay > 120) & (df.origin == 'SFO') & (df.destination == 'ORD'))
 .orderBy(df.delay, ascending = False)
 .show(5)
)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 5 rows



La tercera

In [0]:
#spark.sql("""SELECT delay, origin, destination,
#CASE
#WHEN delay > 360 THEN 'Very Long Delays'
#WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
#WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
#WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
#WHEN delay = 0 THEN 'No Delays'
#ELSE 'Early'
#END AS Flight_Delays
#FROM us_delay_flights_tbl
#ORDER BY origin, delay DESC""").show(10)
from pyspark.sql.functions import *
df_3 = (df.withColumn("Flight_Delays", \
                        when((df.delay > 360), lit("Very Long Delays")) \
                        when((df.delay > 120) & (df.delay < 360), lit("Long Delays")) \
                        when(col("delay") > 60 & col("delay") < 120,
                            lit("Short Delays"))
                        when(col("delay") > 0 & col("delay") < 60,
                            lit("Tolerable Delays"))
                        when(col("delay") == 0,
                            lit("No Delays"))
                        otherwise(lit("Early")))
.orderBy(df.origin, ascending=False)
.show(10))

[0;36m  File [0;32m"<command-447741929212287>"[0;36m, line [0;32m15[0m
[0;31m    when((df.delay > 120) & (df.delay < 360), lit("Long Delays")) \[0m
[0m    ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:
from pyspark.sql.functions import *
df_3 = df.withColumn("Flight_Delays", \
                     when((df.delay > 360), lit("Very Long Delays")) \
                     when((df.delay < 120), lit("C"))
                    )

[0;36m  File [0;32m"<command-447741929212288>"[0;36m, line [0;32m4[0m
[0;31m    when((df.delay < 120), lit("C"))[0m
[0m    ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:
df_3.show(5)

+--------+-----+--------+------+-----------+-------------+
|    date|delay|distance|origin|destination|Flight_Delays|
+--------+-----+--------+------+-----------+-------------+
|01011245|    6|     602|   ABE|        ATL|         null|
|01020600|   -8|     369|   ABE|        DTW|         null|
|01021245|   -2|     602|   ABE|        ATL|         null|
|01020605|   -4|     602|   ABE|        ATL|         null|
|01031245|   -4|     602|   ABE|        ATL|         null|
+--------+-----+--------+------+-----------+-------------+
only showing top 5 rows



## Creando Tablas y Databases de SQL

### Databases

In [0]:
spark.sql("CREATE DATABASE learn_spark_db")
spark.sql("USE learn_spark_db")

Out[45]: DataFrame[]

Ahora todas las tablas que se creen estarán dentro de esa base de datos

### Tables

#### Managed
Las tablas administradas administran tanto los metadatos como los datos. Si se ejecuta un DROP se eliminarían los datos reales y los metadatos.

In [0]:
spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

Out[72]: DataFrame[]

In [0]:
spark.sql("DROP table managed_us_delay_flights_tbl")

Out[74]: DataFrame[]

Utilizando la API

In [0]:
csv_file = "/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)
flights_df.write.saveAsTable("managed_us_delay_flights_tbl")

In [0]:
flights_df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|    date| null|    null|origin|destination|
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|

Aunque cierre un cluster y abra otro, lo que he hecho en sesiones anteriores se guarda en el fichero de metadatos. Al abrir otra sesión, la tabla no aparece en el contenido de las tablas, pero no me deja volverla a crear porque ya existen los metadatos de la misma. Tendré que eliminar el registro

In [0]:
dbutils.fs.rm("dbfs:/user/hive/warehouse/learn_spark_db.db/managed_us_delay_flights_tbl",recurse=True)

Out[66]: False

In [0]:
spark.sql("DROP table managed_us_delay_flights_tbl")

Out[79]: DataFrame[]

#### Unmanaged
En las tablas no administradas, Spark solo administra los metadatos mientras que el propio usuario maneja los datos. Aquí el DROP solo eliminaría los metadatos.

In [0]:
#SQL
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT,
 distance INT, origin STRING, destination STRING)
 USING csv OPTIONS (PATH
 '/learning-spark-v2/flights/departuredelays.csv')""")

Out[149]: DataFrame[]

Para eliminar se ha de especificar la ruta de los datos ya que en las unmanaged tables, Spark solo controla los metadatos y no sabe dónde se especifican los datos

In [0]:
spark.sql("""DROP TABLE IF EXISTS us_delay_flights_tbl""") #deletes the metadata
dbutils.fs.rm("/learning-spark-v2/flights/departuredelays.csv", True)   # deletes the data"""

Out[148]: True

In [0]:
%fs
ls 'dbfs:/databricks-datasets/'

path,name,size,modificationTime
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0


In [0]:
#API
(flights_df
 .write
 .option("path", "/learning-spark-v2/flights/departuredelays.csv")
 .saveAsTable("us_delay_flights_tbl"))

## Creando vistas
Las vistas se crean sobre tablas existentes y pueden ser globales o session-scoped. Las globales son visibles en todas las Spark-Sessions de un cluster y la segunda solo es visible para una sola Spark-session (son temporales, desaparecen después de la finalización de la Spark Application.

#### Global
Son visibles en todos los SparkSession en un cluster dado

In [0]:
#A partir de una tabla existente: SQL
spark.sql("""CREATE OR REPLACE GLOBAL TEMP VIEW us_origin_airport_SFO_global_tmp_view AS
 SELECT date, delay, origin, destination from us_delay_flights_tbl WHERE
 origin = 'SFO'""")

Out[89]: DataFrame[]

In [0]:
#API
df_sfo = spark.sql("""SELECT date, delay, origin, destination FROM 
 us_delay_flights_tbl WHERE origin = 'SFO'""")
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_sfo.show(5)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01011250|   55|   SFO|        JFK|
|01012230|    0|   SFO|        JFK|
|01010705|   -7|   SFO|        JFK|
|01010620|   -3|   SFO|        MIA|
|01010915|   -3|   SFO|        LAX|
+--------+-----+------+-----------+
only showing top 5 rows



**Eliminar vistas**

In [0]:
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")

#### Temporal
Las vistas son visibles para un único SparkSessios y se eliminan después de finalizar la aplicación Spark

In [0]:
spark.sql("""CREATE OR REPLACE TEMP VIEW us_origin_airport_JFK_tmp_view AS
 SELECT date, delay, origin, destination from us_delay_flights_tbl WHERE
 origin = 'JFK'""")

Out[95]: DataFrame[]

In [0]:
#API
df_jfk = spark.sql("""SELECT date, delay, origin, destination FROM 
 us_delay_flights_tbl WHERE origin = 'JFK'""")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")
df_jfk.show(5)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01010900|   14|   JFK|        LAX|
|01011200|   -3|   JFK|        LAX|
|01011900|    2|   JFK|        LAX|
|01011700|   11|   JFK|        LAS|
|01010800|   -1|   JFK|        SFO|
+--------+-----+------+-----------+
only showing top 5 rows



**Eliminar vistas**

In [0]:
spark.catalog.dropTempView("us_origin_airport_JFK_tmp_view")

## Metadatos
Spark administra los metadatos asociados a cada tabla, ya sea administrada o no administrada. Para administrar los metadatos se utiliza Catalog la cual es una herramienta de alto nivel de Spark SQL para almacenar metadatos.

Después de crear la vartiable de la SparkSession, se puede acceder al metadata almacenado de la siaguiente manera:

In [0]:
#spark.catalog.listDatabases()
#spark.catalog.listTables()
spark.catalog.listColumns("us_delay_flights_tbl")

Out[109]: [Column(name='date', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

**Almacenamiento en caché de tablas SQL**

Se puede cache y uncahe tablas SQL y vistas. Además, si especificas la tabla como LAZY, se guardará en caché cuando se utiliza la tabla por primera vez y no inmediatamente cuando se crea.

In [0]:
#In SQL
spark.sql("CACHE [LAZY] TABLE <table-name>")
spark.sql("UNCACHE TABLE <table-name>")

## Leyendo tablas a DataFrames
Si ya hay una base de datos learn_spark_db y una tabla us_delay_flights_tbl preparadas para ser utilizadas. En vez de importar datos directamente del JSON file para tener un dataframe, podemos simplemente ejecutar una consulta SQL a la tabla y asignarle como resultado un DataFrame

In [0]:
#Dos formas
usFlightsDF = spark.sql("SELECT * FROM us_delay_flights_tbl")
usFlightsDF2 = spark.table("us_delay_flights_tbl")
usFlightsDF.show(5)


+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|    date| null|    null|origin|destination|
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



## Data Sources for DataFrames and SQL Tables
Spark SQL proporciona una gran variedad de data sources. Además de proporcionar un conjunto de métodos reales para leer y escribir desde estas fuentes utilizando la Data Sources API.

### DataFrameReader
Es la herramienta con la que se lee una fuente de datos en un DataFrame. Tiene un formato definido y un patrón recomendado de uso.

```DataFrameReader.format(args).option("key", "value").schema(args).load()```

**Opciones**

| Method | Arguments | Description |
| --- | --- | --- |
| format() | "parquet", "csv", "txt", "json","jdbc", "orc", "avro", etc. | If you don’t specify this method, then the default ismParquet or whatever is set in spark.sql.sources.default.|
| option() |("mode", {PERMISSIVE / FAILFAST / DROPMALFORMED } )("inferSchema", {true / false}) ("path", "path_file_data_source") | A series of key/value pairs and options. The Spark documentation shows some examples and explains the different modes and their actions. The default mode is PERMISSIVE. The "inferSchema" and "mode" options are specific to the JSON and CSV file formats.|
| schema() | DDL String or StructType, e.g., 'A INT, B STRING' orStructType(...) | For JSON or CSV format, you can specify to infer the schema in the option() method. Generally, providing a schema for any format makes loading faster and ensures your data conforms to the expected schema.|
| load() | "/path/to/data/source" | The path to the data source. This can be empty if specified in option("path", "...").|

**Ejemplos**

In [0]:
#Use Parquet
file = """/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet"""
df = spark.read.format("parquet").load(file)
#df2 = spark.read.load(file)
#Use CSV
df3 = spark.read.format("csv").option("inferSchema", "true").option("header", "true").option("mode", "PERMISSIVE").load("/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*")
#Use JSON
df4 = spark.read.format("json").load("/databricks-datasets/learning-spark-v2/flights/summary-data/json/*")

df.show(5)
df3.show(5)
df4.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-----------

En general, no se necesita ningún esquema cuando se lee desde una fuente de datos estática de Parquet; los metadatos de Parquet generalmente contienen el esquema, por lo que se deduce.

### DataFrameWriter
Guarda o escribe datos en una fuente de datos integrada especificada y estos son sus patrones de uso:
```
DataFrameWriter.format(args)
 .option(args)
 .bucketBy(args)
 .partitionBy(args)
 .save(path)
```
o

```
DataFrameWriter.format(args).option(args).sortBy(args).saveAsTable(table)
```

**Opciones**

| Method | Arguments | Description |
| --- | --- | --- |
| format() | "parquet", "csv", "txt", "json","jdbc", "orc", "avro", etc. | If you don’t specify this method, then the default is Parquet or whatever is set in spark.sql.sources.default.|
| option() | ("mode", {append / overwrite / ignore / error or errorifexists} ) ("mode", {SaveMode.Overwrite / SaveMode.Append, Save Mode.Ignore, SaveMode.ErrorIfExists}) ("path", "path_to_write_to") | A series of key/value pairs and options. The Spark documentation shows some examples. This is an overloaded method. The default mode options are error or error ifexists and SaveMode.ErrorIfExists; they throw an exception at runtime if the data already exists.|
| bucketBy() | (numBuckets, col, col..., coln) | The number of buckets and names of columns to bucket by. Uses Hive’s bucketing scheme on a filesystem.|
| save() | "/path/to/data/source" | The path to save to. This can be empty if specified in option("path", "..."). |
| saveAsTable() | "table_name" | The table to save to.|

**Ejemplo**

In [0]:
#Use JSON
location = """"/databricks-datasets/learning-spark-v2/
 sf-fire/Ejemplo_save_json.json"""
df.write.format("json").mode("overwrite").save(location)

In [0]:
%fs
ls /learning-spark-v2/sf-fire/Ejemplo_save_json

path,name,size,modificationTime
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_SUCCESS,_SUCCESS,0,1651142577000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_committed_2238006556770501276,_committed_2238006556770501276,114,1651142577000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_started_2238006556770501276,_started_2238006556770501276,0,1651142576000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/part-00000-tid-2238006556770501276-4704d525-889b-423b-8d58-a96e6976647e-280-1-c000.json,part-00000-tid-2238006556770501276-4704d525-889b-423b-8d58-a96e6976647e-280-1-c000.json,21353,1651142576000


## Fuentes de datos

### Parquet
La fuente de datos predeterminada de Spark

#### Leer de archivos Parquet en un DataFrame

In [0]:
file = """/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet/"""
df = spark.read.format("parquet").load(file)
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Leer de archivos Parquet en una tabla SQL

In [0]:
#Creamos la tabla
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING parquet
 OPTIONS (
 path "/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet/" )
""")

Out[154]: DataFrame[]

In [0]:
#Leemos la tabla
spark.sql("SELECT * FROM us_delay_flights_tbl").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Escribir DataFrames a archivos Parquet

In [0]:
(df.write.format("parquet")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/parquet/df_parquet"))

In [0]:
spark.sql("DROP table us_delay_flights_tbl")

Out[162]: DataFrame[]

#### Escribir DataFrames a tablas SQL

In [0]:
(df.write
 .mode("overwrite")
 .saveAsTable("us_delay_flights_tbl"))

### JSON
En Spark se soporta tanto el formato sigle-line mode como el multiline mode.

#### Leer de archivos JSON en un DataFrame

In [0]:
file = "/databricks-datasets/learning-spark-v2/flights/summary-data/json/*"
df = spark.read.format("json").load(file)

#### Leer de archivos JSON en una tabla SQL

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING json
 OPTIONS (
 path "/databricks-datasets/learning-spark-v2/flights/summary-data/json/*"
 )

In [0]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Escribir DataFrames a archivos JSON

In [0]:
(df.write.format("json")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/json/df_json"))

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-447741929212383>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m (df.write.format("json")
[0m[1;32m      2[0m  [0;34m.[0m[0mmode[0m[0;34m([0m[0;34m"overwrite"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m  [0;34m.[0m[0moption[0m[0;34m([0m[0;34m"compression"[0m[0;34m,[0m [0;34m"snappy"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m  .save("/tmp/data/json/df_json"))

[0;32m/databricks/spark/python/pyspark/sql/readwriter.py[0m in [0;36msave[0;34m(self, path, format, mode, partitionBy, **options)[0m
[1;32m    738[0m             [0mself[0m[0;34m.[0m[0m_jwrite[0m[0;34m.[0m[0msave[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m    739[0m         [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m--> 740[0;31

**Opciones**

| Property name | Values | Meaning | Scope
| --- | --- | --- | --- |
| compression | none, uncompressed, bzip2, deflate, gzip, lz4, or snappy | Use this compression codec for writing. Note that read will only detect the compression or codec from the file extension.| Write |
| dateFormat | yyyy-MM-dd or DateTimeFormatter | Use this format or any format from Java’s DateTime Formatter. | Read/ write |
| multiLine | true, false | Use multiline mode. Default is false (single-line mode). | Read
| allowUnquoted FieldNames | true, false | Allow unquoted JSON field names. Default is false. | Read

### CSV

#### Leer de archivos CSV en un DataFrame

In [0]:
file = "/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*"
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"
df = (spark.read.format("csv")
 .option("header", "true")
 .schema(schema)
 .option("mode", "FAILFAST") # Exit if any errors
 .option("nullValue", "") # Replace any null data field with quotes
 .load(file))

#### Leer de archivos CSV en una tabla SQL

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING csv
 OPTIONS (
 path "/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*",
 header "true",
 inferSchema "true",
 mode "FAILFAST"
 )

In [0]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Escribir DataFrames a archivos CSV

In [0]:
df.write.format("csv").mode("overwrite").save("/tmp/data/csv/df_csv")

**Opciones**
## TABLA
Página 128

### AVRO

#### Leer de archivos AVRO en un DataFrame

In [0]:
df = (spark.read.format("avro")
 .load("/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"))
df.show(truncate=False)

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

#### Leer de archivos AVRO en una tabla SQL

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW episode_tbl
 USING avro
 OPTIONS (
 path "/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
 )

In [0]:
spark.sql("SELECT * FROM episode_tbl").show(truncate=False)

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

#### Escribir DataFrames a archivos AVRO

In [0]:
(df.write
 .format("avro")
 .mode("overwrite")
 .save("/tmp/data/avro/df_avro"))

**Opciones**

##TABLA
Página 130

### ORC

#### Leer de archivos ORC en un DataFrame

In [0]:
file = "/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
df = spark.read.format("orc").option("path", file).load()

#### Leer de archivos ORC en una tabla SQL

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING orc
 OPTIONS (
 path "/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
 )

In [0]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Escribir DataFrames a archivos ORC

In [0]:
(df.write.format("orc")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/orc/flights_orc"))

### Imágenes

#### Reading an image file into a DataFrame

In [0]:
from pyspark.ml import image
image_dir = "/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
images_df = spark.read.format("image").load(image_dir)
images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



### Binary Files

#### Reading a binary file into a DataFrame
El siguiente código lee todos los archivos JPG del directorio de entrada con cualquier directorio particionado

In [0]:
path = "/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .load(path))
binary_files_df.show(5)

+--------------------+-------------------+------+--------------------+-----+
|                path|   modificationTime|length|             content|label|
+--------------------+-------------------+------+--------------------+-----+
|dbfs:/databricks-...|2020-01-02 20:42:21| 55037|[FF D8 FF E0 00 1...|    0|
|dbfs:/databricks-...|2020-01-02 20:42:31| 54634|[FF D8 FF E0 00 1...|    1|
|dbfs:/databricks-...|2020-01-02 20:42:21| 54624|[FF D8 FF E0 00 1...|    0|
|dbfs:/databricks-...|2020-01-02 20:42:22| 54505|[FF D8 FF E0 00 1...|    0|
|dbfs:/databricks-...|2020-01-02 20:42:22| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+-------------------+------+--------------------+-----+
only showing top 5 rows



Para ignorar la detección de datos de partición en un directorio, puede configurar ```recursiveFile Lookup to "true"```:

In [0]:
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .option("recursiveFileLookup", "true")
 .load(path))
binary_files_df.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|dbfs:/databricks-...|2020-01-02 20:42:21| 55037|[FF D8 FF E0 00 1...|
|dbfs:/databricks-...|2020-01-02 20:42:31| 54634|[FF D8 FF E0 00 1...|
|dbfs:/databricks-...|2020-01-02 20:42:21| 54624|[FF D8 FF E0 00 1...|
|dbfs:/databricks-...|2020-01-02 20:42:22| 54505|[FF D8 FF E0 00 1...|
|dbfs:/databricks-...|2020-01-02 20:42:22| 54475|[FF D8 FF E0 00 1...|
+--------------------+-------------------+------+--------------------+
only showing top 5 rows



# APUNTES
Con \%fs puedo ejecutar comandos linux en el Notebook de Databricks. Se puede hacer un ls, ver directorios, crearlos y eliminarlos. En la siguiente web ```//https://docs.databricks.com/_static/notebooks/dbutils.html``` se observan todos los comandos disponibles.

**Ejemplos**

In [0]:
%fs
ls

path,name,size,modificationTime
dbfs:/Ejemplo_save_json.json/,Ejemplo_save_json.json/,0,0
dbfs:/FileStore/,FileStore/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/learning-spark-v2/,learning-spark-v2/,0,0
dbfs:/tmp/,tmp/,0,0
dbfs:/user/,user/,0,0


In [0]:
%fs
ls /learning-spark-v2/sf-fire/Ejemplo_save_json

path,name,size,modificationTime
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_SUCCESS,_SUCCESS,0,1651142577000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_committed_2238006556770501276,_committed_2238006556770501276,114,1651142577000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/_started_2238006556770501276,_started_2238006556770501276,0,1651142576000
dbfs:/learning-spark-v2/sf-fire/Ejemplo_save_json/part-00000-tid-2238006556770501276-4704d525-889b-423b-8d58-a96e6976647e-280-1-c000.json,part-00000-tid-2238006556770501276-4704d525-889b-423b-8d58-a96e6976647e-280-1-c000.json,21353,1651142576000


In [0]:
%fs
rm -r '/"'