# Capítulo 3: DataFrame API

## Creando un RDD

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
# Create a DataFrame using SparkSession
spark = (SparkSession
 .builder
 .appName("AuthorsAges")
 .getOrCreate())
# Create a DataFrame 
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30),
 ("TD", 35), ("Brooke", 25)], ["name", "age"])
# Group the same names together, aggregate their ages, and compute an average
avg_df = data_df.groupBy("name").agg(avg("age"))
# Show the results of the final execution
avg_df.show() #Calculo la media de las edades de las personas que se llaman igual

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



## Formas de definir un esquema

- Programmatically

In [2]:
from pyspark.sql.types import *
schema = StructType([StructField("author", StringType(), False),
 StructField("title", StringType(), False),
 StructField("pages", IntegerType(), False)])

- Con DDL (Data Definition Language): más simple

In [3]:
schema = "author STRING, title STRING, pages INT"

- Utilizando los dos **Esto no lo acabo de entender**

In [2]:
from pyspark.sql import SparkSession
# Define schema for our data using DDL 
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
# Create our static data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter",
"LinkedIn"]],
 [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
 [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
 [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
["twitter", "FB"]],
 [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
 [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
["twitter", "LinkedIn"]]
 ]
# Main program
if __name__ == "__main__":
 # Create a SparkSession
 spark = (SparkSession
 .builder
 .appName("Example-3_6")
 .getOrCreate())
 # Create a DataFrame using the schema defined above
 blogs_df = spark.createDataFrame(data, schema)
 # Show the DataFrame; it should reflect our table above
 blogs_df.show()
 # Print the schema used by Spark to process the DataFrame
 print(blogs_df.printSchema())

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (

**Para ver el esquema de un DF**

In [6]:
blogs_df.schema

StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(Url,StringType,true),StructField(Published,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaigns,ArrayType(StringType,true),true)))

## Leyendo JSON 

**Traducción de Scala (no sé si está bien)** 

In [10]:
def main(args):
 # Create a SparkSession
 spark = (SparkSession
 .builder
 .appName("Example-3_7")
 .getOrCreate())
 if args.length <= 0:
    print("usage Example3_7 <file path to blogs.json>")
    #break
 #  Get the path to the JSON file
 jsonFile = args(0)
 # Definir el esquema de forma programática
 schema = StructType([StructField("Id", IntegerType(), False),
StructField("First", StringType(), False),
 StructField("Last", StringType(), False),
 StructField("Url", StringType(), False),
 StructField("Published", StringType(), False),
 StructField("Hits", IntegerType(), False),
 StructField("Campaigns", StringType(), False)])

 # Create a DataFrame by reading from the JSON file
 jsonFile
 blogs_DF = spark.read.schema(schema).json(jsonFile)
 # Show the DataFrame; it should reflect our table above
 blogs_DF.show(False)
 # Print the schema used by Spark to process the DataFrame
 print(blogs_DF.printSchema())
 print(blogs_DF.schema())

## Columnas

In [11]:
blogs_df.columns

['Id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

## Filas

In [13]:
from pyspark.sql import Row
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015",
 ["twitter", "LinkedIn"])
# access using index for individual items
blog_row[1]

'Reynold'

### Las filas se pueden crear utilizando DataFrames

In [14]:
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, ["Authors", "State"])
authors_df.show()

+-------------+-----+
|      Authors|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



## Leer y escribir DataFrames

### Leer

#### Si no especificas esquema, Spark lo predice

In [19]:
sampleDF = (spark
 .read
 .option("samplingRatio", 0.001)
 .option("header", True)
 .csv('C:\\Users\\nora.hafidi\\Desktop\\Big Data\\sf-fire-calls.csv'))
#Con samplingRatio se infiere el esquema
sampleDF.schema

StructType(List(StructField(CallNumber,StringType,true),StructField(UnitID,StringType,true),StructField(IncidentNumber,StringType,true),StructField(CallType,StringType,true),StructField(CallDate,StringType,true),StructField(WatchDate,StringType,true),StructField(CallFinalDisposition,StringType,true),StructField(AvailableDtTm,StringType,true),StructField(Address,StringType,true),StructField(City,StringType,true),StructField(Zipcode,StringType,true),StructField(Battalion,StringType,true),StructField(StationArea,StringType,true),StructField(Box,StringType,true),StructField(OriginalPriority,StringType,true),StructField(Priority,StringType,true),StructField(FinalPriority,StringType,true),StructField(ALSUnit,StringType,true),StructField(CallTypeGroup,StringType,true),StructField(NumAlarms,StringType,true),StructField(UnitType,StringType,true),StructField(UnitSequenceInCallDispatch,StringType,true),StructField(FirePreventionDistrict,StringType,true),StructField(SupervisorDistrict,StringType,t

### Leyendo datos del departamento de bomberos

In [1]:
# In Python, define a schema 
from pyspark.sql.types import *
# Programmatic way to define a schema 
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
 StructField('UnitID', StringType(), True),
 StructField('IncidentNumber', IntegerType(), True),
 StructField('CallType', StringType(), True), 
 StructField('CallDate', StringType(), True), 
 StructField('WatchDate', StringType(), True),
 StructField('CallFinalDisposition', StringType(), True),
 StructField('AvailableDtTm', StringType(), True),
 StructField('Address', StringType(), True), 
 StructField('City', StringType(), True), 
 StructField('Zipcode', IntegerType(), True), 
 StructField('Battalion', StringType(), True), 
 StructField('StationArea', StringType(), True), 
 StructField('Box', StringType(), True), 
 StructField('OriginalPriority', StringType(), True), 
 StructField('Priority', StringType(), True), 
 StructField('FinalPriority', IntegerType(), True), 
 StructField('ALSUnit', BooleanType(), True), 
 StructField('CallTypeGroup', StringType(), True),
 StructField('NumAlarms', IntegerType(), True),
 StructField('UnitType', StringType(), True),
 StructField('UnitSequenceInCallDispatch', IntegerType(), True),
 StructField('FirePreventionDistrict', StringType(), True),
 StructField('SupervisorDistrict', StringType(), True),
 StructField('Neighborhood', StringType(), True),
 StructField('Location', StringType(), True),
 StructField('RowID', StringType(), True),
 StructField('Delay', FloatType(), True)])
# Use the DataFrameReader interface to read a CSV file
sf_fire_file = 'C:\\Users\\nora.hafidi\\Desktop\\Big Data\\sf-fire-calls.csv'
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)

In [30]:
fire_df.head()

Row(CallNumber=20110016, UnitID='T13', IncidentNumber=2003235, CallType='Structure Fire', CallDate='01/11/2002', WatchDate='01/10/2002', CallFinalDisposition='Other', AvailableDtTm='01/11/2002 01:51:44 AM', Address='2000 Block of CALIFORNIA ST', City='SF', Zipcode=94109, Battalion='B04', StationArea='38', Box='3362', OriginalPriority='3', Priority='3', FinalPriority=3, ALSUnit=False, CallTypeGroup=None, NumAlarms=1, UnitType='TRUCK', UnitSequenceInCallDispatch=2, FirePreventionDistrict='4', SupervisorDistrict='5', Neighborhood='Pacific Heights', Location='(37.7895840679362, -122.428071912459)', RowID='020110016-T13', Delay=2.950000047683716)

#### Consultando datos del departamento de bomberos

In [3]:
from pyspark.sql.functions import *
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
 .select("ResponseDelayedinMins")
 .where(col("ResponseDelayedinMins") > 5)
 .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



#### Eliminando y añadiendo columnas

In [4]:
# In Python
fire_ts_df = (new_fire_df
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))#Creo una columna con el formato que quiero
 .drop("CallDate") #Elimino la anterior columna
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm"))
# Select the converted columns
(fire_ts_df
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, False))


+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



### Escribir

**Se guarda el DataFrame en formato Parquet por defecto**

In [7]:
"""parquet_table = 'Tabla1'
fire_df.write.format("parquet").saveAsTable(parquet_table)"""


"""(fire_df.write
    .format("parquet")
    .saveAsTable("tbl_nm"))"""

'(fire_df.write\n    .format("parquet")\n    .saveAsTable("tbl_nm"))'

## Operaciones y funciones varias (con consultas)

- disctinct()

In [15]:
(fire_ts_df
 .select(year('IncidentDate'))
 .distinct()
 .orderBy(year('IncidentDate'))
 .show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



- groupBy()

In [16]:
(fire_ts_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .groupBy("CallType")
 .count()
 .show(n=10, truncate=False))

+-----------------------------------+-----+
|CallType                           |count|
+-----------------------------------+-----+
|Elevator / Escalator Rescue        |453  |
|Marine Fire                        |14   |
|Aircraft Emergency                 |36   |
|Confined Space / Structure Collapse|13   |
|Administrative                     |3    |
|Alarms                             |19406|
|Odor (Strange / Unknown)           |490  |
|Citizen Assist / Service Call      |2524 |
|HazMat                             |124  |
|Watercraft in Distress             |28   |
+-----------------------------------+-----+
only showing top 10 rows



- orderBy()

In [17]:
(fire_ts_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .groupBy("CallType")
 .count()
 .orderBy("count", ascending=False)
 .show(n=10, truncate=False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



- max(), min(), avg(), sum()

In [23]:
import pyspark.sql.functions
(fire_ts_df
 .select(sum("NumAlarms"), avg("ResponseDelayedinMins"),
 min("ResponseDelayedinMins"), max("ResponseDelayedinMins"))
 .show())

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



- stat(), describe(), correlation(), covariance(), sampleBy(), approxQuantile(), frequentItems()

In [31]:
print(fire_ts_df
         .corr("CallNumber", 'Zipcode'))

0.010848528180278628


## Ejercicios Departamento de bomberos

- ¿Cuáles fueron todos los diferentes tipos de llamadas de emergencia en 2018?
- ¿Qué meses del año 2018 vieron la mayor cantidad de llamadas de emergencia?
- ¿Qué vecindario en San Francisco generó la mayor cantidad de llamadas de emergencia en 2018?
- ¿Qué vecindarios tuvieron los peores tiempos de respuesta a las llamadas de incendios en 2018?
- ¿Qué semana del año en 2018 tuvo la mayor cantidad de llamadas de emergencia?
- ¿Existe una correlación entre el vecindario, el código postal y el número de llamadas de bomberos?
- ¿Cómo podemos usar archivos Parquet o tablas SQL para almacenar estos datos y volver a leerlos?

**Columnas**

In [13]:
fire_ts_df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallFinalDisposition',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'ResponseDelayedinMins',
 'IncidentDate',
 'OnWatchDate',
 'AvailableDtTS']

**Pregunta 1**

¿Cuáles fueron todos los diferentes tipos de llamadas de emergencia en 2018?

In [14]:
preg_1 = (fire_ts_df
                   .select("CallType")
                   .where((year(fire_ts_df.IncidentDate) == "2018") )
                   .distinct()
                   .show())

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|           Explosion|
|        Vehicle Fire|
|  Suspicious Package|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
|   Electrical Hazard|
|      Structure Fire|
|    Medical Incident|
|          Fuel Spill|
|Smoke Investigati...|
|Train / Rail Inci...|
+--------------------+



**Pregunta 2**

¿Qué meses del año 2018 vieron la mayor cantidad de llamadas de emergencia?

In [90]:
preg_2 = (fire_ts_df
                   .where((year(fire_ts_df.IncidentDate) == "2018") )
                   .groupBy(month(fire_ts_df.IncidentDate)).count()
                   .orderBy("count", ascending=False).show(5))



+-------------------+-----+
|month(IncidentDate)|count|
+-------------------+-----+
|                 10| 1068|
|                  5| 1047|
|                  3| 1029|
|                  8| 1021|
|                  1| 1007|
+-------------------+-----+
only showing top 5 rows



**Pregunta 3**

¿Qué vecindario en San Francisco generó la mayor cantidad de llamadas de emergencia en 2018?

In [87]:
preg_3 = (fire_ts_df
                   .where((year(fire_ts_df.IncidentDate) == "2018") )
                   .groupBy("Neighborhood").count()
                   .orderBy("count", ascending=False).show(1))

+------------+-----+
|Neighborhood|count|
+------------+-----+
|  Tenderloin| 1393|
+------------+-----+
only showing top 1 row



**Pregunta 4**

¿Qué vecindarios tuvieron los peores tiempos de respuesta a las llamadas de incendios en 2018?

In [89]:
preg_4 = (fire_ts_df
                   .where((year(fire_ts_df.IncidentDate) == "2018") )
                   .groupBy("Neighborhood")
                   .agg(avg("ResponseDelayedinMins").alias("MediaTiempoRespuesta"))
                   .orderBy("MediaTiempoRespuesta", ascending=False).show(3))

+---------------+--------------------+
|   Neighborhood|MediaTiempoRespuesta|
+---------------+--------------------+
|      Chinatown|   6.190314101143033|
|       Presidio|   5.829227011272873|
|Treasure Island|   5.453703684111436|
+---------------+--------------------+
only showing top 3 rows



**Pregunta 5**

¿Qué semana del año en 2018 tuvo la mayor cantidad de llamadas de emergencia?

In [95]:
preg_5 = (fire_ts_df
                   .where((year(fire_ts_df.IncidentDate) == "2018") )
                   .groupBy(weekofyear(fire_ts_df.IncidentDate)).count()
                   .orderBy("count", ascending=False).show(1))

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|                      22|  259|
+------------------------+-----+
only showing top 1 row



**Pregunta 6**

¿Existe una correlación entre el vecindario, el código postal y el número de llamadas de bomberos?

In [10]:
preg_6 = (fire_ts_df
                   .groupBy('Neighborhood', 'Zipcode').count()
                   .corr("count", 'Zipcode'))

In [11]:
print(preg_6)

0.06771510346381854


**Pregunta 7**

¿Cómo podemos usar archivos Parquet o tablas SQL para almacenar estos datos y volver a leerlos?

## Dataset API: Leyendo archivo JSON

In [39]:
ds = (spark.read
.json("C:\\Users\\nora.hafidi\\Desktop\\Big Data\\iot_devices.json"))
ds.show(2)

+-------------+---------+----+----+-------------+---------+--------------------+--------+-------------+--------+-----+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|           cn|device_id|         device_name|humidity|           ip|latitude|  lcd|longitude|  scale|temp|    timestamp|
+-------------+---------+----+----+-------------+---------+--------------------+--------+-------------+--------+-----+---------+-------+----+-------------+
|            8|      868|  US| USA|United States|        1|meter-gauge-1xbYRYcj|      51| 68.161.225.1|    38.0|green|    -97.0|Celsius|  34|1458444054093|
|            7|     1473|  NO| NOR|       Norway|        2|   sensor-pad-2n2Pea|      70|213.161.254.1|   62.47|  red|     6.15|Celsius|  11|1458444054119|
+-------------+---------+----+----+-------------+---------+--------------------+--------+-------------+--------+-----+---------+-------+----+-------------+
only showing top 2 rows



### Operaciones con el conjunto de datos

Los Datasets no son soportados por Python por lo que los demás ejercicios hay que hacerlos en Scala.