In [14]:
from __future__ import print_function

import sys

from pyspark.sql import SparkSession


spark = (SparkSession
        .builder
        .appName("PythonMnMCount")
        .getOrCreate())
    # get the M&M data set file name
mnm_file = "C:/Users/xabier.jimenez/Documents/data/mnm_dataset.csv"
    # read the file into a Spark DataFrame
mnm_df = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(mnm_file))
mnm_df.show(n=5, truncate=False)

    # aggregate count of all colors and groupBy state and color
    # orderBy descending order
count_mnm_df = (mnm_df.select("State", "Color", "Count")
                    .groupBy("State", "Color")
                    .sum("Count")
                    .orderBy("sum(Count)", ascending=False))

    # show all the resulting aggregation for all the dates and colors
count_mnm_df.show(n=60, truncate=False)
print("Total Rows = %d" % (count_mnm_df.count()))

    # find the aggregate count for California by filtering
ca_count_mnm_df = (mnm_df.select("*")
                       .where(mnm_df.State == 'CA')
                       .groupBy("State", "Color")
                       .sum("Count")
                       .orderBy("sum(Count)", ascending=False))
    
    # show the resulting aggregation for California
ca_count_mnm_df.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
+-----+------+-----+
only showing top 5 rows

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|WA   |Green |96486     |
|CA   |Brown |95762     |
|TX   |Green |95753     |
|TX   |Red   |95404     |
|CO   |Yellow|95038     |
|NM   |Red   |94699     |
|OR   |Orange|94514     |
|WY   |Green |94339     |
|NV   |Orange|93929     |
|TX   |Yellow|93819     |
|CO   |Green |93724     |
|CO   |Brown |93692     |
|CA   |Green |93505     |
|NM   |Brown |93447     |
|CO   |Blue  |93412     |
|WA   |Red   |93332     |
|WA   |Brown |93082     |
|WA   |Yellow|92920     |
|NM   |Yellow|92747     |
|NV   |Brown |92478     |
|TX   |Orange|92315     |
|AZ   |Brown |92287     |
|AZ   |Green |91882     |
|WY   |Red   |91768     |
|AZ   |Orange|91684     |
|CA   |Red   |91527     |
|WA   

Del ejercicio de M&M aplicar: 
i. Otras operaciones de agregación como el Max con otro tipo de 
ordenamiento (descendiente)

In [22]:
max_count_mnm_df = (mnm_df.select("*")
                       .where(mnm_df.State == 'CA')
                       .groupBy("State", "Color")
                       .max("Count")
                       .orderBy("max(Count)", descending=True))
max_count_mnm_df.show(n=10, truncate=False)

+-----+------+----------+
|State|Color |max(Count)|
+-----+------+----------+
|CA   |Blue  |100       |
|CA   |Red   |100       |
|CA   |Brown |100       |
|CA   |Yellow|100       |
|CA   |Orange|100       |
|CA   |Green |100       |
+-----+------+----------+



ii. hacer un ejercicio como el “where” de CA que aparece en el libro pero 
indicando más opciones de estados (p.e. NV, TX, CA, CO).

In [23]:
nv_count_mnm_df = (mnm_df.select("*")
                       .where(mnm_df.State == 'NV')
                       .groupBy("State", "Color")
                       .sum("Count")
                       .orderBy("sum(Count)", ascending=False))
nv_count_mnm_df.show(n=10, truncate=False)

tx_count_mnm_df = (mnm_df.select("*")
                       .where(mnm_df.State == 'TX')
                       .groupBy("State", "Color")
                       .sum("Count")
                       .orderBy("sum(Count)", ascending=False))
tx_count_mnm_df.show(n=10, truncate=False)

co_count_mnm_df = (mnm_df.select("*")
                       .where(mnm_df.State == 'CO')
                       .groupBy("State", "Color")
                       .sum("Count")
                       .orderBy("sum(Count)", ascending=False))
co_count_mnm_df.show(n=10, truncate=False)

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|NV   |Orange|93929     |
|NV   |Brown |92478     |
|NV   |Yellow|91390     |
|NV   |Green |91331     |
|NV   |Blue  |90003     |
|NV   |Red   |89346     |
+-----+------+----------+

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|TX   |Green |95753     |
|TX   |Red   |95404     |
|TX   |Yellow|93819     |
|TX   |Orange|92315     |
|TX   |Brown |90736     |
|TX   |Blue  |88466     |
+-----+------+----------+

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CO   |Yellow|95038     |
|CO   |Green |93724     |
|CO   |Brown |93692     |
|CO   |Blue  |93412     |
|CO   |Orange|90971     |
|CO   |Red   |89465     |
+-----+------+----------+



iii. Hacer un ejercicio donde se calculen en una misma operación el Max, 
Min, Avg, Count. Revisar el API (documentación) donde encontrarán 
este ejemplo:
ds.agg(max($"age"), avg($"salary"))
ds.groupBy().agg(max($"age"), avg($"salary"))
NOTA: $ es un alias de col()

iv. Hacer también ejercicios en SQL creando tmpVie

In [42]:
mnm_df.createOrReplaceTempView("mnm_dftemp")

spark.sql("""SELECT * 
FROM mnm_dftemp 
WHERE Color='Blue'""").show(10)

+-----+-----+-----+
|State|Color|Count|
+-----+-----+-----+
|   NV| Blue|   66|
|   CO| Blue|   79|
|   OR| Blue|   71|
|   WY| Blue|   16|
|   AZ| Blue|   75|
|   CO| Blue|   52|
|   CO| Blue|   95|
|   CO| Blue|   98|
|   CA| Blue|   13|
|   NV| Blue|   50|
+-----+-----+-----+
only showing top 10 rows



# Don quijote

a. Descargar el Quijote 
https://gist.github.com/jsdario/6d6c69398cb0c73111e49f1218960f79
Aplicar no solo count (para obtener el número de líneas) y show sino probar 
distintas sobrecargas del método show (con/sin truncate, indicando/sin indicar 
num de filas, etc) así como también los métodos, head, take, first (diferencias 
entre estos 3?

In [7]:
from __future__ import print_function

import sys

from pyspark.sql import SparkSession


spark = (SparkSession
        .builder
        .appName("PythonQuijoteCount")
        .getOrCreate())
quijote_file = "C:/Users/xabier.jimenez/Documents/data/el_quijote.txt"
    # read the file into a Spark DataFrame
quijote_df = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(quijote_file))
quijote_df.show(n=5, truncate=False)

quijote_df.show(n=5, truncate=True)

quijote_df.show(n=5, truncate=False,vertical=True)

quijote_df.show()

+--------------------------------------------------------------------------------------------------+
|DON QUIJOTE DE LA MANCHA                                                                          |
+--------------------------------------------------------------------------------------------------+
|Miguel de Cervantes Saavedra                                                                      |
|PRIMERA PARTE                                                                                     |
|CAPÍTULO 1: Que trata de la condición y ejercicio del famoso hidalgo D. Quijote de la Mancha    |
|En un lugar de la Mancha                                                                          |
|Tuvo muchas veces competencia con el cura de su lugar (que era hombre docto graduado en Sigüenza)|
+--------------------------------------------------------------------------------------------------+
only showing top 5 rows

+------------------------+
|DON QUIJOTE DE LA MANCHA|
+-----------

In [8]:
print("Total Rows = %d" % (quijote_df.count()))

Total Rows = 2184


In [9]:
quijote_df.head() #Enseña la cabecera

Row(DON QUIJOTE DE LA MANCHA='Miguel de Cervantes Saavedra')

In [10]:
quijote_df.first() #Enseña la primera linea

Row(DON QUIJOTE DE LA MANCHA='Miguel de Cervantes Saavedra')

In [12]:
quijote_df.take(4) #Enseña las primeras n lineas

[Row(DON QUIJOTE DE LA MANCHA='Miguel de Cervantes Saavedra'),
 Row(DON QUIJOTE DE LA MANCHA='PRIMERA PARTE'),
 Row(DON QUIJOTE DE LA MANCHA='CAPÍTULO 1: Que trata de la condición y ejercicio del famoso hidalgo D. Quijote de la Mancha'),
 Row(DON QUIJOTE DE LA MANCHA='En un lugar de la Mancha')]

# Capítulo 3

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
# Create a DataFrame using SparkSession
spark = (SparkSession
 .builder
 .appName("AuthorsAges")
 .getOrCreate())
# Create a DataFrame 
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30),("TD", 35), ("Brooke", 25)], ["name", "age"])
# Group the same names together, aggregate their ages, and compute an average
avg_df = data_df.groupBy("name").agg(avg("age"))
# Show the results of the final execution
avg_df.show(n=2,truncate=False)



In [None]:
from pyspark.sql.types import *
schema = StructType([StructField("author", StringType(), False),
 StructField("title", StringType(), False),
 StructField("pages", IntegerType(), False)])
spark = (SparkSession
 .builder
 .appName("Example-3_6")
 .getOrCreate())
from pyspark.sql import SparkSession
# Define schema for our data using DDL 
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
# Create our static data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter","LinkedIn"]],
 [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
 [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
 [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
["twitter", "FB"]],
 [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
 [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
["twitter", "LinkedIn"]]
 ]


 # Create a DataFrame using the schema defined above
blogs_df = spark.createDataFrame(data, schema)
 # Show the DataFrame; it should reflect our table above
blogs_df.show()
 # Print the schema used by Spark to process the DataFrame
print(blogs_df.printSchema())

In [None]:
from pyspark.sql import Row
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015",
 ["twitter", "LinkedIn"])
# access using index for individual items
blog_row[1]

In [None]:
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, ["Authors", "State"])
authors_df.show()

In [None]:
from pyspark.sql.types import *
# Programmatic way to define a schema 
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
 StructField('UnitID', StringType(), True),
 StructField('IncidentNumber', IntegerType(), True),
 StructField('CallType', StringType(), True), 
 StructField('CallDate', StringType(), True), 
 StructField('WatchDate', StringType(), True),
 StructField('CallFinalDisposition', StringType(), True),
 StructField('AvailableDtTm', StringType(), True),
 StructField('Address', StringType(), True), 
 StructField('City', StringType(), True), 
 StructField('Zipcode', IntegerType(), True), 
 StructField('Battalion', StringType(), True), 
 StructField('StationArea', StringType(), True), 
 StructField('Box', StringType(), True), 
 StructField('OriginalPriority', StringType(), True), 
 StructField('Priority', StringType(), True), 
 StructField('FinalPriority', IntegerType(), True), 
 StructField('ALSUnit', BooleanType(), True), 
 StructField('CallTypeGroup', StringType(), True),
 StructField('NumAlarms', IntegerType(), True),
 StructField('UnitType', StringType(), True),
 StructField('UnitSequenceInCallDispatch', IntegerType(), True),
 StructField('FirePreventionDistrict', StringType(), True),
 StructField('SupervisorDistrict', StringType(), True),
 StructField('Neighborhood', StringType(), True),
 StructField('Location', StringType(), True),
 StructField('RowID', StringType(), True),
 StructField('Delay', FloatType(), True)])
# Use the DataFrameReader interface to read a CSV file
sf_fire_file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/chapter3/data/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)


In [None]:
parquet_path = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/chapter3/parquet/fire_df.parquet2"
fire_df.write.mode("overwrite").format("parquet").save(parquet_path)
parquet_table = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/chapter3/parquet/fire_df.parquet2"
fire_df.write.mode("overwrite").format("parquet").save(parquet_path)

In [None]:
from pyspark.sql import functions as F
few_fire_df = (fire_df
 .select("IncidentNumber", "AvailableDtTm", "CallType")
 .where(F.col("CallType") != "Medical Incident"))
few_fire_df.show(5, truncate=False)

from pyspark.sql.functions import *
(fire_df
  .select("CallType")
 .where(col("CallType").isNotNull())
 .agg(countDistinct("CallType").alias("DistinctCallTypes"))
 .show())

In [None]:
(fire_df.select("CallType")
 .where(col("CallType").isNotNull())
 .distinct().count())

In [None]:
(fire_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .distinct()
 .show(10, False))


In [None]:
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
 .select("ResponseDelayedinMins")
 .where(col("ResponseDelayedinMins") > 5)
 .show(5, False))


In [None]:
fire_ts_df = (new_fire_df
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm"))
# Select the converted columns
(fire_ts_df
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, False))

In [None]:
(fire_ts_df
 .select(year('IncidentDate'))
 .distinct()
 .orderBy(year('IncidentDate'))
 .show())

In [None]:
(fire_ts_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .groupBy("CallType")
 .count()
 .orderBy("count", ascending=False)
 .show(n=10, truncate=False))

In [None]:
import pyspark.sql.functions as F
(fire_ts_df
 .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
 .show())

In [None]:
from pyspark.sql import Row
row = Row(350, True, "Learning Spark 2E", None)
row[0]

In [None]:
count_mnm_df = (mnm_df
 .select("State", "Color", "Count")
 .groupBy("State", "Color")
 .agg(count("Count")
 .alias("Total"))
 .orderBy("Total", ascending=False))
count_mnm_df.explain(True)

# Ejercicios capítulo 3

Leer el CSV del ejemplo del cap2 y obtener la estructura del schema dado por 
defecto.

In [43]:
mnm_file = "C:/Users/xabier.jimenez/Documents/data/mnm_dataset.csv"
    # read the file into a Spark DataFrame
mnm_df = (spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(mnm_file))
mnm_df.show(n=5, truncate=False)

+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
+-----+------+-----+
only showing top 5 rows



Cuando se define un schema al definir un campo por ejemplo StructField('Delay', 
FloatType(), True) ¿qué significa el último parámetro Boolean?

La posibilidad de contener nulos

# Capítulo 4

In [1]:
from pyspark.sql import SparkSession 
# Create a SparkSession
spark = (SparkSession
 .builder
 .enableHiveSupport()
 .appName("SparkSQLExampleApp")
 .getOrCreate())
csv_file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Read and create a temporary view
# Infer schema (note that for larger files you 
# may want to specify the schema)
df = (spark.read.format("csv")
 .option("inferSchema", "true")
 .option("header", "true")
 .load(csv_file))
df.createOrReplaceTempView("us_delay_flights_tbl")

In [None]:
spark.stop()

In [None]:
schema = "`date` STRING, `delay` INT, `distance` INT, `origin` STRING, `destination` STRING"

In [None]:
spark.sql("""SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC""").show(10)

In [None]:
spark.sql("""SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC""").show(10)


In [None]:
spark.sql("""SELECT delay, origin, destination,
 CASE
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
 WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
 WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)


In [None]:
from pyspark.sql.functions import col, desc
(df.select("distance", "origin", "destination")
 .where(col("distance") > 1000)
 .orderBy(desc("distance"))).show(10)

In [None]:
(df.select("distance", "origin", "destination")
 .where("distance > 1000")
 .orderBy("distance", ascending=False).show(10))

In [20]:
spark.sql("CREATE DATABASE learn_spark_db")
spark.sql("USE learn_spark_db")

DataFrame[]

In [22]:
spark.sql("CREATE TABLE managed_us_delay_flights_tbl3 (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

DataFrame[]

In [23]:
csv_file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)
flights_df.write.saveAsTable("managed_us_delay_flights_tbl")

AnalysisException: Can not create the managed table('`managed_us_delay_flights_tbl`'). The associated location('file:/C:/Users/xabier.jimenez/spark-warehouse/learn_spark_db.db/managed_us_delay_flights_tbl') already exists.

In [30]:
spark.sql("show databases")

DataFrame[namespace: string]

In [26]:
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, 
 distance INT, origin STRING, destination STRING) 
 USING csv OPTIONS (PATH 
 'C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")

DataFrame[]

In [None]:
(flights_df
 .write
 .option("path", "/tmp/data/us_flights_delay")
 .saveAsTable("us_delay_flights_tbl2"))

In [None]:
df_sfo = spark.sql("SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'SFO'")
df_jfk = spark.sql("SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'JFK'")
#Create a temporary and global temporary view
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")
spark.sql("SELECT * FROM us_origin_airport_JFK_tmp_view")

In [None]:
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")
spark.catalog.dropTempView("us_origin_airport_JFK_tmp_view")

In [31]:
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/C:/Users/xabier.jimenez/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/C:/Users/xabier.jimenez/spark-warehouse/learn_spark_db.db')]

In [32]:
spark.catalog.listTables()

[Table(name='managed_us_delay_flights_tbl3', database='learn_spark_db', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='airports_na', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='bar', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='departuredelays', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='foo', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='tc', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='us_delay_flights_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [33]:
spark.catalog.listColumns("us_delay_flights_tbl")

[Column(name='date', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

In [None]:
us_flights_df = spark.sql("SELECT * FROM us_delay_flights_tbl")
us_flights_df2 = spark.table("us_delay_flights_tbl")

## Data Sources for DataFrames and SQL Tables

In [None]:
file = """C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet/"""
df = spark.read.format("parquet").load(file)

In [None]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show()

In [None]:
(df.write.format("parquet")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/parquet/df_parquet"))

(df.write
 .mode("overwrite")
 .saveAsTable("us_delay_flights_tbl"))

# Json

In [None]:
file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/json/*"
df = spark.read.format("json").load(file)

In [None]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show()

In [None]:
(df.write.json("C:/tmp/data/json/df_json3"))

# Csv

In [None]:
file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*"
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"
df = (spark.read.format("csv")
 .option("header", "true")
 .schema(schema)
 .option("mode", "FAILFAST") # Exit if any errors
 .option("nullValue", "") # Replace any null data field with quotes
 .load(file))

In [None]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10)

In [None]:
df.write.format("csv").mode("overwrite").save("/tmp/data/csv/df_csv")

# avro

In [None]:
df = (spark.read.format("avro")
 .load("C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"))
df.show(truncate=False)

In [None]:
spark.sql("SELECT * FROM episode_tbl").show(truncate=False)

In [None]:
(df.write
 .format("avro")
 .mode("overwrite")
 .save("/tmp/data/avro/df_avro"))


# orc

In [None]:
file = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
df = spark.read.format("orc").option("path", file).load()
df.show(10, False)

In [None]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show()

In [None]:
(df.write.format("orc")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/orc/flights_orc"))

# image

In [None]:
from pyspark.ml import image
image_dir = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
images_df = spark.read.format("image").load(image_dir)
images_df.printSchema()
images_df.select("image.height", "image.width", "image.nChannels", "image.mode",
 "label").show(5, truncate=False)

# Binary files

In [None]:
path = "C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .load(path))
binary_files_df.show(5)

In [None]:
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .option("recursiveFileLookup", "true")
 .load(path))
binary_files_df.show(5)

# Ejercicios capítulo 4

GlobalTempView vs TempView

GlobalTempView es una temp view que se puede utilizar en todas las sesiones de Spark, mientras que TempView solo se puede usar en una sesion

# Capítulo 5

In [None]:
from pyspark.sql.types import LongType
# Create cubed function
def cubed(s):
 return s * s * s
# Register UDF
spark.udf.register("cubed", cubed, LongType())
# Generate temporary view
spark.range(1, 9).createOrReplaceTempView("udf_test")
spark.sql("SELECT id, cubed(id) AS id_cubed FROM udf_test").show()

In [None]:
#pip install pyarrow
import pandas as pd
# Import various pyspark SQL functions including pandas_udf
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType
# Declare the cubed function 
def cubed(a: pd.Series) -> pd.Series:
    return a * a * a
# Create the pandas UDF for the cubed function 
cubed_udf = pandas_udf(cubed, returnType=LongType())

x= pd.Series([1, 2, 3])
# The function for a pandas_udf executed with local Pandas data
print(cubed(x))

In [None]:
df = spark.range(1, 4)
# Execute function as a Spark vectorized UDF
df.select("id", cubed_udf(col("id"))).show()

# Higher-Order Functions in DataFrames and Spark SQL

In [3]:
from pyspark.sql.types import *
schema = StructType([StructField("celsius", ArrayType(IntegerType()))])
t_list = [[35, 36, 32, 30, 40, 42, 38]], [[31, 32, 34, 55, 56]]
t_c = spark.createDataFrame(t_list, schema)
t_c.createOrReplaceTempView("tC")
# Show the DataFrame
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



In [8]:
spark.sql("""
SELECT celsius,
transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit 
 FROM tC
""").show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



In [9]:
spark.sql("""
SELECT celsius, 
 filter(celsius, t -> t > 38) as high 
 FROM tC
""").show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



In [10]:
# is there a temperature >38
spark.sql("""
SELECT celsius, 
 exists(celsius, t -> t = 38) as threshold
 FROM tC
""").show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



In [11]:
#Calculate average temperature and convert to F
spark.sql("""
SELECT celsius, 
 reduce(
 celsius, 
 0, 
 (t, acc) -> t + acc, 
 acc -> (acc div size(celsius) * 9 div 5) + 32
 ) as avgFahrenheit 
 FROM tC
""").show()

AnalysisException: Undefined function: 'reduce'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 3 pos 1

In [12]:
from pyspark.sql.functions import expr
tripdelaysFilePath ="C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
airportsnaFilePath ="C:/Users/xabier.jimenez/Documents/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt"
 
#Obtain airports data set
airportsna = (spark.read
 .format("csv")
 .options(header="true", inferSchema="true", sep="\t")
 .load(airportsnaFilePath))
airportsna.createOrReplaceTempView("airports_na")
# Obtain departure delays data set
departureDelays = (spark.read
 .format("csv")
 .options(header="true")
 .load(tripdelaysFilePath))
departureDelays = (departureDelays
 .withColumn("delay", expr("CAST(delay as INT) as delay"))
 .withColumn("distance", expr("CAST(distance as INT) as distance")))
departureDelays.createOrReplaceTempView("departureDelays")
# Create temporary small table
foo = (departureDelays
 .filter(expr("""origin == 'SEA' and destination == 'SFO' and 
 date like '01010%' and delay > 0""")))
foo.createOrReplaceTempView("foo")


In [13]:
spark.sql("SELECT * FROM airports_na LIMIT 10").show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
+-----------+-----+-------+----+



In [14]:
spark.sql("SELECT * FROM departureDelays LIMIT 10").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+



In [15]:
spark.sql("SELECT * FROM foo").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [16]:
bar = departureDelays.union(foo)
bar.createOrReplaceTempView("bar")
# Show the union (filtering for SEA and SFO in a specific time range)
bar.filter(expr("""origin == 'SEA' AND destination == 'SFO' AND date LIKE '01010%' AND delay > 0""")).show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [17]:
foo.join(
airportsna,
airportsna.IATA == foo.origin
).select("City", "State", "date", "delay", "distance", "destination").show()

+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



In [37]:
from pyspark.sql.functions import expr
foo2 = (foo.withColumn(
 "status",
 expr("CASE WHEN delay <= 10 THEN 'On-time' ELSE 'Delayed' END")
 ))
foo2.show()

+--------+-----+--------+------+-----------+-------+
|    date|delay|distance|origin|destination| status|
+--------+-----+--------+------+-----------+-------+
|01010710|   31|     590|   SEA|        SFO|Delayed|
|01010955|  104|     590|   SEA|        SFO|Delayed|
|01010730|    5|     590|   SEA|        SFO|On-time|
+--------+-----+--------+------+-----------+-------+



In [38]:
foo3 = foo2.drop("delay")
foo3.show()

+--------+--------+------+-----------+-------+
|    date|distance|origin|destination| status|
+--------+--------+------+-----------+-------+
|01010710|     590|   SEA|        SFO|Delayed|
|01010955|     590|   SEA|        SFO|Delayed|
|01010730|     590|   SEA|        SFO|On-time|
+--------+--------+------+-----------+-------+



In [39]:
foo4 = foo3.withColumnRenamed("status", "flight_status")
foo4.show()

+--------+--------+------+-----------+-------------+
|    date|distance|origin|destination|flight_status|
+--------+--------+------+-----------+-------------+
|01010710|     590|   SEA|        SFO|      Delayed|
|01010955|     590|   SEA|        SFO|      Delayed|
|01010730|     590|   SEA|        SFO|      On-time|
+--------+--------+------+-----------+-------------+



In [9]:
from pyspark.sql import SparkSession 
# Create a SparkSession
spark = (SparkSession
 .builder
 .enableHiveSupport()
 .appName("SparkSQLExampleApp")
 .getOrCreate())

In [10]:
employees = (spark
.read
.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/employees")
.option("driver", "com.mysql.jdbc.Driver")
.option("dbtable", "employees")
.option("user", "root")
.option("password", "jjzSx9PJ")
.load())

In [24]:
salaries= (spark
.read
.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/employees")
.option("driver", "com.mysql.jdbc.Driver")
.option("dbtable", "salaries")
.option("user", "root")
.option("password", "jjzSx9PJ")
.load())

In [28]:
titles= (spark
.read
.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/employees")
.option("driver", "com.mysql.jdbc.Driver")
.option("dbtable", "titles")
.option("user", "root")
.option("password", "jjzSx9PJ")
.load())

In [32]:
employees.join(salaries,employees.emp_no == salaries.emp_no).select("salary").show()

+------+
|salary|
+------+
| 40000|
| 43519|
| 46265|
| 46865|
| 47837|
| 52042|
| 52370|
| 53202|
| 56087|
| 59252|
| 62716|
| 67137|
| 67944|
| 67588|
| 71052|
| 40000|
| 44091|
| 45546|
| 49296|
| 48889|
+------+
only showing top 20 rows



In [37]:
employees.join(salaries,employees.emp_no == salaries.emp_no).join(titles,employees.emp_no == titles.emp_no).select("salary","title").show()

+------+----------------+
|salary|           title|
+------+----------------+
| 40000|Technique Leader|
| 43519|Technique Leader|
| 46265|Technique Leader|
| 46865|Technique Leader|
| 47837|Technique Leader|
| 52042|Technique Leader|
| 52370|Technique Leader|
| 53202|Technique Leader|
| 56087|Technique Leader|
| 59252|Technique Leader|
| 62716|Technique Leader|
| 67137|Technique Leader|
| 67944|Technique Leader|
| 67588|Technique Leader|
| 71052|Technique Leader|
| 40000|    Senior Staff|
| 44091|    Senior Staff|
| 45546|    Senior Staff|
| 49296|    Senior Staff|
| 48889|    Senior Staff|
+------+----------------+
only showing top 20 rows



Denserank sigue la numeración aunque haya elementos repetidos