# Chapter 3. Apache Spark's Structured APIs

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Get the active SparkSession, or start one named "Example-3_6" if none exists
spark = SparkSession.builder.appName("Example-3_6").getOrCreate()
# Leave the session as the cell's last expression so the notebook displays it
spark

## Schemas and Creating DataFrames

In [None]:
# Schema for the blog data as a single DDL-style string, one entry per column
schema = ", ".join([
    "`Id` INT",
    "`First` STRING",
    "`Last` STRING",
    "`Url` STRING",
    "`Published` STRING",
    "`Hits` INT",
    "`Campaigns` ARRAY<STRING>",
])

# Sample rows; each row lists its values in the same order as the schema columns
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [None]:
# Build the DataFrame from the sample rows, applying the schema string to them
blogs_df = spark.createDataFrame(data, schema=schema)
# Display the rows so they can be checked against the sample data
blogs_df.show()

In [None]:
# Print the schema Spark holds for blogs_df, to check that the columns
# declared in the schema string were applied to the DataFrame
blogs_df.printSchema()

## Columns and Expressions

In [None]:
# Add a "Big Hitters" column computed from the expression Hits > 10000
(blogs_df
 .withColumn("Big Hitters", expr("Hits > 10000"))
 .show())
# Cell result: one-line string form of the schema of the original blogs_df
blogs_df.schema.simpleString()

In [None]:
# Concatenate First, Last and Id into one "AuthorsId" column; show 4 rows of it
(blogs_df
 .withColumn("AuthorsId", concat("First", "Last", "Id"))
 .select("AuthorsId")
 .show(4))

In [None]:
# Three ways to compute Hits * 2: arithmetic on an expr() column, arithmetic
# on a col() column, and the whole computation written inside expr()
blogs_df.select(expr("Hits") * 2).show(n=2)
blogs_df.select(col("Hits") * 2).show(n=2)
blogs_df.select(expr("Hits * 2")).show(n=2)

In [None]:
# List the blogs from highest Id to lowest
blogs_df.orderBy(col("Id").desc()).show()

## Rows

In [None]:
from pyspark.sql import Row

# A Row holds its values positionally. The fields are listed in the same order
# as the blogs schema (Id, First, Last, Url, Published, Hits, Campaigns) and
# use the same values as the Id 6 entry in the sample data.
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"])
# access using index for individual items; index 1 is the first name
blog_row[1]

In [None]:
# Build a small DataFrame directly from Row objects, naming the two columns
rows = [
    Row("Matei Zaharia", "CA"),
    Row("Reynold Xin", "CA"),
]
authors_df = spark.createDataFrame(rows, schema=["Authors", "State"])
authors_df.show()

## Common DataFrame Operations

In [None]:
# Schema for the fire-calls data as one DDL-style string, one entry per column
fire_schema = ", ".join([
    "`CallNumber` int",
    "`UnitID` string",
    "`IncidentNumber` int",
    "`CallType` string",
    "`CallDate` string",
    "`WatchDate` string",
    "`CallFinalDisposition` string",
    "`AvailableDtTm` string",
    "`Address` string",
    "`City` string",
    "`Zipcode` int",
    "`Battalion` string",
    "`StationArea` string",
    "`Box` string",
    "`OriginalPriority` string",
    "`Priority` string",
    "`FinalPriority` int",
    "`ALSUnit` boolean",
    "`CallTypeGroup` string",
    "`NumAlarms` int",
    "`UnitType` string",
    "`UnitSequenceInCallDispatch` int",
    "`FirePreventionDistrict` string",
    "`SupervisorDistrict` string",
    "`Neighborhood` string",
    "`Location` string",
    "`RowID` string",
    "`Delay` float",
])

In [None]:
# Load the fire-calls CSV through the DataFrameReader, treating the first
# line as a header and applying fire_schema to the columns
sf_fire_file = "../data/sf-fire-calls.csv"
fire_df = (
    spark.read
    .option("header", "true")
    .schema(fire_schema)
    .csv(sf_fire_file)
)

In [None]:
# Save as a Parquet file.
# mode("overwrite") replaces output left behind by an earlier run. Without it
# the writer's default save mode raises an error when the target path already
# exists, so this cell would fail every time the notebook is re-run.
parquet_path = '../data_output/chapter3'
fire_df.write.format("parquet").mode("overwrite").save(parquet_path)

In [None]:
# Save as a SQL table.
# mode("overwrite") replaces a "firecalls" table left behind by an earlier
# run. Without it the writer's default save mode raises an error when the
# table already exists, so this cell would fail when the notebook is re-run.
parquet_table = "firecalls"
fire_df.write.format("parquet").mode("overwrite").saveAsTable(parquet_table)

In [None]:
# Query the "firecalls" table through Spark SQL and count the rows returned
spark.sql("select * from firecalls").count()

In [None]:
# Count calls per unit and list the units from most calls to fewest
(fire_df
 .select("UnitID")
 .groupBy("UnitID")
 .agg(count("UnitID").alias("NumberOfCalls"))
 .orderBy(col("NumberOfCalls").desc())
 .show())

In [None]:
# Projection + filter: keep three columns and drop every "Medical Incident" row
few_fire_df = (
    fire_df
    .select("IncidentNumber", "AvailableDtTm", "CallType")
    .filter(col("CallType") != "Medical Incident")
)
few_fire_df.show(n=5, truncate=False)

In [None]:
# Count how many different non-null CallType values exist, via countDistinct()
(
    fire_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .agg(countDistinct("CallType").alias("DistinctCallTypes"))
    .show()
)

In [None]:
# List each distinct non-null CallType value (up to 100 rows, untruncated)
(
    fire_df
    .select("CallType")
    .distinct()
    .filter(col("CallType").isNotNull())
    .show(n=100, truncate=False)
)

In [None]:
# Rename the Delay column, then show five delays longer than 5
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(
    new_fire_df
    .select("ResponseDelayedinMins")
    .filter(col("ResponseDelayedinMins") > 5)
    .show(n=5, truncate=False)
)

In [None]:
# Parse the string date columns into date/timestamp columns under new names,
# then drop the original string columns
fire_ts_df = (
    new_fire_df
    .withColumn("IncidentDate", to_date("CallDate", "MM/dd/yyyy"))
    .withColumn("OnWatchDate", to_date("WatchDate", "MM/dd/yyyy"))
    .withColumn("AvailableDtTS",
                to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
    .drop("CallDate", "WatchDate", "AvailableDtTm")
)
# Preview the three converted columns
(
    fire_ts_df
    .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
    .show(n=5, truncate=False)
)

In [None]:
# Which years of Fire Department calls does the data set cover?
(
    fire_ts_df
    .select(year(col("IncidentDate")).alias("IncidentDateYear"))
    .distinct()
    .orderBy(col("IncidentDateYear"))
    .show()
)

In [None]:
# Ten most frequent call types: count rows per non-null CallType and list
# the largest counts first
(
    fire_ts_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .groupBy("CallType")
    .count()
    .orderBy(col("count").desc())
    .show(10, truncate=False)
)

In [None]:
import pyspark.sql.functions as F

# Summary statistics over all rows: total alarms, then the average, minimum
# and maximum of the response delay
(
    fire_ts_df
    .select(
        F.sum("NumAlarms"),
        *[stat("ResponseDelayedinMins") for stat in (F.avg, F.min, F.max)],
    )
    .show()
)

In [None]:
# Stop the SparkSession now that the notebook is finished
spark.stop()