In [5]:
# In Python 

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
# Create a DataFrame using SparkSession
spark = (SparkSession
 .builder
 .appName("AuthorsAges")
 .getOrCreate())
# Create a DataFrame 


In [2]:
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30),
 ("TD", 35), ("Brooke", 25)], ["name", "age"])
# Group the same names together, aggregate their ages, and compute an average
avg_df = data_df.groupBy("name").agg(avg("age"))
# Show the results of the final execution
avg_df.show()


+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



In [3]:
from pyspark.sql.types import *
schema = StructType([StructField("author", StringType(), False),
StructField("title", StringType(), False),
StructField("pages", IntegerType(), False)])

## Con DDL 
# schema = "author STRING, title STRING, pages INT"


In [4]:
data_df.countDistinct("age")

AttributeError: 'DataFrame' object has no attribute 'countDistinct'

In [6]:
# In Python 
from pyspark.sql import SparkSession
# Define schema for our data using DDL 
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
# Create our static data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter",

"LinkedIn"]],
 [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
 [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
 [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
["twitter", "FB"]],
 [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
 [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
["twitter", "LinkedIn"]]
 ]
# Main program
# if __name__ == "__main__":
 # Create a SparkSession
spark = (SparkSession
 .builder
 .appName("Example-3_6")
 .getOrCreate())
 # Create a DataFrame using the schema defined above
blogs_df = spark.createDataFrame(data, schema)
 # Show the DataFrame; it should reflect our table above
blogs_df.show()
 # Print the schema used by Spark to process the DataFrame
print(blogs_df.printSchema())


+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (

## Row

In [None]:
from pyspark.sql import Row
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015",
 ["twitter", "LinkedIn"])
# access using index for individual items
blog_row[1]

In [None]:
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, ["Authors", "State"])
authors_df.show()

## Using DataFrameReader and DataFrameWriter

In [7]:
# In Python, define a schema
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Programmatic way to define a schema
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
 StructField('UnitID', StringType(), True),
 StructField('IncidentNumber', IntegerType(), True),
 StructField('CallType', StringType(), True),
 StructField('CallDate', StringType(), True),
 StructField('WatchDate', StringType(), True),
 StructField('CallFinalDisposition', StringType(), True),
 StructField('AvailableDtTm', StringType(), True),
 StructField('Address', StringType(), True),
 StructField('City', StringType(), True),
 StructField('Zipcode', IntegerType(), True),
 StructField('Battalion', StringType(), True),
 StructField('StationArea', StringType(), True),
 StructField('Box', StringType(), True),
 StructField('OriginalPriority', StringType(), True),
 StructField('Priority', StringType(), True),
 StructField('FinalPriority', IntegerType(), True),
 StructField('ALSUnit', BooleanType(), True),
 StructField('CallTypeGroup', StringType(), True),
 StructField('NumAlarms', IntegerType(), True),
 StructField('UnitType', StringType(), True),
 StructField('UnitSequenceInCallDispatch', IntegerType(), True),
 StructField('FirePreventionDistrict', StringType(), True),
 StructField('SupervisorDistrict', StringType(), True),
 StructField('Neighborhood', StringType(), True),
 StructField('Location', StringType(), True),
 StructField('RowID', StringType(), True),
 StructField('Delay', FloatType(), True)])
# Use the DataFrameReader interface to read a CSV file
sf_fire_file = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)


## Saving a DataFrame as a Parquet file or SQL table

In [None]:
parquet_path = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/parquet/fairDF.parquet"
fire_df.repartition(1).write.mode("overwrite").format("parquet").save(parquet_path)

## Projections and filters

In [None]:
few_fire_df = (fire_df
 .select("IncidentNumber", "AvailableDtTm", "CallType")
 .where(col("CallType") != "Medical Incident"))
few_fire_df.show(5, truncate=False)

In [8]:
# return number of distinct types of calls using countDistinct()
from pyspark.sql.functions import *
(fire_df.select("CallType")
 .where(col("CallType").isNotNull())
 .agg(countDistinct("CallType").alias("DistinctCallTypes"))
 .show())


Py4JError: org.apache.spark.sql.functions.count_distinct does not exist in the JVM

In [9]:
from pyspark.sql.functions import *
(fire_df.select("CallType")
 .where(col("CallType").isNotNull())
 .distinct().count())

30

In [None]:
# In Python, filter for only distinct non-null CallTypes from all the rows
(fire_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .distinct()
 .show(10, False))


## Renaming, adding, and dropping columns

In [11]:
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
 .select("ResponseDelayedinMins")
 .where(col("ResponseDelayedinMins") > 5)
 .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [None]:
 fire_df.select("Delay").where(col("Delay") > 5).show(5,False)

1. Convert the existing column’s data type from string to a Spark-supported timestamp.

2. Use the new format specified in the format string "MM/dd/yyyy" or "MM/dd/yyyy hh:mm:ss a" where appropriate.

3. After converting to the new data type, drop() the old column and append the new one specified in the first argument to the withColumn() method.

4. Assign the new modified DataFrame to fire_ts_df.

The queries result in three new columns

In [12]:
fire_ts_df = (new_fire_df
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm"))
# Select the converted columns
(fire_ts_df
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, False))


+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [None]:
(fire_ts_df
 .select(year('IncidentDate'))
 .distinct()
 .orderBy(year('IncidentDate'))
 .show())

In [None]:
(fire_ts_df
 .select("CallType")
 .where(col("CallType").isNotNull())
 .groupBy("CallType")
 .count()
 .orderBy("count", ascending=False)
 .show(n=10, truncate=False))

## Other common DataFrame operations

In [13]:
import pyspark.sql.functions as F
(fire_ts_df
 .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
 F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
 .show())


+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



## End-to-End DataFrame Example

**1) How many distinct types of calls were made to the Fire Department?**
**2) What are distinct types of calls were made to the Fire Department?**
**3) 

In [14]:
fire_df.select("CallType").where(col("CallType").isNotNull()).distinct().count()

30

**2) What are distinct types of calls were made to the Fire Department?**

In [18]:
fire_df.select("CallType").distinct().show(10, False)

+-----------------------------------+
|CallType                           |
+-----------------------------------+
|Elevator / Escalator Rescue        |
|Marine Fire                        |
|Aircraft Emergency                 |
|Confined Space / Structure Collapse|
|Administrative                     |
|Alarms                             |
|Odor (Strange / Unknown)           |
|Citizen Assist / Service Call      |
|HazMat                             |
|Watercraft in Distress             |
+-----------------------------------+
only showing top 10 rows



**3) Find out all response or delayed times greater than 5 minutes**

In [30]:
fire_ts_df.select("ResponseDelayedinMins").where(col("ResponseDelayedinMins")<5).orderBy(desc("ResponseDelayedinMins")).show(10, False)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
|4.983333             |
+---------------------+
only showing top 10 rows



**4) What are the most common call types?**

In [33]:
fire_ts_df.select("CallType").groupBy("CallType").count().orderBy(desc("count")).show(10, False)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



**5) What San Francisco neighborhoods are in zip codes 94102 and 94103**

In [37]:
fire_ts_df.select("Neighborhood","ZipCode").where((col("ZipCode") == 94102) | (col("ZipCode") == 94103)).distinct().show(10, False)

+------------------------------+-------+
|Neighborhood                  |ZipCode|
+------------------------------+-------+
|Potrero Hill                  |94103  |
|Western Addition              |94102  |
|Tenderloin                    |94102  |
|Nob Hill                      |94102  |
|Castro/Upper Market           |94103  |
|South of Market               |94102  |
|South of Market               |94103  |
|Hayes Valley                  |94103  |
|Financial District/South Beach|94102  |
|Mission Bay                   |94103  |
+------------------------------+-------+
only showing top 10 rows



**6) How many distinct years of data is in the CSV file?**

In [40]:
fire_ts_df.select(year("IncidentDate")).distinct().orderBy(year("IncidentDate")).show(10, False)

+------------------+
|year(IncidentDate)|
+------------------+
|2000              |
|2001              |
|2002              |
|2003              |
|2004              |
|2005              |
|2006              |
|2007              |
|2008              |
|2009              |
+------------------+
only showing top 10 rows



**7) What week of the year in 2018 had the most fire calls?**

In [41]:
fire_ts_df.filter(year("IncidentDate") == 2018).groupBy(weekofyear("IncidentDate")).count().orderBy(desc("count")).show(10, False)

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|22                      |259  |
|40                      |255  |
|43                      |250  |
|25                      |249  |
|1                       |246  |
|44                      |244  |
|32                      |243  |
|13                      |243  |
|11                      |240  |
|5                       |236  |
+------------------------+-----+
only showing top 10 rows



## Typed Objects, Untyped Objects, and Generic Rows

In [None]:
from pyspark.sql import Row
row = Row(350, True, "Learning Spark 2E", None)

In [None]:
row[0]

In [None]:
row[1]

In [None]:
row[2]