In [1]:
import spark
import sys
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [2]:
# RDD:
#     Dependencies

#     Partitions (with some locality information)

#     Compute function: Partition => Iterator[T]

# First, a list of dependencies that instructs Spark how an RDD is constructed with its inputs is required.
# Second, partitions provide Spark the ability to split the work to parallelize computation 
# on partitions across executors.

# And finally, an RDD has a compute function that produces an Iterator[T] for the data 
# that will be stored in the RDD.

In [3]:
from pyspark import SparkContext
sc =SparkContext()
dataRDD = sc.parallelize([("Brooke", 20), ("Denny", 31), ("Jules", 30),("TD", 35), ("Brooke", 25)])

In [4]:
# Use map and reduceByKey transformations with their lambda 
# expressions to aggregate and then compute average
agesRDD = (dataRDD
  .map(lambda x: (x[0], (x[1], 1)))
  .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
  .map(lambda x: (x[0], x[1][0]/x[1][1])))

In [5]:
# same query with high-level DSL operators and the DataFrame API
# Some of these DSL operators perform relational-like operations that you’ll be familiar with if you know SQL, 
# such as selecting, filtering, grouping, and aggregation.

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
spark = (
    SparkSession.builder.appName("Data computation").getOrCreate()
)
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30), 
  ("TD", 35), ("Brooke", 25)], ["name", "age"])
avg_df = data_df.groupBy("name").agg(avg("age"))
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



In [6]:
# A schema in Spark defines the column names and associated data types for a DataFrame.
# Most often, schemas come into play when you are reading structured data from an external data source
from pyspark.sql.types import *
schema = StructType(
    [StructField("author", StringType(), False),
    StructField("title", StringType(), False),
    StructField("pages", IntegerType(), False)]
)
schema = "author STRING, title STRING, pages INT"
from pyspark.sql import SparkSession
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, \
  `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"

# Create our static data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter",
"LinkedIn"]],
       [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
       [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
       [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, 
["twitter", "FB"]],
       [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
       [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, 
["twitter", "LinkedIn"]]
      ]
spark = (SparkSession
     .builder
     .appName("Create_Schema")
     .getOrCreate())
schema_json_type = spark.createDataFrame(data, schema)
schema_json_type.show()
# print(schema.printSchema())

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [7]:
schema = StructType([
    StructField("Id_D",IntegerType(),False),
    StructField("First",StringType(),False),
    StructField("Last",StringType(),False),
    StructField("Url",StringType(),False),
    StructField("Published",StringType(),False),
    StructField("Hits",IntegerType(),False),
    StructField("Campaigns",ArrayType(StringType()),False)])

blog_df = spark.read.schema(schema).json('Data/blogs.json')
blog_df.show(n= 5)

+----+---------+-------+-----------------+---------+-----+--------------------+
|Id_D|    First|   Last|              Url|Published| Hits|           Campaigns|
+----+---------+-------+-----------------+---------+-----+--------------------+
|null|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|null|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|null|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|null|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|null|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+----+---------+-------+-----------------+---------+-----+--------------------+
only showing top 5 rows



In [8]:
# Columns and Expressions
# Rows
from pyspark.sql import Row
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", 
  ["twitter", "LinkedIn"])
blog_row[1]
# Row objects can be used to create DataFrames
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, ["Authors", "State"])
authors_df.show()


+-------------+-----+
|      Authors|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



In [9]:
# Common DataFrame Operations
#  you’ll first need to load a DataFrame from a data source that holds your structured data.
# Using DataFrameReader and DataFrameWriter
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
                StructField('UnitID', StringType(), True),
                StructField('IncidentNumber', IntegerType(), True),
                StructField('CallType', StringType(), True),                  
                StructField('CallDate', StringType(), True),      
                StructField('WatchDate', StringType(), True),
                StructField('CallFinalDisposition', StringType(), True),
                StructField('AvailableDtTm', StringType(), True),
                StructField('Address', StringType(), True),       
                StructField('City', StringType(), True),       
                StructField('Zipcode', IntegerType(), True),       
                StructField('Battalion', StringType(), True),                 
                StructField('StationArea', StringType(), True),       
                StructField('Box', StringType(), True),       
                StructField('OriginalPriority', StringType(), True),       
                StructField('Priority', StringType(), True),       
                StructField('FinalPriority', IntegerType(), True),       
                StructField('ALSUnit', BooleanType(), True),       
                StructField('CallTypeGroup', StringType(), True),
                StructField('NumAlarms', IntegerType(), True),
                StructField('UnitType', StringType(), True),
                StructField('UnitSequenceInCallDispatch', IntegerType(), True),
                StructField('FirePreventionDistrict', StringType(), True),
                StructField('SupervisorDistrict', StringType(), True),
                StructField('Neighborhood', StringType(), True),
                StructField('Location', StringType(), True),
                StructField('RowID', StringType(), True),
                StructField('Delay', FloatType(), True)])
sf_fire_file = 'Data/sf-fire-calls.csv'
file_df = spark.read.csv(sf_fire_file, header = True, schema = fire_schema)
file_df
# Save file apache parquest
# You can compare this with other .file(csv). It's useful
# parquet_path = file_df.write.format("parquet").save('data_fire_calls')

DataFrame[CallNumber: int, UnitID: string, IncidentNumber: int, CallType: string, CallDate: string, WatchDate: string, CallFinalDisposition: string, AvailableDtTm: string, Address: string, City: string, Zipcode: int, Battalion: string, StationArea: string, Box: string, OriginalPriority: string, Priority: string, FinalPriority: int, ALSUnit: boolean, CallTypeGroup: string, NumAlarms: int, UnitType: string, UnitSequenceInCallDispatch: int, FirePreventionDistrict: string, SupervisorDistrict: string, Neighborhood: string, Location: string, RowID: string, Delay: float]

In [10]:
# Transformations and actions
# Projections and filters
few_fire_df = (
    file_df.select("IncidentNumber", "AvailableDtTm", "CallType")
    .filter("CallType != 'Medical Incident'")
)
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [11]:
# pyspak SQl
from pyspark.sql.functions import *
(
    file_df
    .select("CallType")
    .where(col("CallType").isNotNull())
    .agg(countDistinct("CallType").alias("DistinctCallTypes"))
    .show()
)
(file_df
  .select("CallType")
  .where(col("CallType").isNotNull())
  .distinct()
  .show(10))

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|Confined Space / ...|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
+--------------------+
only showing top 10 rows



In [12]:
# Renaming, adding, and dropping columns

new_fire_df = file_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
  .select("ResponseDelayedinMins")
  .where(col("ResponseDelayedinMins") > 5)
  .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [13]:

# when we rename a column using withColumnRenamed() we get a new DataFrame 
# while retaining the original with the old column name.
fire_ts_df = (new_fire_df
  .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
  .drop("CallDate") 
  .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
  .drop("WatchDate") 
  .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), 
  "MM/dd/yyyy hh:mm:ss a"))
  .drop("AvailableDtTm"))

# Select the converted columns
(fire_ts_df
  .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
  .show(5, False))
# Processing data
# Convert the existing column’s data type from string to a Spark-supported timestamp
# Use the new format specified in the format string "MM/dd/yyyy" or "MM/dd/yyyy hh:mm:ss a" where appropriate.
# After converting to the new data type, drop() the old column and append the new one specified in 
# the first argument to the withColumn() method.


+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [15]:
# get year, get month, get day
(
    fire_ts_df
    .select(year("IncidentDate"))
    .distinct()
    .orderBy(year("IncidentDate"))
    .show(n = 5)
)

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
+------------------+
only showing top 5 rows



In [18]:
# Aggregations
(
    fire_ts_df
    .select("CallType")
    .where(col("CallType").isNotNull())
    .groupBy("CallType")
    .count()
    .orderBy("count", ascending = False)
    .show(n = 5, truncate = False)
)
# (fire_ts_df
#   .select("CallType")
#   .where(col("CallType").isNotNull())
#   .groupBy("CallType")
#   .count()
#   .orderBy("count", ascending=False)
#   .show(n=10, truncate=False))

+-----------------------------+------+
|CallType                     |count |
+-----------------------------+------+
|Medical Incident             |113794|
|Structure Fire               |23319 |
|Alarms                       |19406 |
|Traffic Collision            |7013  |
|Citizen Assist / Service Call|2524  |
+-----------------------------+------+
only showing top 5 rows



In [20]:
# Other common DataFrame operations
import pyspark.sql.functions as func
(
    fire_ts_df
    .select(func.sum("NumAlarms"), func.avg("ResponseDelayedinMins"),
            func.min("ResponseDelayedinMins"), func.max("ResponseDelayedinMins")).show()
)
# advanced statistical
# API documentation for methods like stat(), describe(), correlation(), covariance(), sampleBy(), 
# approxQuantile(), frequentItems(), and so on.

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



In [29]:
mnm_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load('Data/mnm_dataset.csv')
)
count_mnm_df = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending = False).show( n =5 )
    
)
count_mnm_df.explain(True)


+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   WA| Green| 1779|
|   OR|Orange| 1743|
|   TX| Green| 1737|
|   TX|   Red| 1725|
+-----+------+-----+
only showing top 5 rows



AttributeError: 'NoneType' object has no attribute 'explain'