In [1]:
from pyspark.sql import SparkSession

# Configure a local-mode session named "PySpark SQL"; getOrCreate() returns
# an already-running session if there is one instead of building a new one.
session_builder = SparkSession.builder.master("local[5]").appName("PySpark SQL")
spark = session_builder.getOrCreate()

# Leave the session as the cell's last expression so its summary is rendered.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/21 13:54:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/21 13:54:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


1. Create a SQL View (a *temporary table*)

In [9]:
# Load the zip-code CSV into a DataFrame, treating the first line as the header.
csv_reader = spark.read.option("header", True)
df = csv_reader.csv("DataFiles/simple-zipcodes.csv")

# Inspect the resulting schema and preview the first 10 rows.
df.printSchema()
df.show(10)

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|       49347|     US|               HOLT|  32564|   FL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
|       61391|     US|  CINGULAR WIRELESS|  76166|   TX|
|       61392|     US|         FORT WORTH|  76177|   TX|
|       61393|     US|           FT WORTH|  76177|   TX|
|       54356|     US|        SPRUCE PINE|  35585|   AL|
|       76511|     US|           ASH HILL|  27007|   NC|
+------------+-------+-------------------+-------+-----+
only showing top 10 rows

In [10]:
# Creating a SQL view
# Register the DataFrame loaded earlier as the temporary view "ZipCodes"
# instead of reading the CSV a second time. This keeps the DataFrame-API and
# SQL examples on the same data and avoids duplicating the reader options.
# createOrReplaceTempView overwrites any existing view with the same name,
# so re-running this cell is safe.
df.createOrReplaceTempView("ZipCodes")

1. Selecting Columns

In [11]:
# DataFrame API: project the four columns with select() and preview 5 rows.
selected_columns = ["country", "city", "zipcode", "state"]
df.select(*selected_columns).show(5)

# SQL: the same projection, run against the ZipCodes temporary view.
select_query = '''
    SELECT country, city, zipcode, state
    FROM ZipCodes
    '''
spark.sql(select_query).show(5)

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
+-------+-------------------+-------+-----+
only showing top 5 rows

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
+-------+-------------------+-------+-----+
only showing top 5 rows



2. Filter Rows

In [16]:
# DataFrame API: keep only the rows whose state is "AZ" and preview up to 5.
(
    df.select("country", "city", "zipcode", "state")
    .where(df.State == "AZ")
    .show(5)
)

# using SQL: the same filter against the ZipCodes temporary view.
# The standard `=` comparison is used instead of Spark's `==` extension, and
# the row limit matches the DataFrame variant so both previews are equivalent.
spark.sql(
    '''
    SELECT country, city, zipcode, state
    FROM ZipCodes
    WHERE state = 'AZ'
    '''
).show(5)

+-------+----+-------+-----+
|country|city|zipcode|state|
+-------+----+-------+-----+
|     US|MESA|  85209|   AZ|
|     US|MESA|  85210|   AZ|
+-------+----+-------+-----+

+-------+----+-------+-----+
|country|city|zipcode|state|
+-------+----+-------+-----+
|     US|MESA|  85209|   AZ|
|     US|MESA|  85210|   AZ|
+-------+----+-------+-----+



3. Sorting

In [19]:
# DataFrame API: restrict to three states, order by state, preview 10 rows.
states_of_interest = ['PR', 'AZ', 'FL']
(
    df.select("country", "city", "zipcode", "state")
    .where(df.State.isin(states_of_interest))
    .orderBy("state")
    .show(10)
)

# SQL: the same filter and ordering against the ZipCodes temporary view.
sort_query = '''
    SELECT country, city, zipcode, state
    FROM ZipCodes
    WHERE state IN ('PR', 'AZ', 'FL')
    ORDER BY state
    '''
spark.sql(sort_query).show(10)

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|               MESA|  85209|   AZ|
|     US|               MESA|  85210|   AZ|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
|     US|           HILLIARD|  32046|   FL|
|     US|             HOLDER|  34445|   FL|
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|    URB EUGENE RICE|    704|   PR|
+-------+-------------------+-------+-----+
only showing top 10 rows

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|               MESA|  85209|   AZ|
|     US|               MESA|  85210|   AZ|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
|     US|           HILLIARD|  32046|   FL|
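|     US|             HOLDER|  34445|   FL|
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|    URB EUGENE RICE|    704|   PR|
+-------+-------------------+-------+-----+
only showing top 10 rows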

4. Grouping

In [21]:
# DataFrame API: count the rows in each state group.
rows_per_state = df.groupBy("state").count()
rows_per_state.show()

# SQL: the same aggregation against the ZipCodes temporary view.
group_query = '''
    SELECT state, count(*) as Count
    FROM ZipCodes
    GROUP BY state
    '''
spark.sql(group_query).show()

+-----+-----+
|state|count|
+-----+-----+
|   AZ|    2|
|   NC|    3|
|   AL|    3|
|   TX|    3|
|   FL|    4|
|   PR|    5|
+-----+-----+

+-----+-----+
|state|Count|
+-----+-----+
|   AZ|    2|
|   NC|    3|
|   AL|    3|
|   TX|    3|
|   FL|    4|
|   PR|    5|
+-----+-----+



In [22]:
# Stop the SparkSession now that the examples above have run.
spark.stop()