In [1]:
# Use findspark to locate the Spark installation for this Python environment.
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession

# Configure the session one step at a time: local master, fixed application
# name, then obtain the session via getOrCreate().
session_builder = SparkSession.builder
session_builder = session_builder.master("local")
session_builder = session_builder.appName("PySpark Databricks")
spark = session_builder.getOrCreate()

print("Spark Session Details:", spark)

Spark Session Details: <pyspark.sql.session.SparkSession object at 0x000002C1BB617940>


In [2]:
# Load the retail CSV with the header row enabled and schema inference on.
retail_csv_path = "Data_Files/Retail.csv"
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load(retail_csv_path)

df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|RED WHITE HANGING...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-02 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-03 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [13]:
from pyspark.sql.functions import col

# A parenthesised column list inside a SQL expression yields a struct column.
tuple_expr = "(Description, InvoiceNo) as complex"
df.selectExpr(tuple_expr, "*").limit(2).show(truncate=False)

# The same struct column, this time written with the explicit struct(...) form.
# Note: orderBy is applied after limit(2), so it only sorts the two rows that
# the limit kept.
struct_expr = "struct(Description, InvoiceNo) as complex"
(
    df.selectExpr(struct_expr, "*")
    .limit(2)
    .orderBy(col("StockCode").asc())
    .show(truncate=False)
)

+----------------------------------------------------+---------+---------+------------------------------------------+--------+-------------------+---------+----------+--------------+
|complex                                             |InvoiceNo|StockCode|Description                               |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+----------------------------------------------------+---------+---------+------------------------------------------+--------+-------------------+---------+----------+--------------+
|{RED WHITE HANGING RED HEART T-LIGHT HOLDER, 536365}|536365   |85123A   |RED WHITE HANGING RED HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|{WHITE METAL LANTERN, 536365}                       |536365   |71053    |WHITE METAL LANTERN                       |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+----------------------------------------------------+---------+---------+-----------

In [16]:
from pyspark.sql.functions import struct

# Pack Description and InvoiceNo into one struct column named "complex".
complex_column = struct("Description", "InvoiceNo").alias("complex")
complexDF = df.select(complex_column)

# Register the frame as a temp view so it can be queried from SQL.
complexDF.createOrReplaceTempView("complexDFTable")

In [20]:
# Three ways to read fields out of the "complex" struct column.

# 1. Dot notation inside the selected column name.
invoice_numbers = complexDF.select("complex.InvoiceNo")
invoice_numbers.limit(5).show()

# 2. getField() on the struct column.
description_field = col("complex").getField("Description")
complexDF.select(description_field).limit(5).show()

# 3. Star expansion: each struct field comes back as its own column.
complexDF.select("complex.*").show(2)

+---------+
|InvoiceNo|
+---------+
|   536365|
|   536365|
|   536365|
|   536365|
|   536365|
+---------+

+--------------------+
| complex.Description|
+--------------------+
|RED WHITE HANGING...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
+--------------------+

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|RED WHITE HANGING...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



In [22]:
# Star-expand the struct column through SQL against the registered temp view.
struct_fields_query = """
SELECT
    complex.*
FROM complexDFTable
"""
spark.sql(struct_fields_query).show()

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|RED WHITE HANGING...|   536365|
| WHITE METAL LANTERN|   536365|
|CREAM CUPID HEART...|   536365|
|KNITTED UNION FLA...|   536365|
|RED WOOLLY HOTTIE...|   536365|
|SET 7 BABUSHKA NE...|   536365|
|GLASS STAR FROSTE...|   536365|
|HAND WARMER UNION...|   536366|
|HAND WARMER RED P...|   536366|
|ASSORTED COLOUR B...|   536367|
|                null|   536367|
+--------------------+---------+



In [55]:
# Arrays
from pyspark.sql.functions import split, size, array_contains

# Split each Description on a single space to obtain an array of words.
description_words = split(col("Description"), " ")
df.select(description_words).show(truncate=False)

+--------------------------------------------------+
|split(Description,  , -1)                         |
+--------------------------------------------------+
|[RED, WHITE, HANGING, RED, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                           |
|[CREAM, CUPID, HEARTS, COAT, HANGER]              |
|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]        |
|[RED, WOOLLY, HOTTIE, WHITE, HEART.]              |
|[SET, 7, BABUSHKA, NESTING, BOXES]                |
|[GLASS, STAR, FROSTED, T-LIGHT, HOLDER]           |
|[HAND, WARMER, UNION, JACK]                       |
|[HAND, WARMER, RED, POLKA, DOT]                   |
|[ASSORTED, COLOUR, BIRD, ORNAMENT]                |
|null                                              |
+--------------------------------------------------+



In [56]:
# Array Index
# Name the word array first, then select element 0 next to the full array.
words_df = df.select(split(col("Description"), " ").alias("split_array"))
split_df = words_df.selectExpr("split_array[0]", "split_array")

split_df.show()
    

+--------------+--------------------+
|split_array[0]|         split_array|
+--------------+--------------------+
|           RED|[RED, WHITE, HANG...|
|         WHITE|[WHITE, METAL, LA...|
|         CREAM|[CREAM, CUPID, HE...|
|       KNITTED|[KNITTED, UNION, ...|
|           RED|[RED, WOOLLY, HOT...|
|           SET|[SET, 7, BABUSHKA...|
|         GLASS|[GLASS, STAR, FRO...|
|          HAND|[HAND, WARMER, UN...|
|          HAND|[HAND, WARMER, RE...|
|      ASSORTED|[ASSORTED, COLOUR...|
|          null|                null|
+--------------+--------------------+



In [57]:
# Array length
# Show each array's element count beside the array itself
# (the recorded output reports -1 for the null array).
array_length = size(col("split_array"))
split_df.select(array_length, "split_array").show(truncate=False)

+-----------------+--------------------------------------------------+
|size(split_array)|split_array                                       |
+-----------------+--------------------------------------------------+
|7                |[RED, WHITE, HANGING, RED, HEART, T-LIGHT, HOLDER]|
|3                |[WHITE, METAL, LANTERN]                           |
|5                |[CREAM, CUPID, HEARTS, COAT, HANGER]              |
|6                |[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]        |
|5                |[RED, WOOLLY, HOTTIE, WHITE, HEART.]              |
|5                |[SET, 7, BABUSHKA, NESTING, BOXES]                |
|5                |[GLASS, STAR, FROSTED, T-LIGHT, HOLDER]           |
|4                |[HAND, WARMER, UNION, JACK]                       |
|5                |[HAND, WARMER, RED, POLKA, DOT]                   |
|4                |[ASSORTED, COLOUR, BIRD, ORNAMENT]                |
|-1               |null                                              |
+-----

In [59]:
# Flag, per row, whether the word array contains the element "HAND".
has_hand = array_contains("split_array", "HAND")
split_df.select(has_hand).show()

+---------------------------------+
|array_contains(split_array, HAND)|
+---------------------------------+
|                            false|
|                            false|
|                            false|
|                            false|
|                            false|
|                            false|
|                            false|
|                             true|
|                             true|
|                            false|
|                             null|
+---------------------------------+

