In [0]:
import pyspark

spark = pyspark.sql.SparkSession.builder.appName("demotypes").getOrCreate()



In [0]:
df = spark.read.format("csv")\
    .option("header", "true")\
        .option("inferSchema", "true")\
            .load("/Volumes/workspace/myschema/myvolume/retail_2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
from pyspark.sql.functions import instr, col
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [0]:
from pyspark.sql.functions import expr, round

df.withColumn("IsExpensive", expr("UnitPrice > 250")).filter("IsExpensive").select("Description", "UnitPrice", round(df.UnitPrice)).show(5)
df.where(col("Description").eqNullSafe("hello"))

+--------------+---------+-------------------+
|   Description|UnitPrice|round(UnitPrice, 0)|
+--------------+---------+-------------------+
|DOTCOM POSTAGE|   569.77|              570.0|
|DOTCOM POSTAGE|   607.49|              607.0|
+--------------+---------+-------------------+



DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, lit, ltrim, rtrim, rpad, lpad, trim

# df.select(monotonically_increasing_id()).show(5)
df.select(
    ltrim(lit("         HELLO        ")).alias("ltrim"),
    rtrim(lit("         HELLO        ")).alias("rtrim"),
    lpad(lit("HELLO"), 10, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad"),
    trim(lit("         HELLO        ")).alias("trim")
).show(3)

+-------------+--------------+----------+----------+-----+
|        ltrim|         rtrim|      lpad|      rpad| trim|
+-------------+--------------+----------+----------+-----+
|HELLO        |         HELLO|     HELLO|HELLO     |HELLO|
|HELLO        |         HELLO|     HELLO|HELLO     |HELLO|
|HELLO        |         HELLO|     HELLO|HELLO     |HELLO|
+-------------+--------------+----------+----------+-----+
only showing top 3 rows


In [0]:
from pyspark.sql.functions import col, current_date, current_timestamp, date_add, date_sub, datediff

dateDF = spark.range(10).withColumn("today", current_date()).withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

dateDF.select(date_add(dateDF.today, 5), date_sub(dateDF.today, 5)).show()
dateDF.withColumn("SeveralDaysAgo", date_sub(dateDF.today, 18)).select(datediff(col("today"), col("SeveralDaysAgo"))).show()

+------------------+------------------+
|date_add(today, 5)|date_sub(today, 5)|
+------------------+------------------+
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
|        2025-12-30|        2025-12-20|
+------------------+------------------+

+-------------------------------+
|datediff(today, SeveralDaysAgo)|
+-------------------------------+
|                             18|
|                             18|
|                             18|
|                             18|
|                             18|
|                             18|
|                             18|
|                             18|
|                             18|
|                             1

In [0]:
from pyspark.sql.functions import to_date, to_timestamp, lit, col

date_format = "yyy-dd-MM"
timestamp_format = "yyyy-MM-dd HH:mm:ss"
dateDF = spark.range(3).select(
    to_date(lit("2020-20-11"), date_format).alias("date"),
    to_timestamp(lit("2020-10-07 18:44:12"), timestamp_format).alias("timestamp")
)
dateDF.filter(col("date") > lit("2018-10-11")).display()

dateDF.na.drop("all", subset=["date", "timestamp"])
df.na.fill({"time":to_date(lit("1999-01-01")), "timestamp":to_timestamp(lit("1999-01-01 00:00:00"))})

date,timestamp
2020-11-20,2020-10-07T18:44:12.000Z
2020-11-20,2020-10-07T18:44:12.000Z
2020-11-20,2020-10-07T18:44:12.000Z


In [0]:
from pyspark.sql.functions import struct, col, split, size, explode

complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexTable")
complexDF.select(col("complex.Description"), col("complex").getField("InvoiceNo")).show(2)

split_df = df.withColumn("split_descr", split(col("Description"), " "))
split_df.selectExpr("split_descr[0]").show(2)
split_df.select(size(col("split_descr"))).show(2)

split_df.select(col("Description"), col("split_descr")).withColumn("exploded", explode(col("split_descr"))).show(3)

+--------------------+-----------------+
|         Description|complex.InvoiceNo|
+--------------------+-----------------+
|WHITE HANGING HEA...|           536365|
| WHITE METAL LANTERN|           536365|
+--------------------+-----------------+
only showing top 2 rows
+--------------+
|split_descr[0]|
+--------------+
|         WHITE|
|         WHITE|
+--------------+
only showing top 2 rows
+-----------------+
|size(split_descr)|
+-----------------+
|                5|
|                3|
+-----------------+
only showing top 2 rows
+--------------------+--------------------+--------+
|         Description|         split_descr|exploded|
+--------------------+--------------------+--------+
|WHITE HANGING HEA...|[WHITE, HANGING, ...|   WHITE|
|WHITE HANGING HEA...|[WHITE, HANGING, ...| HANGING|
|WHITE HANGING HEA...|[WHITE, HANGING, ...|   HEART|
+--------------------+--------------------+--------+
only showing top 3 rows


In [0]:
from pyspark.sql.functions import create_map, col

df.select(create_map(col("InvoiceNo"), col("Description")).alias("complex_map")).selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            NULL|
|                            NULL|
+--------------------------------+
only showing top 2 rows


In [0]:
jsonDF = spark.range(1).selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")
).show(2)

+------+--------------------+
|column|                  c0|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



In [0]:
from pyspark.sql.functions import from_json, to_json, col
from pyspark.sql.types import *

json_schema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True)
    ))
df_struct = df.selectExpr("(InvoiceNo, Description) as myStruct")
df_json = df_struct.select(
    to_json(col("myStruct")).alias("parsed")
)
result_df = df_json.select(
    from_json(col("parsed"), json_schema).alias("newJSON"),
    col("parsed")
)
result_df.show(4)

+--------------------+--------------------+
|             newJSON|              parsed|
+--------------------+--------------------+
|{536365, WHITE HA...|{"InvoiceNo":"536...|
|{536365, WHITE ME...|{"InvoiceNo":"536...|
|{536365, CREAM CU...|{"InvoiceNo":"536...|
|{536365, KNITTED ...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 4 rows


In [0]:
def spending_tier(total_amount):
    try:
        if total_amount is None:
            return "UNKNOWN"
        elif total_amount < 100:
            return "LOW"
        elif total_amount < 500:
            return "MEDIUM"
        else:
            return "HIGH"
    except Exception as e:
        return None
    
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

spending_tier_udf = udf(spending_tier, StringType())
df = spark.createDataFrame(
    [(1, 45.0), (2, 220.0), (3, 890.0), (4, None)],
    ["customer_id", "total_amount"]
)
df.withColumn(
    "spending_tier",
    spending_tier_udf(col("total_amount"))
).show()

spark.udf.register("spending_tier", spending_tier, StringType())


+-----------+------------+-------------+
|customer_id|total_amount|spending_tier|
+-----------+------------+-------------+
|          1|        45.0|          LOW|
|          2|       220.0|       MEDIUM|
|          3|       890.0|         HIGH|
|          4|        NULL|      UNKNOWN|
+-----------+------------+-------------+

