In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, date_add, date_sub, datediff, months_between, current_date, current_timestamp, year, month, dayofmonth, date_format

# Obtain (or reuse) the Spark session used by the date-function examples
spark = (
    SparkSession.builder
    .appName("Date Functions")
    .getOrCreate()
)

# Sample orders as (id, order date, delivery date); the dates start out as strings
data = [
    (1, "2024-11-10", "2024-11-20"),
    (2, "2023-10-05", "2023-10-15"),
    (3, "2022-05-01", "2022-05-10"),
]

# Column references reused throughout the chain below
order_col = col("order_date")
delivery_col = col("delivery_date")

df = (
    spark.createDataFrame(data, ["id", "order_date", "delivery_date"])
    # Parse both string columns with the yyyy-MM-dd pattern
    .withColumn("order_date", to_date(order_col, "yyyy-MM-dd"))
    .withColumn("delivery_date", to_date(delivery_col, "yyyy-MM-dd"))
    # Stamp every row with the current date and timestamp
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    # Days elapsed from order to delivery (delivery minus order)
    .withColumn("days_to_delivery", datediff(delivery_col, order_col))
    # Delivery date shifted five days forward and five days back
    .withColumn("delivery_plus_5", date_add(delivery_col, 5))
    .withColumn("delivery_minus_5", date_sub(delivery_col, 5))
    # Months from order to delivery
    .withColumn("months_between_order_delivery", months_between(delivery_col, order_col))
    # Today minus the delivery date, in days
    .withColumn("days_from_today_to_delivery", datediff(current_date(), delivery_col))
    # Break the order date into its year / month / day-of-month parts
    .withColumn("year", year(order_col))
    .withColumn("month", month(order_col))
    .withColumn("day", dayofmonth(order_col))
    # Render the order date as text using the "MMMM dd, yyyy" pattern
    .withColumn("formatted_date", date_format(order_col, "MMMM dd, yyyy"))
)

# Show results
display(df)

FILTER

In [0]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for the filtering examples
spark = (
    SparkSession.builder
    .appName("PySpark Filtering Example")
    .getOrCreate()
)

# Employee records laid out as (name, department, salary)
columns = ["name", "department", "salary"]
data = [
    ("Alice", "HR", 5000),
    ("Bob", "IT", 6000),
    ("Charlie", "Finance", 7000),
    ("David", "IT", 4000),
    ("Eve", "HR", 5500),
    ("Frank", "Finance", 8000),
]
df = spark.createDataFrame(data, columns)

# Unfiltered baseline
print("Original Data:")
df.show()

# Single numeric comparison
print("Filter: Salary > 6000")
df.filter(df["salary"] > 6000).show()

# Equality test on a string column
print("Filter: Department = 'IT'")
df.filter(df["department"] == "IT").show()

# Two predicates combined with & (both must hold)
earns_over_5000 = df["salary"] > 5000
works_in_hr = df["department"] == "HR"
print("Filter: Salary > 5000 and Department = 'HR'")
df.filter(earns_over_5000 & works_in_hr).show()

# The same kind of predicate written as a SQL expression string
print("Filter: Salary < 6000 using where()")
df.where("salary < 6000").show()

# Membership test against a list of allowed values
print("Filter: Department in ('IT', 'HR')")
df.filter(df["department"].isin(["IT", "HR"])).show()

# Null handling, demonstrated on a second frame that contains missing values
from pyspark.sql.functions import col

data_with_nulls = [
    ("Alice", None, 5000),
    ("Bob", "IT", 6000),
    (None, "Finance", 7000),
    ("David", "IT", 4000),
]
df_with_nulls = spark.createDataFrame(data_with_nulls, columns)

department_present = col("department").isNotNull()
print("Filter: Rows where department is not null")
df_with_nulls.filter(department_present).show()

name_missing = col("name").isNull()
print("Filter: Rows where name is null")
df_with_nulls.filter(name_missing).show()

# Stop Spark Session
# NOTE(review): on a shared or managed session this also ends it for other users of
# that session - confirm this is intended.
spark.stop()

Why Use F for Functions?
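In PySpark, most aggregation functions live in the pyspark.sql.functions module. It is common practice to import this module under the alias F (from pyspark.sql import functions as F) for two reasons:

Clarity: It distinguishes PySpark functions (e.g., F.sum) from Python built-ins (e.g., sum); importing sum directly from pyspark.sql.functions would shadow the built-in.
Readability: The short F. prefix keeps the code concise while making it obvious which calls are Spark column functions.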

In [0]:
from pyspark.sql import SparkSession
# Use the conventional F alias instead of importing avg/sum/count by name:
# a bare `sum` import would shadow Python's built-in sum() in the notebook namespace.
from pyspark.sql import functions as F

# Obtain (or reuse) the Spark session for the grouping examples
spark = (
    SparkSession.builder
    .appName("PySpark Grouping Example")
    .getOrCreate()
)

# Employee records laid out as (name, department, salary)
data = [
    ("Alice", "HR", 5000),
    ("Bob", "IT", 6000),
    ("Charlie", "Finance", 7000),
    ("David", "IT", 6000),
    ("Eve", "HR", 5500),
    ("Frank", "Finance", 8000),
]
columns = ["name", "department", "salary"]

df = spark.createDataFrame(data, columns)

# Show original data
print("Original Data:")
df.show()

# Row count per department (GroupedData shortcut method)
print("Group by Department - Count:")
df.groupBy("department").count().show()

# Salary total per department (GroupedData shortcut method)
print("Group by Department - Sum of Salaries:")
df.groupBy("department").sum("salary").show()

# Mean salary per department via agg() and a column function
print("Group by Department - Average Salary:")
df.groupBy("department").agg(F.avg("salary")).show()

# Several aggregates in a single pass, each given an explicit output name
print("Group by Department - Multiple Aggregates:")
df.groupBy("department").agg(
    F.count("name").alias("employee_count"),
    F.sum("salary").alias("total_salary"),
    F.avg("salary").alias("average_salary"),
).show()

# Stop Spark Session
spark.stop()

Wrapping Up: Why PySpark?
So, why should you love PySpark? Here’s the TL;DR:

Speed: PySpark can process huge datasets in parallel across multiple machines. Your laptop never stood a chance.
Scale: Whether you’re working with 10MB or 10TB of data, PySpark’s got your back.
Power: SQL-like queries, machine learning, real-time analytics—all rolled into one package.

In [0]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for the join examples
spark = (
    SparkSession.builder
    .appName("PySpark Join Example")
    .getOrCreate()
)

# Employees: (id, name, dept). "Finance" has no matching row in the departments table.
columns1 = ["id", "name", "dept"]
data1 = [
    (1, "Alice", "HR"),
    (2, "Bob", "IT"),
    (3, "Charlie", "Finance"),
    (4, "David", "IT"),
]
df1 = spark.createDataFrame(data1, columns1)

# Departments: (dept, dept_name). "Marketing" has no matching employee.
columns2 = ["dept", "dept_name"]
data2 = [
    ("HR", "Human Resources"),
    ("IT", "Information Technology"),
    ("Marketing", "Marketing"),
]
df2 = spark.createDataFrame(data2, columns2)

# Build the three joins on the shared "dept" column
inner_join = df1.join(df2, "dept", "inner")
left_join = df1.join(df2, "dept", "left")
outer_join = df1.join(df2, "dept", "outer")

# Print each result under its heading, in the order inner -> left -> full outer
for heading, joined in (
    ("Inner Join Result:", inner_join),
    ("Left Join Result:", left_join),
    ("Full Outer Join Result:", outer_join),
):
    print(heading)
    joined.show()

# Stop the Spark session
spark.stop()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Local-mode Spark session for the arithmetic examples
spark = (
    SparkSession.builder
    .master("local")
    .appName("Arithmetic Example")
    .getOrCreate()
)

# Rows of (id, value1, value2)
data = [(1, 10, 3), (2, 20, 5), (3, 15, 4)]
df = spark.createDataFrame(data, ["id", "value1", "value2"])

# The two operand columns, referenced once and reused below
value1 = col("value1")
value2 = col("value2")

# One derived column per arithmetic operator
df = (
    df.withColumn("sum", value1 + value2)
    .withColumn("difference", value1 - value2)
    .withColumn("product", value1 * value2)
    .withColumn("quotient", value1 / value2)
    .withColumn("remainder", value1 % value2)
)

df.show()

In [0]:
# Import the functions module under the F alias: importing abs/round by name would
# replace Python's built-in abs() and round() for every later cell in the notebook.
from pyspark.sql import functions as F

# Apply math functions to the value columns of `df`
# (expects `df` to already hold `value1` and `value2` columns).
df = (
    df.withColumn("absolute_value1", F.abs(F.col("value1")))
    .withColumn("rounded_value2", F.round(F.col("value2"), 1))  # 1 decimal place
    .withColumn("ceil_value1", F.ceil(F.col("value1")))
    .withColumn("sqrt_value2", F.sqrt(F.col("value2")))
)

df.show()

In [0]:
# Import the functions module under the F alias: importing pow by name would
# replace Python's built-in pow() for every later cell in the notebook.
from pyspark.sql import functions as F

# Square value1 and take the base-10 logarithm of value2
# (expects `df` to already hold `value1` and `value2` columns).
df = (
    df.withColumn("power_value1", F.pow(F.col("value1"), 2))
    .withColumn("log10_value2", F.log10(F.col("value2")))
)

df.show()

In [0]:
# Read a CSV file, supplying every reader setting in one options() call
df = (
    spark.read
    .options(
        header=True,              # first line holds the column names
        inferSchema=True,         # let Spark infer the column types
        nullValue="N/A",          # treat the literal string N/A as null
        dateFormat="MM/dd/yyyy",  # pattern used to parse date fields
        mode="DROPMALFORMED",     # drop rows that cannot be parsed
    )
    .csv("/path/to/your/data.csv")
)

df.show(5)


# Write the frame back out as gzip-compressed, semicolon-delimited CSV with a header row
(
    df.write
    .options(header=True, sep=";", compression="gzip")
    .csv("/path/to/compressed_data")
)

Null Values: Use nullValue to specify a string to interpret as null, such as "N/A" or "NULL".
Date Format: Set dateFormat to specify the date format for date fields.
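Mode: Determines how Spark handles corrupt records. Options include:
PERMISSIVE (default): Keeps the row, sets the fields that cannot be parsed to null, and stores the raw malformed record in a separate column (named by columnNameOfCorruptRecord).
DROPMALFORMED: Drops rows that contain malformed records.
FAILFAST: Throws an error as soon as a corrupt record is encountered.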
Example with null values, date format, and mode options:

In [0]:
Null Values: Use nullValue to specify a string to interpret as null, such as "N/A" or "NULL".
Date Format: Set dateFormat to specify the date format for date fields.
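Mode: Determines how Spark handles corrupt records. Options include:
PERMISSIVE (default): Keeps the row, sets the fields that cannot be parsed to null, and stores the raw malformed record in a separate column (named by columnNameOfCorruptRecord).
DROPMALFORMED: Drops rows that contain malformed records.
FAILFAST: Throws an error as soon as a corrupt record is encountered.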
Example with null values, date format, and mode options:
    

Header: Set header=True to write the column names as the first row.
Mode: Controls the behavior if the output file or directory already exists.
overwrite: Replaces existing data.
append: Adds data to the existing output.
ignore: Skips writing if the output already exists.
error or errorifexists (default): Throws an error if the output already exists.


Delimiter: Set sep to specify a custom delimiter.
Compression: Choose a compression method such as gzip, bzip2, lz4, snappy, or deflate.

In [0]:
# Read JSON whose records may span several lines, inferring the schema from a
# 50% sample of the input instead of scanning all of it.
df = (
    spark.read
    .option("multiLine", True)
    .option("samplingRatio", 0.5)
    .json("/path/to/your/multiline_data.json")
)

df.show(5)

# Write JSON with one sub-directory per distinct `year` value.
# The `header` option was dropped: it is a CSV setting and has no meaning for JSON
# output, where every record already carries its field names.
# NOTE(review): assumes the loaded data has a `year` column - confirm against the input.
(
    df.write
    .partitionBy("year")
    .json("/path/to/partitioned_output")
)

In [0]:
2. DataFrame Column Notation (df.colName)
You can access columns as attributes of the DataFrame directly, making the syntax cleaner and allowing complex operations.

Examples:

# Selecting columns using dot notation
df.select(df.name, df.age)

# Filtering rows based on column conditions
df.filter(df.age > 30)
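Note: Avoid this syntax for column names containing spaces or special characters, or for names that collide with DataFrame attributes or methods (e.g., a column called count); use df["colName"] or col() instead.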

When to use it: This notation is handy for accessing and transforming columns and can make code more readable.



In [0]:
3. Using col() Function
The col() function from pyspark.sql.functions is versatile and ideal when passing column names as variables or when chaining multiple column operations.

Examples:

from pyspark.sql.functions import col

# Selecting columns using col()
df.select(col("name"), col("age"))

# Filtering rows using col() for flexibility
age_column = "age"
df.filter(col(age_column) > 30)
When to use it: Use col() when dynamically referencing column names or passing them as variables. This is common in reusable code or functions.

In [0]:
4. Bracket Notation (df["colName"])
Bracket notation lets you reference columns using dictionary-style syntax. It’s flexible and frequently used for transformations and chaining.

Examples:

# Selecting columns
df.select(df["name"], df["age"])

# Filtering with expressions
df.filter(df["age"] > 30)

# Applying transformations
df.select((df["age"] + 10).alias("age_plus_10"))
When to use it: Use bracket notation when dealing with columns with spaces or special characters. It’s also helpful for complex expressions.

In [0]:
Summary
Each method of referencing columns in PySpark has unique strengths:

String Names: Great for SQL-style filtering and simple selections.
Dot Notation (df.colName): Clear for simple selections and filtering.
col() Function: Flexible for dynamic column references.
Bracket Notation (df["colName"]): Robust for complex expressions.
lit(): Ideal for working with constants alongside column data.
Using the right column notation can help make your PySpark code more readable, flexible, and SQL-friendly.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, concat, concat_ws, substring, upper, lower, initcap, 
    trim, ltrim, rtrim, regexp_replace, regexp_extract, length, 
    instr, lpad, rpad
)

# Sample customer records: (first name, last name, email, status, account id).
# Some status values carry stray leading/trailing spaces on purpose.
data = [
    ("John", "Doe", "john.doe@example.com", "   active   ", "12345"),
    ("Jane", "Smith", "jane.smith@work.org", "inactive", "67890"),
    ("Sam", "Brown", "sam.brown@data.net", "active   ", "111213")
]

# Create DataFrame
spark = SparkSession.builder.appName("StringFunctionsWithColumn").getOrCreate()
df = spark.createDataFrame(data, ["first_name", "last_name", "email", "status", "account_id"])

# Derive one new column per string function. Later steps reuse columns created by
# earlier ones (`description` reads full_name, trimmed_status and email_domain),
# so the order of the chain matters.
df_transformed = (
    df
    # "John" + " " + "Doe"
    .withColumn("full_name", concat_ws(" ", col("first_name"), col("last_name")))
    .withColumn("email_uppercase", upper(col("email")))
    # Capture everything after the "@" so the whole domain is kept ("example.com");
    # the previous pattern @(\w+) stopped at the first dot and returned only "example".
    .withColumn("email_domain", regexp_extract(col("email"), r"@(.+)$", 1))
    # Strip whitespace from both ends of the status
    .withColumn("trimmed_status", trim(col("status")))
    # Left-pad the account id with zeros to a width of 10 characters
    .withColumn("padded_account_id", lpad(col("account_id"), 10, "0"))
    # First five characters of the email (substring positions are 1-based)
    .withColumn("email_prefix", substring(col("email"), 1, 5))
    # Replace every "." and "@" with "-"
    .withColumn("cleaned_email", regexp_replace(col("email"), r"[.@]", "-"))
    .withColumn("email_length", length(col("email")))
    .withColumn("first_name_initcap", initcap(col("first_name")))
    .withColumn("description", concat_ws(" | ", col("full_name"), col("trimmed_status"), col("email_domain")))
)

# Show the resulting DataFrame without truncating long values
df_transformed.show(truncate=False)


In [0]:
# F alias avoids shadowing Python's built-in sum(); Window was previously used here
# without being imported, which raised NameError on a fresh kernel.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Daily sales per category as (date, category, sales). Defined here so the cell has
# the `category`, `date` and `sales` columns that the window below requires.
df = spark.createDataFrame(
    [
        ("2024-01-01", "A", 100),
        ("2024-01-02", "A", 150),
        ("2024-01-03", "A", 200),
        ("2024-01-01", "B", 50),
        ("2024-01-02", "B", 75),
        ("2024-01-03", "B", 100),
    ],
    ["date", "category", "sales"],
)

# Window specification for cumulative sum: within each category, ordered by date,
# covering every row from the start of the partition up to and including the current row
window_spec = (
    Window.partitionBy("category")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Apply cumulative sum
df = df.withColumn("cumulative_sales", F.sum("sales").over(window_spec))

# Show the result
df.show()

In [0]:
# Window was previously used here without being imported (NameError on a fresh kernel).
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Window specification for moving average over the last 3 rows: the two preceding
# rows plus the current one, within each category ordered by date.
# Expects `df` to already hold `category`, `date` and `sales` columns.
window_spec = (
    Window.partitionBy("category")
    .orderBy("date")
    .rowsBetween(-2, Window.currentRow)
)

# Apply moving average
df = df.withColumn("moving_avg_sales", F.avg("sales").over(window_spec))

# Show the result
df.show()




In [0]:
# F alias avoids shadowing Python's built-in sum(); Window was previously used here
# without being imported (NameError on a fresh kernel).
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Window specification excluding the current and future rows: from the start of the
# partition up to the row just before the current one (-1).
# Expects `df` to already hold `category`, `date` and `sales` columns.
window_spec = (
    Window.partitionBy("category")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Apply cumulative sum excluding current and future rows
df = df.withColumn("cumulative_sales_excluding_future", F.sum("sales").over(window_spec))

# Show the result
df.show()

In [0]:
from pyspark.sql import SparkSession
# F alias instead of importing sum/avg by name, so Python's built-in sum() is not shadowed
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Daily sales per category as (date, category, sales)
data = [
    ("2024-01-01", "A", 100),
    ("2024-01-02", "A", 150),
    ("2024-01-03", "A", 200),
    ("2024-01-01", "B", 50),
    ("2024-01-02", "B", 75),
    ("2024-01-03", "B", 100)
]

# Create DataFrame
spark = SparkSession.builder.appName("WindowFunctions").getOrCreate()
df = spark.createDataFrame(data, ["date", "category", "sales"])

# All three windows share the same partitioning and ordering; only the frame differs
by_category_by_date = Window.partitionBy("category").orderBy("date")

# 1. Cumulative Sum: start of partition through the current row
window_spec = by_category_by_date.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_cumulative_sum = df.withColumn("cumulative_sales", F.sum("sales").over(window_spec))

# 2. Moving Average (3-day window): the two preceding rows plus the current row
window_spec_avg = by_category_by_date.rowsBetween(-2, Window.currentRow)
df_moving_avg = df_cumulative_sum.withColumn("moving_avg_sales", F.avg("sales").over(window_spec_avg))

# 3. Excluding Current & Future Rows
# (column name corrected from the misspelled "..._curennt_n_future")
window_spec_excluding_future = by_category_by_date.rowsBetween(Window.unboundedPreceding, -1)
df_final = df_moving_avg.withColumn(
    "cumulative_sales_excluding_current_n_future",
    F.sum("sales").over(window_spec_excluding_future),
)

# Window.unboundedPreceding: Starts from the first row.
# -1: Ends one row before the current row.
# The first row of each category has no preceding rows, so its value is null.


# Show the result
df_final.show()

In [0]:
from pyspark.sql import SparkSession
# F alias instead of importing sum/avg/rank/... by name, so Python's built-in sum()
# is not shadowed in the notebook namespace
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Product sales as (category, product, sales). Chair and Table tie at 300, which is
# what makes rank, dense_rank and row_number differ in the output.
data = [
    ("Electronics", "Phone", 1000),
    ("Electronics", "Laptop", 1500),
    ("Electronics", "Tablet", 800),
    ("Furniture", "Chair", 300),
    ("Furniture", "Table", 300),
    ("Furniture", "Desk", 600),
]

# Create DataFrame
spark = SparkSession.builder.appName("WindowFunctions").getOrCreate()
df = spark.createDataFrame(data, ["category", "product", "sales"])

# Define window specification: per category, ascending by sales.
# No explicit frame is given, so with an ordering Spark applies its default growing
# frame (RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW): the sum and average
# below are running values, and rows with equal sales share the same result.
window_spec = Window.partitionBy("category").orderBy("sales")

# Apply window functions
df_transformed = (
    df
    .withColumn("rank", F.rank().over(window_spec))              # ties share a rank, gaps follow
    .withColumn("dense_rank", F.dense_rank().over(window_spec))  # ties share a rank, no gaps
    .withColumn("row_number", F.row_number().over(window_spec))  # unique sequence per partition
    .withColumn("cumulative_sales", F.sum("sales").over(window_spec))
    .withColumn("average_sales", F.avg("sales").over(window_spec))
)

df_transformed.show()