In [1]:
pip install pyspark



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Start (or reuse) the Spark session that every cell below relies on.
spark = (
    SparkSession.builder
    .appName("Day 8")
    .getOrCreate()
)

Problem 1: PySpark – Calculate Rolling 3-Day Average of Sales

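You have a PySpark DataFrame containing daily sales data, with one row per date. Write a PySpark program that calculates, for each date, the rolling 3-day average of sales (the current day plus the two preceding days, ordered by the date column). The first two dates average over only the days available so far.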



In [5]:
# Problem 1 input: one (ISO date string, sales amount) row per day.
columns = ["sale_date", "sales"]

data = [
    ("2025-01-01", 100),
    ("2025-01-02", 200),
    ("2025-01-03", 300),
    ("2025-01-04", 400),
    ("2025-01-05", 500),
]

df = spark.createDataFrame(data, schema=columns)

In [6]:
# Print the current contents of df as a text table for a quick visual check.
df.show()

+----------+-----+
| sale_date|sales|
+----------+-----+
|2025-01-01|  100|
|2025-01-02|  200|
|2025-01-03|  300|
|2025-01-04|  400|
|2025-01-05|  500|
+----------+-----+



In [9]:
 # Trailing window: the current row plus the two rows before it, in sale_date
 # order. This is row-based, so it equals a 3-day window only while there is
 # exactly one row per consecutive date. With no partitionBy, Spark evaluates
 # the window over the whole frame in a single partition.
 window = Window.orderBy("sale_date").rowsBetween(-2, Window.currentRow)

 # Keep the DataFrame itself in `result`: show() only prints and returns None,
 # so chaining it onto the assignment would leave `result` unusable afterwards.
 result = (
     df.withColumn("rolling_3_day_avg", avg("sales").over(window))
     .select("sale_date", "sales", "rolling_3_day_avg")
 )
 result.show()

+----------+-----+-----------------+
| sale_date|sales|rolling_3_day_avg|
+----------+-----+-----------------+
|2025-01-01|  100|            100.0|
|2025-01-02|  200|            150.0|
|2025-01-03|  300|            200.0|
|2025-01-04|  400|            300.0|
|2025-01-05|  500|            400.0|
+----------+-----+-----------------+



Problem 2: SQL – Find Customers with Increasing Purchase Amounts

You have a SQL table purchases(customer_id, purchase_date, amount). Write a query to find customers whose purchase amounts strictly increased with each new purchase date.

In [16]:
# Problem 2 input: (customer id, ISO date string, purchase amount) records.
columns = ["customer_id", "purchase_date", "amount"]

data = [
    ("C1", "2025-01-01", 100),
    ("C1", "2025-01-05", 200),
    ("C1", "2025-01-10", 300),
    ("C2", "2025-01-02", 150),
    ("C2", "2025-01-06", 120),
    ("C3", "2025-01-03", 200),
    ("C3", "2025-01-09", 250),
]

# Build the frame and parse purchase_date from string into a date in one chain.
df = (
    spark.createDataFrame(data, schema=columns)
    .withColumn("purchase_date", to_date("purchase_date"))
)

# Register the frame as the `purchases` table queried by the SQL cell.
df.createOrReplaceTempView("purchases")
df.show()

+-----------+-------------+------+
|customer_id|purchase_date|amount|
+-----------+-------------+------+
|         C1|   2025-01-01|   100|
|         C1|   2025-01-05|   200|
|         C1|   2025-01-10|   300|
|         C2|   2025-01-02|   150|
|         C2|   2025-01-06|   120|
|         C3|   2025-01-03|   200|
|         C3|   2025-01-09|   250|
+-----------+-------------+------+



In [26]:
# Customers whose amounts strictly increase from one purchase date to the next.
#   * The inner query pairs each purchase with the same customer's previous
#     amount (lag over purchase_date order); the first purchase gets NULL.
#   * The outer query counts rows where the amount failed to rise
#     (amount <= prev_amount) and keeps customers with zero such rows.
#     A NULL prev_amount never satisfies the comparison, so first purchases
#     are not counted as violations.
increasing_customers_sql = """
    select customer_id
    from
    (
    select customer_id,amount,
    lag(amount) over(partition by customer_id order by purchase_date) prev_amount
    from purchases
    )
    group by customer_id
    having sum(
      case
          when amount<=prev_amount then 1
          else 0
      end
    )=0;
    """

spark.sql(increasing_customers_sql).show()

+-----------+
|customer_id|
+-----------+
|         C1|
|         C3|
+-----------+

