In [1]:
pip install pyspark



In [2]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below, or reuse the one that
# already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("Day 7")
    .getOrCreate()
)

Problem 1: PySpark – Calculate Session Gaps for Users


You have a PySpark DataFrame with user session information. Each row represents a session start date for a user. Write a PySpark program to calculate the gap in days between consecutive sessions for each user.



In [3]:
# Sample session log: one (user_id, session start date) pair per row.
columns = ["user_id", "session_date"]
data = [
    ("U1", "2025-01-01"), ("U1", "2025-01-05"), ("U1", "2025-01-10"),
    ("U2", "2025-01-02"), ("U2", "2025-01-04"),
]

# The dates are supplied as strings; to_date() converts the column in place
# (same column name) before the frame is displayed.
df = (
    spark.createDataFrame(data, schema=columns)
    .withColumn("session_date", to_date(col("session_date")))
)
df.show()

+-------+------------+
|user_id|session_date|
+-------+------------+
|     U1|  2025-01-01|
|     U1|  2025-01-05|
|     U1|  2025-01-10|
|     U2|  2025-01-02|
|     U2|  2025-01-04|
+-------+------------+



In [4]:
# Per-user window, ordered chronologically, so lag() returns the previous
# session of the same user.
# NOTE: the name `window` shadows the `window` function brought in by
# `from pyspark.sql.functions import *`; it is kept for compatibility.
window = Window.partitionBy("user_id").orderBy("session_date")

# Gap in days between each session and the user's previous one. The first
# session of every user has no predecessor, so its gap is NULL.
# The DataFrame is assigned BEFORE calling .show(): .show() returns None, so
# chaining it onto the assignment would leave `df_with_date` bound to None.
df_with_date = df.withColumn(
    "days_since_last_session",
    datediff("session_date", lag("session_date").over(window)),
)
df_with_date.show()

+-------+------------+-----------------------+
|user_id|session_date|days_since_last_session|
+-------+------------+-----------------------+
|     U1|  2025-01-01|                   NULL|
|     U1|  2025-01-05|                      4|
|     U1|  2025-01-10|                      5|
|     U2|  2025-01-02|                   NULL|
|     U2|  2025-01-04|                      2|
+-------+------------+-----------------------+



Problem 2: SQL – Find Customers with Only One Purchase


You have a SQL table purchases(customer_id, purchase_date, amount) representing customer purchases. Write a SQL query to find all customers who made exactly one purchase.



In [8]:
# Sample purchase log: (customer_id, purchase_date, amount) per row.
columns = ["customer_id", "purchase_date", "amount"]
data = [
    ("C1", "2025-01-01", 200), ("C1", "2025-01-10", 300),
    ("C2", "2025-01-05", 150),
    ("C3", "2025-01-02", 100), ("C3", "2025-01-04", 200),
]

# Build the DataFrame and register it as the temp view "purchases" so it can
# be queried with spark.sql().
df = spark.createDataFrame(data, schema=columns)
df.createOrReplaceTempView("purchases")

In [7]:
# Customers with exactly one purchase: group the "purchases" view by customer
# and keep only the groups that contain a single row.
single_purchase_query = """
    select customer_id from purchases
    group by customer_id
    having count(*)=1
    """
spark.sql(single_purchase_query).show()

+-----------+
|customer_id|
+-----------+
|         C2|
+-----------+

