In [31]:
pip install pyspark



In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *

Problem 1: PySpark – Detect Missing Dates in a Time Series


You have daily sales data, but some dates are missing. Find every date between the earliest and latest recorded dates that has no sales row.

In [33]:
# This is the first cell that needs `spark`; create (or reuse) the session here
# so the notebook runs top to bottom on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

# Sample daily sales as (ISO date string, amount); 2025-01-03 and 2025-01-05
# are intentionally absent.
data = [("2025-01-01", 100),
        ("2025-01-02", 150),
        ("2025-01-04", 120),
        ("2025-01-06", 200)]
columns = ["date", "sales"]

# Load the rows, then parse the string column into a real date column so it
# can be used with date functions such as sequence().
sales = (
    spark.createDataFrame(data, columns)
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
)

In [34]:
# Build the full calendar between the earliest and latest sale dates.
# `min`/`max` are the pyspark.sql.functions aggregates (star-imported at the
# top of the notebook), not the Python builtins. Both bounds are computed in a
# single aggregation and stay inside Spark, so nothing is collected to the
# driver; if `sales` is empty the bounds are null and explode() yields no rows.
full_dates = (
    sales.agg(min("date").alias("start"), max("date").alias("end"))
    .select(explode(sequence(col("start"), col("end"))).alias("date"))
)

# Missing dates: calendar days that have no matching row in `sales`.
missing_dates = full_dates.join(sales, on="date", how="left_anti")

missing_dates.show()

+----------+
|      date|
+----------+
|2025-01-05|
|2025-01-03|
+----------+



Problem 2: SQL – Find the Longest Streak of Active Days per User

You are given a SQL table user_logins(user_id, login_date). Write a SQL query to find the longest streak of consecutive login days for each user.

In [35]:
# Sample login history as (user_id, ISO-formatted login date) pairs.
data = [
    ("u1", "2025-08-01"), ("u1", "2025-08-02"), ("u1", "2025-08-03"),
    ("u1", "2025-08-05"),
    ("u2", "2025-08-01"), ("u2", "2025-08-03"), ("u2", "2025-08-04"),
    ("u3", "2025-08-01"), ("u3", "2025-08-02"), ("u3", "2025-08-04"),
    ("u3", "2025-08-05"), ("u3", "2025-08-06"),
]

# Both columns are loaded as nullable strings; the date is parsed afterwards.
schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("login_date", StringType(), True)
)

user_logins_df = spark.createDataFrame(data, schema).withColumn(
    "login_date", to_date(col("login_date"), "yyyy-MM-dd")
)

# Register the frame so Spark SQL can query it as `user_logins`.
user_logins_df.createOrReplaceTempView("user_logins")

In [36]:
# Longest run of consecutive login days per user (gaps-and-islands):
#   daily   - one row per user per day. Repeated logins on the same date would
#             otherwise advance row_number() without advancing the date, which
#             splits one streak into several groups and miscounts its length.
#   ordered - number each user's days chronologically.
#   grouped - inside a run of consecutive days the date and the row number
#             advance together, so (login_date - rn) is constant for the run.
#   streak  - size of each run; the final select keeps the largest per user.
# The statement has no trailing ';' so it also parses on Spark versions whose
# parser rejects one.
result = spark.sql(
    """
    with daily as (
      select distinct user_id, login_date
      from user_logins
    ),
    ordered as (
      select user_id, login_date,
      row_number() over(partition by user_id order by login_date) as rn
      from daily
    ),
    grouped as (
      select user_id, login_date,
      date_sub(login_date, rn) as grp
      from ordered
    ),
    streak as (
      select user_id, count(*) as streak_length from grouped
      group by user_id, grp
    )

    select user_id, max(streak_length) as longest_streak from streak
    group by user_id order by user_id
    """
)
result.show()

+-------+--------------+
|user_id|longest_streak|
+-------+--------------+
|     u1|             3|
|     u2|             2|
|     u3|             3|
+-------+--------------+



Problem 3: PySpark – Find the Top-Selling Product per Day



You are given a PySpark DataFrame sales containing daily product sales. Write a PySpark program to find the top-selling product for each day based on total sales amount.



In [37]:
# Column names for the per-product daily sales rows below.
columns = ["date", "product", "sales_amount"]

# Two days of sales for products A, B and C.
data = [
    ("2025-02-01", "A", 100), ("2025-02-01", "B", 200), ("2025-02-01", "C", 150),
    ("2025-02-02", "A", 180), ("2025-02-02", "B", 120), ("2025-02-02", "C", 220),
]

# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Note: `date` stays a plain string column in this frame.
sales = spark.createDataFrame(data, schema=columns)

In [38]:
  # Total sales amount per product per day.
  daily_sales = (
      sales
      .groupBy("date", "product")
      .agg(sum("sales_amount").alias("sales_amount"))
  )

  # Order the products of each date from highest to lowest total.
  window = Window.partitionBy("date").orderBy(col("sales_amount").desc())

  # Keep the rank-1 rows; rank() gives tied totals the same rank, so products
  # tied for first place on a date are all returned.
  ranked_sales = daily_sales.withColumn("rank", rank().over(window))
  top_selling_product = (
      ranked_sales
      .where(col("rank") == 1)
      .select("date", "product", "sales_amount")
  )
  top_selling_product.show()

+----------+-------+------------+
|      date|product|sales_amount|
+----------+-------+------------+
|2025-02-01|      B|         200|
|2025-02-02|      C|         220|
+----------+-------+------------+



Problem 4: SQL – Find Users Who Purchase for N Consecutive Days

Given a table purchases(user_id, purchase_date), write a SQL query to find all users who purchased items for at least 3 consecutive days.

In [39]:
# Sample purchase history as (user_id, ISO-formatted purchase date) pairs.
data = [
    ("u1", "2025-08-01"), ("u1", "2025-08-02"), ("u1", "2025-08-03"),
    ("u1", "2025-08-05"),
    ("u2", "2025-08-01"), ("u2", "2025-08-04"), ("u2", "2025-08-05"),
    ("u2", "2025-08-06"),
]

# Every column is read as a nullable string; the date is parsed afterwards.
schema = StructType(
    [StructField(name, StringType(), True) for name in ("user_id", "purchase_date")]
)

purchases_df = spark.createDataFrame(data, schema).withColumn(
    "purchase_date", to_date(col("purchase_date"), "yyyy-MM-dd")
)

# Register the frame so Spark SQL can query it as `purchases`.
purchases_df.createOrReplaceTempView("purchases")

In [40]:
# Minimum number of consecutive calendar days a user must have purchased on.
MIN_CONSECUTIVE_DAYS = 3

# Users with at least MIN_CONSECUTIVE_DAYS consecutive purchase days
# (gaps-and-islands):
#   daily   - one row per user per day. Several purchases on the same date
#             would otherwise advance row_number() without advancing the date,
#             breaking a real streak into shorter groups.
#   ordered - number each user's purchase days chronologically.
#   grouped - inside a run of consecutive days (purchase_date - rn) is constant.
#   streaks - length of each run.
# The ":d" format spec only accepts an integer, so nothing but a number can be
# interpolated into the SQL text.
result = spark.sql(f"""
    with daily as (
      select distinct user_id, purchase_date
      from purchases
    ),
    ordered as (
      select user_id, purchase_date,
      row_number() over(partition by user_id order by purchase_date) as rn
      from daily
    ),
    grouped as (
      select user_id, purchase_date,
      date_sub(purchase_date, rn) as grp
      from ordered
    ),
    streaks as (
      select user_id, count(*) as streak_length from grouped
      group by user_id, grp
    )

    select distinct user_id
    from streaks
    where streak_length >= {MIN_CONSECUTIVE_DAYS:d}
    order by user_id
""")
result.show()

+-------+
|user_id|
+-------+
|     u1|
|     u2|
+-------+

