<pre>Problem Statement
You are given a PySpark DataFrame containing daily sales records. However, some dates are missing. Write a PySpark program to identify all the missing dates within the given range of dates.

Sample Input (DataFrame sales)
date	sales
2025-01-01	100
2025-01-02	150
2025-01-04	120
2025-01-06	200
Expected Output
missing_date
2025-01-03
2025-01-05<pre>

In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime
from pyspark.sql.functions import col,to_date,aggregate,count, row_number, lag, datediff,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [5]:
spark = SparkSession.builder.appName("Daily-Coding-Day-1").getOrCreate()

In [6]:
data = [['2025-01-01',100],
['2025-01-02',150],
['2025-01-04',120],
['2025-01-06',200]]

In [7]:
schema = ['Date','sales']

In [8]:
df = spark.createDataFrame(data=data,schema=schema)

In [10]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- sales: long (nullable = true)



In [16]:
df = df.withColumn('Date', to_date(col('Date'), "yyyy-MM-dd"))

In [37]:
df_dates = df.agg(F.min('date').alias('min_date'),F.max('date').alias('max_date'))
df_dates.show()

+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2025-01-01|2025-01-06|
+----------+----------+



In [41]:
all_dates = df_dates.select(F.expr("explode(sequence(min_date, max_date, interval 1 day)) as Date"))
all_dates.show()

+----------+
|      Date|
+----------+
|2025-01-01|
|2025-01-02|
|2025-01-03|
|2025-01-04|
|2025-01-05|
|2025-01-06|
+----------+



In [43]:
missing = all_dates.join(df,on='Date',how='left_anti')
missing.show()

+----------+
|      Date|
+----------+
|2025-01-05|
|2025-01-03|
+----------+



<pre>
You are given a SQL table user_logins(user_id, login_date). Write a SQL query to find the longest streak of consecutive login days for each user.

Sample Input (user_logins)
user_id	login_date
1	2025-01-01
1	2025-01-02
1	2025-01-04
2	2025-01-01
2	2025-01-02
2	2025-01-03
2	2025-01-05
Expected Output
user_id	longest_streak
1	2
2	3</pre>






In [61]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number, lag, datediff
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("ConsecutiveLoginStreak").getOrCreate()

data = [
    (1, "2025-01-01"),
    (1, "2025-01-02"),
    (1, "2025-01-04"),
    (2, "2025-01-01"),
    (2, "2025-01-02"),
    (2, "2025-01-03"),
    (2, "2025-01-05")]
columns = ["user_id", "login_date"]

df = spark.createDataFrame(data, columns)

df = df.withColumn("login_date", col("login_date").cast("date"))



In [62]:
window_spec = Window.partitionBy('user_id').orderBy('login_date')
df = df.withColumn("row_number", row_number().over(window_spec))
df = df.withColumn("prev_login_date", lag("login_date").over(window_spec))
df.show()

+-------+----------+----------+---------------+
|user_id|login_date|row_number|prev_login_date|
+-------+----------+----------+---------------+
|      1|2025-01-01|         1|           NULL|
|      1|2025-01-02|         2|     2025-01-01|
|      1|2025-01-04|         3|     2025-01-02|
|      2|2025-01-01|         1|           NULL|
|      2|2025-01-02|         2|     2025-01-01|
|      2|2025-01-03|         3|     2025-01-02|
|      2|2025-01-05|         4|     2025-01-03|
+-------+----------+----------+---------------+



In [63]:
df = df.withColumn("date_diff", datediff(col("login_date"), col("prev_login_date")))
df.show()

+-------+----------+----------+---------------+---------+
|user_id|login_date|row_number|prev_login_date|date_diff|
+-------+----------+----------+---------------+---------+
|      1|2025-01-01|         1|           NULL|     NULL|
|      1|2025-01-02|         2|     2025-01-01|        1|
|      1|2025-01-04|         3|     2025-01-02|        2|
|      2|2025-01-01|         1|           NULL|     NULL|
|      2|2025-01-02|         2|     2025-01-01|        1|
|      2|2025-01-03|         3|     2025-01-02|        1|
|      2|2025-01-05|         4|     2025-01-03|        2|
+-------+----------+----------+---------------+---------+



In [65]:
df = df.withColumn("streak",when(col("date_diff") > 1, 1).otherwise(0))
df.show()

+-------+----------+----------+---------------+---------+------+
|user_id|login_date|row_number|prev_login_date|date_diff|streak|
+-------+----------+----------+---------------+---------+------+
|      1|2025-01-01|         1|           NULL|     NULL|     0|
|      1|2025-01-02|         2|     2025-01-01|        1|     0|
|      1|2025-01-04|         3|     2025-01-02|        2|     1|
|      2|2025-01-01|         1|           NULL|     NULL|     0|
|      2|2025-01-02|         2|     2025-01-01|        1|     0|
|      2|2025-01-03|         3|     2025-01-02|        1|     0|
|      2|2025-01-05|         4|     2025-01-03|        2|     1|
+-------+----------+----------+---------------+---------+------+



In [70]:
df_streaks = df.groupBy("user_id", "streak").agg(count('streak').alias("streak_length"))
df_streaks.show()

+-------+------+-------------+
|user_id|streak|streak_length|
+-------+------+-------------+
|      1|     0|            2|
|      1|     1|            1|
|      2|     0|            3|
|      2|     1|            1|
+-------+------+-------------+



In [76]:
df_longest_streak = df_streaks.groupBy("user_id").agg(F.max(col("streak_length")).alias("longest_streak"))
df_longest_streak.show()

+-------+--------------+
|user_id|longest_streak|
+-------+--------------+
|      1|             2|
|      2|             3|
+-------+--------------+

