<a href="https://colab.research.google.com/github/Sreekar-Kandhadai/pyspark-interview-questions/blob/main/Pyspark_Date_Function_Questions_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
1. Extracting Year:
Given a column event_date (format: yyyy-MM-dd),
extract only the year for each row.
Sample data:
event_date: ["2023-04-15","2022-11-30", "2021-08-25"]

In [8]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

# One single-column row per sample event date (yyyy-MM-dd strings).
event_rows = [('2023-04-15',), ('2022-11-30',), ('2021-08-25',)]
events_raw = spark.createDataFrame(event_rows, ['event_date'])
events_raw.show()

# Parse the strings into dates first, then pull the year out of each date.
parsed_event_date = to_date(col('event_date'), 'yyyy-MM-dd')
events_typed = events_raw.withColumn("event_date", parsed_event_date)
events_with_year = events_typed.withColumn("year", year(col('event_date')))
events_with_year.show()

+----------+
|event_date|
+----------+
|2023-04-15|
|2022-11-30|
|2021-08-25|
+----------+

+----------+----+
|event_date|year|
+----------+----+
|2023-04-15|2023|
|2022-11-30|2022|
|2021-08-25|2021|
+----------+----+



In [None]:
2. Date Difference Calculation:
Calculate the difference (in days) between two date
columns start_date and end_date.
Sample data:
start_date: ["2023-01-01","2023-03-15"]
end_date: ["2023-02-01","2023-03-20"]


In [12]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

# Each row pairs a start date with its end date (yyyy-MM-dd strings).
date_ranges = spark.createDataFrame(
    [("2023-01-01", "2023-02-01"), ("2023-03-15", "2023-03-20")],
    ["start_date", "end_date"],
)
date_ranges.show()

# Argument order matters: end date first, start date second, so the
# sample rows produce positive day counts.
elapsed_days = datediff(col('end_date'), col('start_date'))
date_ranges.withColumn("days", elapsed_days).show()

+----------+----------+
|start_date|  end_date|
+----------+----------+
|2023-01-01|2023-02-01|
|2023-03-15|2023-03-20|
+----------+----------+

+----------+----------+----+
|start_date|  end_date|days|
+----------+----------+----+
|2023-01-01|2023-02-01|  31|
|2023-03-15|2023-03-20|   5|
+----------+----------+----+



In [None]:
3. Filter Records Based on Date:
Filter records where event_date is after 2023-06-01.
Sample data:
event_date: ["2023-05-15","2023-07-20","2023-06-05"]

In [15]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

event_rows = [("2023-05-15",), ("2023-07-20",), ("2023-06-05",)]
events = spark.createDataFrame(event_rows, ["event_date"])
events.show()

# SQL-expression filter: keep rows dated strictly after the cutoff.
events_after_cutoff = events.filter("event_date > '2023-06-01'")
events_after_cutoff.show()

+----------+
|event_date|
+----------+
|2023-05-15|
|2023-07-20|
|2023-06-05|
+----------+

+----------+
|event_date|
+----------+
|2023-07-20|
|2023-06-05|
+----------+



In [None]:
4. Add 30 days to each date in the order_date column.
Sample data:
order_date: ["2023-01-10", "2023-06-15", "2023-07-30"]


In [16]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

order_rows = [("2023-01-10",), ("2023-06-15",), ("2023-07-30",)]
orders = spark.createDataFrame(order_rows, ['order_date'])
orders.show()

# Shift every order date forward by 30 days into a new column,
# leaving the original column untouched.
shifted_order_date = date_add(col('order_date'), 30)
orders.withColumn('new_date', shifted_order_date).show()

+----------+
|order_date|
+----------+
|2023-01-10|
|2023-06-15|
|2023-07-30|
+----------+

+----------+----------+
|order_date|  new_date|
+----------+----------+
|2023-01-10|2023-02-09|
|2023-06-15|2023-07-15|
|2023-07-30|2023-08-29|
+----------+----------+



In [None]:
5. Find the Maximum Date:
Determine the latest date from a column
payment_date.
Sample data:
payment_date: ["2023-02-15", "2023-06-25", "2023-01-10"]

In [19]:
from pyspark.sql import SparkSession
# Import the functions module explicitly. A bare max(...) in this cell would
# resolve to Python's builtin max unless an earlier cell had already run
# `from pyspark.sql.functions import *`, so the cell must not rely on that.
from pyspark.sql import functions as F

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

payment_rows = [("2023-02-15",), ("2023-06-25",), ("2023-01-10",)]
payments = spark.createDataFrame(payment_rows, ['payment_date'])
payments.show()

# The column holds yyyy-MM-dd strings, whose string ordering matches
# chronological ordering, so the aggregate max yields the latest date.
payments.select(F.max('payment_date')).show()

+------------+
|payment_date|
+------------+
|  2023-02-15|
|  2023-06-25|
|  2023-01-10|
+------------+

+-----------------+
|max(payment_date)|
+-----------------+
|       2023-06-25|
+-----------------+



In [None]:
6. Truncate Date to First Day of Month:
Truncate the sale_date to the first day of its respective
month.
Sample data:
sale_date: ["2023-04-12","2023-07-23","2023-08-05"]

In [26]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

sale_rows = [("2023-04-12",), ("2023-07-23",), ("2023-08-05",)]
sales = spark.createDataFrame(sale_rows, ["sale_date"])
sales.show()

# Truncating at "month" granularity snaps each sale date back to the
# first day of its month.
month_start = trunc(col('sale_date'), "month")
sales.withColumn("new_date", month_start).show()





+----------+
| sale_date|
+----------+
|2023-04-12|
|2023-07-23|
|2023-08-05|
+----------+

+----------+----------+
| sale_date|  new_date|
+----------+----------+
|2023-04-12|2023-04-01|
|2023-07-23|2023-07-01|
|2023-08-05|2023-08-01|
+----------+----------+



In [None]:
7. Group by Year:
Group records by year extracted from the column
transaction_date.
Sample data:
transaction_date: ["2023-06-12", "2022-11-09","2021-04-01"]

In [27]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
transaction_rows = [("2023-06-12",), ("2022-11-09",), ("2021-04-01",)]
transactions = spark.createDataFrame(transaction_rows, ['transaction_date'])
transactions.show()

# Derive the year of each transaction, then count the rows sharing that year.
transactions_with_year = transactions.withColumn("year", year(col('transaction_date')))
rows_per_year = transactions_with_year.groupBy('year').agg(count("*").alias('count'))
rows_per_year.show()

+----------------+
|transaction_date|
+----------------+
|      2023-06-12|
|      2022-11-09|
|      2021-04-01|
+----------------+

+----+-----+
|year|count|
+----+-----+
|2023|    1|
|2022|    1|
|2021|    1|
+----+-----+



In [None]:
8. Filter Records Within a Date Range:
Filter records where visit_date is between 2023-01-01 and 2023-05-01.
Sample data:
visit_date: ["2023-02-15","2023-06-01", "2023-03-20"]

In [28]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

visit_rows = [("2023-02-15",), ("2023-06-01",), ("2023-03-20",)]
visits = spark.createDataFrame(visit_rows, ['visit_date'])
visits.show()

# Build the range predicate once, then keep only the visits that satisfy it.
visit_in_range = col('visit_date').between('2023-01-01', '2023-05-01')
visits.filter(visit_in_range).show()

+----------+
|visit_date|
+----------+
|2023-02-15|
|2023-06-01|
|2023-03-20|
+----------+

+----------+
|visit_date|
+----------+
|2023-02-15|
|2023-03-20|
+----------+



In [None]:
9. Day of Week Extraction:
Extract the day of the week from attendance_date.
Sample data:
attendance_date: ["2023-08-11", "2023-07-25", "2023-09-01"]

In [29]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
attendance_rows = [("2023-08-11",), ("2023-07-25",), ("2023-09-01",)]
attendance = spark.createDataFrame(attendance_rows, ['attendance_date'])
attendance.show()

# Two views of the weekday: its number, and its full name via the 'EEEE'
# pattern. In the sample output Friday maps to 6 and Tuesday to 3.
weekday_number = dayofweek(col('attendance_date'))
weekday_name = date_format(col('attendance_date'), 'EEEE')

attendance_with_weekday = (
    attendance
    .withColumn('day_of_week_num', weekday_number)
    .withColumn('day_of_week_name', weekday_name)
)
attendance_with_weekday.show()





+---------------+
|attendance_date|
+---------------+
|     2023-08-11|
|     2023-07-25|
|     2023-09-01|
+---------------+

+---------------+---------------+----------------+
|attendance_date|day_of_week_num|day_of_week_name|
+---------------+---------------+----------------+
|     2023-08-11|              6|          Friday|
|     2023-07-25|              3|         Tuesday|
|     2023-09-01|              6|          Friday|
+---------------+---------------+----------------+



In [None]:
10. Check Leap Year:
Identify if each date in birth_date falls in a leap year.
Sample data:
birth_date: ["2020-03-01","2019-12-15", "2024-02-29"]


In [30]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
birth_rows = [("2020-03-01",), ("2019-12-15",), ("2024-02-29",)]
births = spark.createDataFrame(birth_rows, ['birth_date'])
births.show()

# Leap-year rule, spelled out: the year must be divisible by 4, and must
# additionally be either divisible by 400 or not divisible by 100.
birth_year = year(col('birth_date'))
divisible_by_4 = birth_year % 4 == 0
divisible_by_400 = birth_year % 400 == 0
not_a_century = birth_year % 100 != 0
is_leap_year = divisible_by_4 & (divisible_by_400 | not_a_century)

births_flagged = births.withColumn(
    "leap_year",
    when(is_leap_year, lit('yes')).otherwise('No'),
)
births_flagged.show()

+----------+
|birth_date|
+----------+
|2020-03-01|
|2019-12-15|
|2024-02-29|
+----------+

+----------+---------+
|birth_date|leap_year|
+----------+---------+
|2020-03-01|      yes|
|2019-12-15|       No|
|2024-02-29|      yes|
+----------+---------+



In [None]:
11. Convert String to Date:
Convert a string column arrival_time (format: dd-MM-yyyy) to date format.
Sample data:
arrival_time: ["15-04-2023", "20-08-2023", "01-12-2023"]

In [33]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
arrival_rows = [("15-04-2023",), ("20-08-2023",), ("01-12-2023",)]
arrivals_raw = spark.createDataFrame(arrival_rows, ["arrival_time"])
arrivals_raw.show()

# Schema before conversion: arrival_time is a plain string column.
arrivals_raw.printSchema()

# The input is day-month-year with a four-digit year, so the pattern must be
# 'dd-MM-yyyy'. The previous three-letter 'yyy' was a typo that did not match
# the documented layout of the data.
arrivals = arrivals_raw.withColumn(
    "arrival_time",
    to_date(col("arrival_time"), 'dd-MM-yyyy'),
)

# Schema after conversion: arrival_time is now a date column.
arrivals.printSchema()

+------------+
|arrival_time|
+------------+
|  15-04-2023|
|  20-08-2023|
|  01-12-2023|
+------------+

root
 |-- arrival_time: string (nullable = true)

root
 |-- arrival_time: date (nullable = true)



In [None]:
12. Calculate Week Number:
For each date in shipment_date, calculate the week
number of the year.
Sample data:
shipment_date: ["2023-02-15", "2023-08-01", "2023-12-25"]

In [37]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
shipment_rows = [("2023-02-15",), ("2023-08-01",), ("2023-12-25",)]
shipments = spark.createDataFrame(shipment_rows, ["shipment_date"])
shipments.show()

# Attach the week-of-year number of each shipment date as a new column.
week_of_shipment = weekofyear(col('shipment_date'))
shipments_with_week = shipments.withColumn("week_number", week_of_shipment)
shipments_with_week.show()


+-------------+
|shipment_date|
+-------------+
|   2023-02-15|
|   2023-08-01|
|   2023-12-25|
+-------------+

+-------------+-----------+
|shipment_date|week_number|
+-------------+-----------+
|   2023-02-15|          7|
|   2023-08-01|         31|
|   2023-12-25|         52|
+-------------+-----------+



In [None]:
13. Find Records from the Last 7 Days:
Identify all records where log_date is within the last
7 days from the current date.
Sample data:
log_date: ["2023-08-08","2023-08-11", "2023-08-15"]

In [50]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# NOTE: this cell uses the `spark` session created by an earlier cell.
# The filter below compares against current_date(), so which of these fixed
# sample rows survive depends on the day the cell is executed.
log_rows = [("2025-02-17",), ("2025-02-16",), ("2025-02-07",)]
logs = spark.createDataFrame(log_rows, ["log_date"])
logs.show()

# Age of each log entry in days, counted back from today.
days_ago = datediff(current_date(), col('log_date'))

# Require 0 <= age <= 7. An upper bound alone would also let through
# future-dated rows, whose age is negative, even though they are not part of
# "the last 7 days".
recent_logs = logs.filter(days_ago.between(0, 7))
recent_logs.show()

+----------+
|  log_date|
+----------+
|2025-02-17|
|2025-02-16|
|2025-02-07|
+----------+

+----------+
|  log_date|
+----------+
|2025-02-17|
|2025-02-16|
+----------+



In [None]:
14. Format Date as String:
Format the booking_date as dd/MM/yyyy.
Sample data:
booking_date: ["2023-07-12", "2023-09-15", "2023-05-30"]


In [55]:
from pyspark.sql import SparkSession
# This cell uses col() and date_format(); import the functions module here
# instead of depending on a wildcard import executed by an earlier cell.
from pyspark.sql import functions as F

# Reuse the active Spark session if one exists; otherwise start one.
spark = SparkSession.builder.appName("learning").getOrCreate()

data = [("2023-07-12",), ("2023-09-15",), ("2023-05-30",)]
schema = ["booking_date"]

df = spark.createDataFrame(data, schema)
df.show()

# date_format already yields a string column in the requested dd/MM/yyyy
# layout, so no additional cast to string is needed afterwards.
df1 = df.withColumn("booking_date", F.date_format(F.col("booking_date"), 'dd/MM/yyyy'))
df1.show()

+------------+
|booking_date|
+------------+
|  2023-07-12|
|  2023-09-15|
|  2023-05-30|
+------------+

+------------+
|booking_date|
+------------+
|  12/07/2023|
|  15/09/2023|
|  30/05/2023|
+------------+

