<a href="https://colab.research.google.com/github/Sreekar-Kandhadai/pyspark-interview-questions/blob/main/Goldman_Sachs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
The problem is to calculate the minimum number of platforms required at a train station based on the given arrival_times and departure_times.

Problem Breakdown:
We need to merge both arrival_time and departure_time into a unified dataset.
We'll use a window function to track how many platforms are required at each point in time.
For each train arrival, we'll add a platform (+1) and for each train departure, we'll subtract a platform (-1).

Finally, we will calculate the maximum number of platforms required at any point in time during the day.

In [12]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

from pyspark.sql.window import Window


spark=SparkSession.builder.appName("learning").getOrCreate()

arrivals_data = [
    (1, '2024-11-17 08:00'),
    (2, '2024-11-17 08:05'),
    (3, '2024-11-17 08:05'),
    (4, '2024-11-17 08:10'),
    (5, '2024-11-17 08:10'),
    (6, '2024-11-17 12:15'),
    (7, '2024-11-17 12:20'),
    (8, '2024-11-17 12:25'),
    (9, '2024-11-17 15:00'),
    (10, '2024-11-17 15:00'),
    (11, '2024-11-17 15:00'),
    (12, '2024-11-17 15:06'),
    (13, '2024-11-17 20:00'),
    (14, '2024-11-17 20:10')
]

departures_data = [
    (1, '2024-11-17 08:15'),
    (2, '2024-11-17 08:10'),
    (3, '2024-11-17 08:20'),
    (4, '2024-11-17 08:25'),
    (5, '2024-11-17 08:20'),
    (6, '2024-11-17 13:00'),
    (7, '2024-11-17 12:25'),
    (8, '2024-11-17 12:30'),
    (9, '2024-11-17 15:05'),
    (10, '2024-11-17 15:10'),
    (11, '2024-11-17 15:15'),
    (12, '2024-11-17 15:15'),
    (13, '2024-11-17 20:15'),
    (14, '2024-11-17 20:15')
]

# Define schema for the data
arrival_schema = ['train_id', 'arrival_time']
departure_schema = ['train_id', 'departure_time']

arrival_df=spark.createDataFrame(arrivals_data,arrival_schema).withColumn("event_type",lit(1)).withColumnRenamed("arrival_time","event_time")\
.select("train_id","event_time","event_type")

arrival_df.show()

departure_df=spark.createDataFrame(departures_data,departure_schema).withColumn("event_type",lit(-1)).withColumnRenamed("departure_time","event_time")\
.select("train_id","event_time","event_type")

departure_df.show()

df1=arrival_df.union(departure_df)

window_spec=Window.orderBy("event_time")

df2=df1.withColumn("platforms_needed",sum('event_type').over(window_spec))

df2.show()

df3=df2.select(max('platforms_needed')).collect()[0][0]

print(f"the minimum platforms required are {df3}")

+--------+----------------+----------+
|train_id|      event_time|event_type|
+--------+----------------+----------+
|       1|2024-11-17 08:00|         1|
|       2|2024-11-17 08:05|         1|
|       3|2024-11-17 08:05|         1|
|       4|2024-11-17 08:10|         1|
|       5|2024-11-17 08:10|         1|
|       6|2024-11-17 12:15|         1|
|       7|2024-11-17 12:20|         1|
|       8|2024-11-17 12:25|         1|
|       9|2024-11-17 15:00|         1|
|      10|2024-11-17 15:00|         1|
|      11|2024-11-17 15:00|         1|
|      12|2024-11-17 15:06|         1|
|      13|2024-11-17 20:00|         1|
|      14|2024-11-17 20:10|         1|
+--------+----------------+----------+

+--------+----------------+----------+
|train_id|      event_time|event_type|
+--------+----------------+----------+
|       1|2024-11-17 08:15|        -1|
|       2|2024-11-17 08:10|        -1|
|       3|2024-11-17 08:20|        -1|
|       4|2024-11-17 08:25|        -1|
|       5|2024-11-17 08: