In [1]:
from pyspark.sql import SparkSession

# Configure the session builder with the application name used by this notebook,
# then obtain the session (getOrCreate reuses an already-running session if any).
session_builder = SparkSession.builder.appName("optimization")
spark = session_builder.getOrCreate()

# Bare last expression: lets the notebook render the session summary.
spark

In [2]:
# Load the flight schedule CSV: the first line is treated as the header row and
# column types are inferred from the data rather than declared up front.
# NOTE(review): in the captured output scheduledDepartureTime carries a
# 2025-04-23 date and scheduledArrivalTime is NULL -- this looks like a side
# effect of type inference on time-only values; confirm against the raw CSV.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("Flight_Schedule.csv")

# Print a preview of the loaded rows.
df.show()

+------------+-------+---------+-----------+--------------------+----------------------+--------------------+----------+----------+
|flightNumber|airline|   origin|destination|           dayOfWeek|scheduledDepartureTime|scheduledArrivalTime| validFrom|   validTo|
+------------+-------+---------+-----------+--------------------+----------------------+--------------------+----------+----------+
|         425|  GoAir|    Delhi|  Hyderabad|Sunday,Monday,Tue...|   2025-04-23 05:45:00|                NULL|28-10-2018|30-03-2019|
|         423|  GoAir|    Delhi|  Hyderabad|            Saturday|   2025-04-23 07:30:00|                NULL|28-10-2018|28-10-2018|
|         423|  GoAir|    Delhi|  Hyderabad|              Friday|   2025-04-23 07:30:00|                NULL|03-11-2018|01-12-2018|
|         423|  GoAir|    Delhi|  Hyderabad|              Friday|   2025-04-23 07:30:00|                NULL|02-02-2019|30-03-2019|
|         423|  GoAir|    Delhi|  Hyderabad|Sunday,Monday,Tue...|   2025-04-

In [6]:
from pyspark.sql.functions import dayofmonth, year, month, to_date, col

# Parse the "dd-MM-yyyy" validity strings once; every derived column below is
# computed from these two date expressions.
valid_from = to_date(col("validFrom"), "dd-MM-yyyy")
valid_to = to_date(col("validTo"), "dd-MM-yyyy")

# Break each validity date into day / year / month columns.
# Despite their names, validFromDate and validToDate hold the day-of-month
# (an integer), not a full date.
splitData = (
    df
    .withColumn("validFromDate", dayofmonth(valid_from))
    .withColumn("validFromYear", year(valid_from))
    .withColumn("validFromMonth", month(valid_from))
    .withColumn("validToDate", dayofmonth(valid_to))
    .withColumn("validToYear", year(valid_to))
    .withColumn("validToMonth", month(valid_to))
)

# Columns to preview, in display order.
preview_columns = [
    "flightNumber",
    "airline",
    "origin",
    "destination",
    "validFromDate",
    "scheduledDepartureTime",
    "dayOfWeek",
    "validToDate",
    "validFromYear",
    "validToYear",
    "validFromMonth",
    "validToMonth",
]

# Show the de-duplicated projection of the preview columns.
splitData.select(*preview_columns).distinct().show()


+------------+-------------+---------+-----------+-------------+----------------------+--------------------+-----------+-------------+-----------+--------------+------------+
|flightNumber|      airline|   origin|destination|validFromDate|scheduledDepartureTime|           dayOfWeek|validToDate|validFromYear|validToYear|validFromMonth|validToMonth|
+------------+-------------+---------+-----------+-------------+----------------------+--------------------+-----------+-------------+-----------+--------------+------------+
|         120|        GoAir|Bengaluru|      Delhi|            2|   2025-04-23 11:35:00|              Monday|         22|         2019|       2019|             4|          10|
|         633|        GoAir|Ahmedabad|       Pune|           28|   2025-04-23 21:10:00|Sunday,Monday,Tue...|         30|         2018|       2019|            10|           3|
|         514|    Air India|     Pune|  Hyderabad|           28|   2025-04-23 15:05:00|Sunday,Monday,Tue...|         29|     

In [7]:
from pyspark.sql.functions import struct

# Register the date-split frame as the temp view "flights" so it can be
# queried with spark.sql(...).
splitData.createOrReplaceTempView("flights")

# Pack the route details into a single struct column named routeInformation.
# The DataFrame is assigned first and displayed separately: DataFrame.show()
# returns None, so chaining .show() into the assignment would leave dfStruct
# bound to None instead of the DataFrame.
dfStruct = df.select(
    "flightNumber",
    "airline",
    struct("origin", "destination", "scheduledDepartureTime").alias("routeInformation"),
    "dayOfWeek",
)
dfStruct.show(5, truncate=False)


+------------+-------+---------------------------------------+--------------------------------------------------------+
|flightNumber|airline|routeInformation                       |dayOfWeek                                               |
+------------+-------+---------------------------------------+--------------------------------------------------------+
|425         |GoAir  |{Delhi, Hyderabad, 2025-04-23 05:45:00}|Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday|
|423         |GoAir  |{Delhi, Hyderabad, 2025-04-23 07:30:00}|Saturday                                                |
|423         |GoAir  |{Delhi, Hyderabad, 2025-04-23 07:30:00}|Friday                                                  |
|423         |GoAir  |{Delhi, Hyderabad, 2025-04-23 07:30:00}|Friday                                                  |
|423         |GoAir  |{Delhi, Hyderabad, 2025-04-23 07:30:00}|Sunday,Monday,Tuesday,Wednesday,Thursday,Saturday       |
+------------+-------+------------------

In [8]:
from pyspark.sql.functions import split, array_contains

# Turn the comma-separated dayOfWeek string into an array column (days_array)
# so individual days can be tested for membership.
df_with_array = df.select(
    "flightNumber",
    "airline",
    "origin",
    "destination",
    split("dayOfWeek", ",").alias("days_array"),
)

# Keep only the schedules whose days_array contains "Sunday".
# The filtered DataFrame is assigned before it is displayed: DataFrame.show()
# returns None, so chaining .show() into the assignment would leave
# df_sunday_flights bound to None instead of the filtered DataFrame.
df_sunday_flights = df_with_array.filter(array_contains("days_array", "Sunday"))
df_sunday_flights.show(5, truncate=False)


+------------+-------+------+-----------+----------------------------------------------------------------+
|flightNumber|airline|origin|destination|days_array                                                      |
+------------+-------+------+-----------+----------------------------------------------------------------+
|425         |GoAir  |Delhi |Hyderabad  |[Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday]|
|423         |GoAir  |Delhi |Hyderabad  |[Sunday, Monday, Tuesday, Wednesday, Thursday, Saturday]        |
|423         |GoAir  |Delhi |Hyderabad  |[Sunday, Monday, Tuesday, Wednesday, Thursday, Saturday]        |
|423         |GoAir  |Delhi |Hyderabad  |[Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday]|
|422         |GoAir  |Delhi |Hyderabad  |[Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday]|
+------------+-------+------+-----------+----------------------------------------------------------------+
only showing top 5 rows

