In [None]:
import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    concat,
    date_add,
    date_format,
    lit,
    to_timestamp,
    udf,
    when,
)
from pyspark.sql.types import StringType, TimestampType

# Obtain the Spark session for this job (created on first use).
spark = (
    SparkSession.builder
    .appName("FlightDataCleaning")
    .getOrCreate()
)

# Load the input CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('cleaned__df.csv')
)

# UDF to format time
def format_heure(chaine):
    """Convert an HHMM clock value into an ``"HH:MM:00"`` string.

    Args:
        chaine: Clock value encoded as HHMM (e.g. ``1430``), given as an
            int, a float, or a numeric string. ``None``, NaN and blank
            strings are treated as missing.

    Returns:
        The zero-padded ``"HH:MM:00"`` string, or ``None`` when the input
        is missing. ``2400`` is rendered as ``"00:00:00"``.

    Raises:
        ValueError: If ``chaine`` is a string that is not numeric.
    """
    import math

    if chaine is None:
        return None
    if isinstance(chaine, str):
        chaine = chaine.strip()
        if not chaine:
            return None
    # Go through float so that float-like inputs ("1430.0", 1430.0) parse and
    # NaN can be detected instead of making int() raise inside the UDF.
    valeur = float(chaine)
    if math.isnan(valeur):
        return None
    hhmm = int(valeur)
    if hhmm == 2400:
        hhmm = 0
    heures, minutes = divmod(hhmm, 100)
    return "{0:02d}:{1:02d}:00".format(heures, minutes)

# Wrap format_heure as a Spark UDF that returns "HH:MM:00" strings.
format_heure_udf = udf(format_heure, StringType())

# Rewrite the HHMM clock columns in place as "HH:MM:00" strings.
time_columns = ["SCHEDULED_ARRIVAL", "DEPARTURE_TIME", "ARRIVAL_TIME"]
for col_name in time_columns:
    df = df.withColumn(col_name, format_heure_udf(col(col_name)))

# Combine DATE and the SCHEDULED_DEPARTURE clock value into one timestamp.
# Only a raw value of 2400 (formatted as "00:00:00") rolls DATE forward by a
# day; a genuine 0000 departure stays on its own date. The condition is taken
# from the raw column because the formatted text cannot tell the two apart.
scheduled_time = format_heure_udf(col("SCHEDULED_DEPARTURE"))
rolls_to_next_day = col("SCHEDULED_DEPARTURE").cast("int") == 2400
departure_day = (
    when(rolls_to_next_day, date_add(col("DATE"), 1).cast("string"))
    .otherwise(col("DATE").cast("string"))
)
# Strings must be joined with concat(): Python "+" on Columns is numeric
# addition in Spark and would yield null here.
df = df.withColumn(
    "SCHEDULED_DEPARTURE",
    to_timestamp(concat(departure_day, lit(" "), scheduled_time)),
)

# Render SCHEDULED_DEPARTURE back to text in a fixed layout.
df = df.withColumn(
    "SCHEDULED_DEPARTURE",
    date_format(col("SCHEDULED_DEPARTURE"), "yyyy-MM-dd HH:mm:ss"),
)

# Preview the first rows with every column, without truncating cell values.
df.select("*").show(5, truncate=False)

# Optionally, write the entire dataset to a CSV file.
# coalesce(1) reduces the DataFrame to a single partition before writing.
df.coalesce(1).write.mode("overwrite").option("header", "true").csv('cleaned_full_flight_data.csv')

# Stop the Spark session
spark.stop()

+----+-----+---+-----------+----+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+-----------------+------------------+-----------------+-------------------+-----------------+-----------------+-------------------------------------------+-------------+------------+--------------+---------------+----------------+-------------------------------------------+----------------+-----------------+-------------------+--------------------+---------------------+----------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRL|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATI

In [None]:

# Optionally, write the cleaned DataFrame back to a CSV file.
# The cleaned DataFrame in this notebook is bound to `df`; the name
# `cleaned__df` is never assigned, which is what raised the NameError.
# NOTE(review): this must run while the Spark session is still active,
# i.e. before any spark.stop() call has been executed.
df.coalesce(1).write.mode("overwrite").csv('output_file_path.csv', header=True)

# Stop the Spark session
spark.stop()

NameError: name 'cleaned__df' is not defined