In [0]:
# Root DBFS location under which all data for this project is stored.
# Access to this location is governed by Databricks workspace permissions.
# NOTE: the value ends with '/', and the cells in this notebook append '/raw...'
# to it, so the composed paths contain '//' (the listings still resolve).
base_path = "dbfs:/FileStore/tables/FileStores/flight_project/"

In [0]:
# Show the sub-folders that have been created under the raw folder.
dbutils.fs.ls(f"{base_path}/raw")

[FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/Streaming/', name='Streaming/', size=0, modificationTime=1767197295000),
 FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/batch/', name='batch/', size=0, modificationTime=1767074306000)]

In [0]:
# Show the files available in the raw batch folder.
dbutils.fs.ls(f"{base_path}/raw/batch")

[FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/batch/airlines.csv', name='airlines.csv', size=359, modificationTime=1767074414000),
 FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/batch/airports.csv', name='airports.csv', size=23867, modificationTime=1767074306000),
 FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/batch/flights_sample.csv', name='flights_sample.csv', size=20142, modificationTime=1767074415000)]

In [0]:
# Show the files available in the flights sub-folder of the raw Streaming folder.
dbutils.fs.ls(f"{base_path}/raw/Streaming/flights")

[FileInfo(path='dbfs:/FileStore/tables/FileStores/flight_project/raw/Streaming/flights/flights_latest_.csv', name='flights_latest_.csv', size=1239845, modificationTime=1767197295000)]

In [0]:
# Load the airlines CSV from the raw batch folder into a DataFrame, taking the
# column names from the header row. No schema is supplied or inferred, so the
# columns are read as strings. Then preview ten rows and print the schema.
airlines_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(f"{base_path}/raw/batch/airlines.csv")
)
airlines_df.show(n=10)
airlines_df.printSchema()

+---------+--------------------+
|IATA_CODE|             AIRLINE|
+---------+--------------------+
|       UA|United Air Lines ...|
|       AA|American Airlines...|
|       US|     US Airways Inc.|
|       F9|Frontier Airlines...|
|       B6|     JetBlue Airways|
|       OO|Skywest Airlines ...|
|       AS|Alaska Airlines Inc.|
|       NK|    Spirit Air Lines|
|       WN|Southwest Airline...|
|       DL|Delta Air Lines Inc.|
+---------+--------------------+
only showing top 10 rows
root
 |-- IATA_CODE: string (nullable = true)
 |-- AIRLINE: string (nullable = true)



In [0]:
# Load the airports CSV from the raw batch folder into a DataFrame, taking the
# column names from the header row. No schema is supplied or inferred here.
# Then preview ten rows and print the schema.
airports_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(f"{base_path}/raw/batch/airports.csv")
)
airports_df.show(n=10)
airports_df.printSchema()

+---------+--------------------+-------------+-----+-------+--------+----------+
|IATA_CODE|             AIRPORT|         CITY|STATE|COUNTRY|LATITUDE| LONGITUDE|
+---------+--------------------+-------------+-----+-------+--------+----------+
|      ABE|Lehigh Valley Int...|    Allentown|   PA|    USA|40.65236| -75.44040|
|      ABI|Abilene Regional ...|      Abilene|   TX|    USA|32.41132| -99.68190|
|      ABQ|Albuquerque Inter...|  Albuquerque|   NM|    USA|35.04022|-106.60919|
|      ABR|Aberdeen Regional...|     Aberdeen|   SD|    USA|45.44906| -98.42183|
|      ABY|Southwest Georgia...|       Albany|   GA|    USA|31.53552| -84.19447|
|      ACK|Nantucket Memoria...|    Nantucket|   MA|    USA|41.25305| -70.06018|
|      ACT|Waco Regional Air...|         Waco|   TX|    USA|31.61129| -97.23052|
|      ACV|      Arcata Airport|Arcata/Eureka|   CA|    USA|40.97812|-124.10862|
|      ACY|Atlantic City Int...|Atlantic City|   NJ|    USA|39.45758| -74.57717|
|      ADK|        Adak Airp

In [0]:
# Load the historical flights sample CSV from the raw batch folder into a
# DataFrame, taking the column names from the header row. No schema is supplied
# or inferred, so every column is read as a string.
flight_historical_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(f"{base_path}/raw/batch/flights_sample.csv")
)

# Render the first ten rows, then print the schema.
display(flight_historical_df.limit(10))
flight_historical_df.printSchema()

YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354,-11,21,15,205,194,169,1448,404,4,430,408,-22,0,0,,,,,,
2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2,-8,12,14,280,279,263,2330,737,4,750,741,-9,0,0,,,,,,
2015,1,1,4,US,840,N171US,SFO,CLT,20,18,-2,16,34,286,293,266,2296,800,11,806,811,5,0,0,,,,,,
2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15,-5,15,30,285,281,258,2342,748,8,805,756,-9,0,0,,,,,,
2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24,-1,11,35,235,215,199,1448,254,5,320,259,-21,0,0,,,,,,
2015,1,1,4,DL,806,N3730B,SFO,MSP,25,20,-5,18,38,217,230,206,1589,604,6,602,610,8,0,0,,,,,,
2015,1,1,4,NK,612,N635NK,LAS,MSP,25,19,-6,11,30,181,170,154,1299,504,5,526,509,-17,0,0,,,,,,
2015,1,1,4,US,2013,N584UW,LAX,CLT,30,44,14,13,57,273,249,228,2125,745,8,803,753,-10,0,0,,,,,,
2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,30,19,-11,17,36,195,193,173,1464,529,3,545,532,-13,0,0,,,,,,
2015,1,1,4,DL,1173,N826DN,LAS,ATL,30,33,3,12,45,221,203,186,1747,651,5,711,656,-15,0,0,,,,,,


root
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- DAY: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: string (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: string (nullable = true)
 |-- DEPARTURE_TIME: string (nullable = true)
 |-- DEPARTURE_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- SCHEDULED_TIME: string (nullable = true)
 |-- ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- SCHEDULED_ARRIVAL: string (nullable = true)
 |-- ARRIVAL_TIME: string (nullable = true)
 |-- ARRIVAL_DELAY: string (nullable = true)
 |-- D

In [0]:
# Load the latest flights data from the raw Streaming/flights folder into a
# DataFrame. The path is the folder itself rather than a single file, and the
# column names come from the header row. No schema is supplied or inferred,
# so every column is read as a string.
flights_latest = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(f"{base_path}/raw/Streaming/flights")
)

# Render the first ten rows, then print the schema.
display(flights_latest.limit(10))
flights_latest.printSchema()

YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2018,4,8,6,DL,65,N340NW,ATL,MIA,1619,1619.0,0.0,13.0,1632.0,113,122.0,82.0,594,1754.0,27.0,1812,1821.0,9.0,0,0,,,,,,
2019,3,5,6,OO,5180,N779SK,HDN,LAX,1623,1614.0,-9.0,21.0,1635.0,137,140.0,111.0,763,1726.0,8.0,1740,1734.0,-6.0,0,0,,,,,,
2020,3,12,6,DL,2004,N3764D,LAX,SLC,1623,1616.0,-7.0,25.0,1641.0,113,119.0,88.0,590,1909.0,6.0,1916,1915.0,-1.0,0,0,,,,,,
2022,6,5,6,WN,2984,N463WN,SNA,LAS,1620,1622.0,2.0,7.0,1629.0,65,53.0,39.0,226,1708.0,7.0,1725,1715.0,-10.0,0,0,,,,,,
2022,8,7,6,AS,858,N530AS,OGG,SAN,1625,1620.0,-5.0,8.0,1628.0,320,293.0,281.0,2541,2309.0,4.0,2345,2313.0,-32.0,0,0,,,,,,
2021,1,18,6,WN,4398,N940WN,MSY,BWI,1620,1616.0,-4.0,7.0,1623.0,140,134.0,119.0,998,1922.0,8.0,1940,1930.0,-10.0,0,0,,,,,,
2020,3,28,6,WN,1080,N8319F,BNA,MCO,1625,1644.0,19.0,9.0,1653.0,105,104.0,88.0,616,1921.0,7.0,1910,1928.0,18.0,0,0,,0.0,0.0,0.0,18.0,0.0
2021,2,2,6,DL,1277,N942DN,ATL,MSY,1620,1619.0,-1.0,9.0,1628.0,96,88.0,74.0,425,1642.0,5.0,1656,1647.0,-9.0,0,0,,,,,,
2020,2,11,6,US,1802,N753US,CLT,MSY,1625,1620.0,-5.0,17.0,1637.0,127,123.0,99.0,651,1716.0,7.0,1732,1723.0,-9.0,0,0,,,,,,
2018,7,10,6,US,1791,N173US,CLT,TPA,1625,1616.0,-9.0,21.0,1637.0,100,100.0,76.0,507,1753.0,3.0,1805,1756.0,-9.0,0,0,,,,,,


root
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- DAY: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: string (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: string (nullable = true)
 |-- DEPARTURE_TIME: string (nullable = true)
 |-- DEPARTURE_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- SCHEDULED_TIME: string (nullable = true)
 |-- ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- SCHEDULED_ARRIVAL: string (nullable = true)
 |-- ARRIVAL_TIME: string (nullable = true)
 |-- ARRIVAL_DELAY: string (nullable = true)
 |-- D

"The raw zone stores source data as-is, without schema enforcement, to support lineage and reprocessing."