### 🧭 Week 1 – Project Initialization and Dataset Setup

**Goals**
- Build a unified understanding of flight delay datasets (2008 → 2024)
- Prepare optimized, memory-efficient DataFrames for analysis
- Quantify nulls, types, and structure consistency before cleaning

**KPIs**
- % of missing values identified per dataset
- Memory usage before / after optimization
- Rows retained after sampling
- Schema alignment across datasets

**Workflow**
1. Load all 3 CSVs into pandas
2. Explore schema, datatypes, nulls, and size
3. Perform sampling for quick exploration
4. Apply dtype optimization to reduce memory footprint


In [10]:
import pandas as pd
import numpy as np

# Load datasets
delayed_2008 = pd.read_csv("dataset/DelayedFlights.csv")
delays_2019_23 = pd.read_csv("dataset/flight_delays.csv")
sample_2024 = pd.read_csv("dataset/flights_sample_3m.csv")

# Quick previews
print("=== DelayedFlights (2008) ===")
display(delayed_2008.head(3))

print("\n=== flight_delays (2019–2023) ===")
display(delays_2019_23.head(3))

print("\n=== flight_sample_3m (2024) ===")
display(sample_2024.head(3))


=== DelayedFlights (2008) ===


Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,



=== flight_delays (2019–2023) ===


Unnamed: 0,FlightID,Airline,FlightNumber,Origin,Destination,ScheduledDeparture,ActualDeparture,ScheduledArrival,ActualArrival,DelayMinutes,DelayReason,Cancelled,Diverted,AircraftType,TailNumber,Distance
0,1,United,4558,ORD,MIA,2024-09-01 08:11,2024-09-01 08:30,2024-09-01 12:11,2024-09-01 12:19,8,Weather,True,False,Boeing 737,N71066,1031
1,2,Delta,8021,LAX,MIA,2024-09-01 10:25,2024-09-01 10:41,2024-09-01 13:25,2024-09-01 13:27,2,Air Traffic Control,True,True,Airbus A320,N22657,1006
2,3,Southwest,7520,DFW,SFO,2024-09-01 16:53,2024-09-01 17:05,2024-09-01 17:53,2024-09-01 18:07,14,Weather,True,True,Boeing 737,N95611,2980



=== flight_sample_3m (2024) ===


Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-01-09,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,1562,FLL,"Fort Lauderdale, FL",EWR,"Newark, NJ",...,0.0,186.0,176.0,153.0,1065.0,,,,,
1,2022-11-19,Delta Air Lines Inc.,Delta Air Lines Inc.: DL,DL,19790,1149,MSP,"Minneapolis, MN",SEA,"Seattle, WA",...,0.0,235.0,236.0,189.0,1399.0,,,,,
2,2022-07-22,United Air Lines Inc.,United Air Lines Inc.: UA,UA,19977,459,DEN,"Denver, CO",MSP,"Minneapolis, MN",...,0.0,118.0,112.0,87.0,680.0,,,,,


In [6]:
# Utility function for quick dataset summary
def dataset_summary(df, name):
    print(f"\n📊 Summary for {name}")
    print("-" * 50)
    print(f"Shape: {df.shape}")
    print("Column Types:")
    print(df.dtypes.value_counts())
    print("\nMissing Values (Top 10):")
    print(df.isnull().sum().sort_values(ascending=False).head(10))
    mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory Usage: {mem:.2f} MB")



In [11]:
dataset_summary(delayed_2008, "DelayedFlights (2008)")


📊 Summary for DelayedFlights (2008)
--------------------------------------------------
Shape: (1936758, 30)
Column Types:
float64    14
int64      11
object      5
Name: count, dtype: int64

Missing Values (Top 10):
NASDelay             689270
CarrierDelay         689270
LateAircraftDelay    689270
SecurityDelay        689270
WeatherDelay         689270
ActualElapsedTime      8387
AirTime                8387
ArrDelay               8387
TaxiIn                 7110
ArrTime                7110
dtype: int64

Memory Usage: 923.51 MB


In [12]:
dataset_summary(delays_2019_23, "flight_delays (2019–23)")


📊 Summary for flight_delays (2019–23)
--------------------------------------------------
Shape: (1747627, 16)
Column Types:
object    10
int64      4
bool       2
Name: count, dtype: int64

Missing Values (Top 10):
DelayReason           468873
FlightID                   0
FlightNumber               0
Airline                    0
Destination                0
ScheduledDeparture         0
ActualDeparture            0
Origin                     0
ScheduledArrival           0
ActualArrival              0
dtype: int64

Memory Usage: 1169.84 MB


In [13]:
dataset_summary(sample_2024, "flight_sample_3m (2024)")


📊 Summary for flight_sample_3m (2024)
--------------------------------------------------
Shape: (3000000, 32)
Column Types:
float64    19
object      9
int64       4
Name: count, dtype: int64

Missing Values (Top 10):
CANCELLATION_CODE          2920860
DELAY_DUE_LATE_AIRCRAFT    2466137
DELAY_DUE_CARRIER          2466137
DELAY_DUE_SECURITY         2466137
DELAY_DUE_NAS              2466137
DELAY_DUE_WEATHER          2466137
ARR_DELAY                    86198
ELAPSED_TIME                 86198
AIR_TIME                     86198
WHEELS_ON                    79944
dtype: int64

Memory Usage: 2174.67 MB


In [14]:
# 1️⃣ Sampling (10 000 rows or less if smaller)
sample_2008 = delayed_2008.sample(n=min(10000, len(delayed_2008)), random_state=42)
sample_2019_23 = delays_2019_23.sample(n=min(10000, len(delays_2019_23)), random_state=42)
sample_2024_small = sample_2024.sample(n=min(1000, len(sample_2024)), random_state=42)

print(sample_2008.shape, sample_2019_23.shape, sample_2024_small.shape)

# 2️⃣ Memory optimization helper
def optimize_dtypes(df):
    for col in df.select_dtypes(include=["int64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    for col in df.select_dtypes(include=["float64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    return df

# Apply optimization
delayed_2008_opt = optimize_dtypes(delayed_2008)
delays_2019_23_opt = optimize_dtypes(delays_2019_23)
sample_2024_opt = optimize_dtypes(sample_2024)

# Compare memory before vs after
def memory_report(before_df, after_df, name):
    before = before_df.memory_usage(deep=True).sum() / 1024**2
    after = after_df.memory_usage(deep=True).sum() / 1024**2
    print(f"{name}: {before:.2f} MB → {after:.2f} MB ({(before-after)/before*100:.1f}% reduction)")


(10000, 30) (10000, 16) (1000, 32)


In [15]:
memory_report(delayed_2008, delayed_2008_opt, "DelayedFlights (2008)")
memory_report(delays_2019_23, delays_2019_23_opt, "flight_delays (2019–23)")
memory_report(sample_2024, sample_2024_opt, "flight_sample_3m (2024)")


DelayedFlights (2008): 692.63 MB → 692.63 MB (0.0% reduction)
flight_delays (2019–23): 1131.51 MB → 1131.51 MB (0.0% reduction)
flight_sample_3m (2024): 1888.57 MB → 1888.57 MB (0.0% reduction)
