In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Read the delayed-flights CSV into memory
file_path = "DelayedFlights.csv"   # change to your CSV filename
df = pd.read_csv(filepath_or_buffer=file_path)

# First look: (rows, cols) on one line, then the full column index,
# followed by a rich preview of the first five rows
print(df.shape, df.columns, sep="\n")
df.head()


(1936758, 30)
Index(['Unnamed: 0', 'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
       'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum',
       'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,4.0,10.0,0,N,0,,,,,


In [2]:
# Assemble a single datetime column from the separate calendar columns.
# pd.to_datetime builds dates from a mapping keyed by year / month / day,
# so the dataset's own column names are mapped onto those keys here.
df['FlightDate'] = pd.to_datetime({
    'year': df['Year'],
    'month': df['Month'],
    'day': df['DayofMonth'],
})


In [3]:
# Check data types.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line
# after the report.
df.info()

# Missing values per column
print(df.isnull().sum())

# Memory usage in MB (deep=True also counts the contents of object columns)
print("Memory usage (MB):", round(df.memory_usage(deep=True).sum() / 1024**2, 2))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 31 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Unnamed: 0         int64         
 1   Year               int64         
 2   Month              int64         
 3   DayofMonth         int64         
 4   DayOfWeek          int64         
 5   DepTime            float64       
 6   CRSDepTime         int64         
 7   ArrTime            float64       
 8   CRSArrTime         int64         
 9   UniqueCarrier      object        
 10  FlightNum          int64         
 11  TailNum            object        
 12  ActualElapsedTime  float64       
 13  CRSElapsedTime     float64       
 14  AirTime            float64       
 15  ArrDelay           float64       
 16  DepDelay           float64       
 17  Origin             object        
 18  Dest               object        
 19  Distance           int64         
 20  TaxiIn             float

None
Unnamed: 0                0
Year                      0
Month                     0
DayofMonth                0
DayOfWeek                 0
DepTime                   0
CRSDepTime                0
ArrTime                7110
CRSArrTime                0
UniqueCarrier             0
FlightNum                 0
TailNum                   5
ActualElapsedTime      8387
CRSElapsedTime          198
AirTime                8387
ArrDelay               8387
DepDelay                  0
Origin                    0
Dest                      0
Distance                  0
TaxiIn                 7110
TaxiOut                 455
Cancelled                 0
CancellationCode          0
Diverted                  0
CarrierDelay         689270
WeatherDelay         689270
NASDelay             689270
SecurityDelay        689270
LateAircraftDelay    689270
FlightDate                0
dtype: int64
Memory usage (MB): 864.41


In [4]:
import pandas as pd

# Delay columns whose missing values are zero-filled below
delay_cols = [
    'ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay',
    'NASDelay', 'SecurityDelay', 'LateAircraftDelay'
]

# Fill NaN with 0 for delay columns (interpreted as "no delay").
# NOTE(review): ArrDelay is missing on as many rows as AirTime and
# ActualElapsedTime, so those gaps may mean "arrival never recorded" rather
# than "arrived on time" -- confirm before treating ArrDelay == 0 as on-time
# for those rows.
df[delay_cols] = df[delay_cols].fillna(0)

# Handle Cancelled column: fill NaN with 0, ensure integer (0 or 1)
df['Cancelled'] = df['Cancelled'].fillna(0).astype(int)

# Create FlightDate from the component columns (to_datetime needs them named
# year / month / day, hence the rename). Skipped when the column already
# exists so the ~1.9M-row date assembly is not repeated on every run of
# this cell.
if 'FlightDate' not in df.columns:
    df['FlightDate'] = pd.to_datetime(
        df[['Year', 'Month', 'DayofMonth']].rename(
            columns={'Year': 'year', 'Month': 'month', 'DayofMonth': 'day'}
        )
    )

# Create Hour from CRSDepTime (scheduled departure time).
# CRSDepTime holds integers such as 1955 or 735 (HHMM), so integer division
# by 100 gives the hour directly. This yields the same value as zero-padding
# to four characters and taking the first two, without a per-row string
# round trip.
df['Hour'] = (df['CRSDepTime'] // 100).astype(int)

# Create Route feature as "<Origin>-<Dest>"
df['Route'] = df['Origin'].astype(str) + "-" + df['Dest'].astype(str)


In [None]:
# Write the cleaned frame to disk so later sessions can start from it;
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf="AirlineData_Cleaned.csv", index=False)
