In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv('DelayedFlights.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1936758 entries, 0 to 1936757
Data columns (total 30 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   Year               int64  
 2   Month              int64  
 3   DayofMonth         int64  
 4   DayOfWeek          int64  
 5   DepTime            float64
 6   CRSDepTime         int64  
 7   ArrTime            float64
 8   CRSArrTime         int64  
 9   UniqueCarrier      object 
 10  FlightNum          int64  
 11  TailNum            object 
 12  ActualElapsedTime  float64
 13  CRSElapsedTime     float64
 14  AirTime            float64
 15  ArrDelay           float64
 16  DepDelay           float64
 17  Origin             object 
 18  Dest               object 
 19  Distance           int64  
 20  TaxiIn             float64
 21  TaxiOut            float64
 22  Cancelled          int64  
 23  CancellationCode   object 
 24  Diverted           int64  
 25  CarrierDelay      

In [5]:
df_cleaned = df.dropna(how='any')
print(df_cleaned)

         Unnamed: 0  Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  \
3                 4  2008      1           3          4   1829.0        1755   
5                 6  2008      1           3          4   1937.0        1830   
7                11  2008      1           3          4   1644.0        1510   
9                16  2008      1           3          4   1452.0        1425   
11               18  2008      1           3          4   1323.0        1255   
...             ...   ...    ...         ...        ...      ...         ...   
1936751     7009705  2008     12          13          6    921.0         830   
1936752     7009709  2008     12          13          6   1552.0        1520   
1936753     7009710  2008     12          13          6   1250.0        1220   
1936754     7009717  2008     12          13          6    657.0         600   
1936755     7009718  2008     12          13          6   1007.0         847   

         ArrTime  CRSArrTime UniqueCarr

In [6]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,4.0,10.0,0,N,0,,,,,
5,6,2008,1,3,4,1937.0,1830,2037.0,1940,WN,...,3.0,7.0,0,N,0,10.0,0.0,0.0,0.0,47.0
6,10,2008,1,3,4,706.0,700,916.0,915,WN,...,5.0,19.0,0,N,0,,,,,
7,11,2008,1,3,4,1644.0,1510,1845.0,1725,WN,...,6.0,8.0,0,N,0,8.0,0.0,0.0,0.0,72.0
8,15,2008,1,3,4,1029.0,1020,1021.0,1010,WN,...,6.0,9.0,0,N,0,,,,,
9,16,2008,1,3,4,1452.0,1425,1640.0,1625,WN,...,7.0,8.0,0,N,0,3.0,0.0,0.0,0.0,12.0


In [7]:
print("Shape of original dataset:", df.shape)
print("Shape of cleaned dataset:", df_cleaned.shape)

Shape of original dataset: (1936758, 30)
Shape of cleaned dataset: (1247486, 30)


In [8]:
df_cleaned.to_csv("cleaned_dataset.csv", index=False)

In [9]:
categorical_columns = ['UniqueCarrier','TailNum','Origin','Dest']


In [10]:
label_mapping = {}

In [11]:
for column in categorical_columns:
    # Create a mapping of unique values to integers
    label_mapping[column] = {label: idx for idx, label in enumerate(df[column].unique())}
    
    # Apply the mapping to the column
    df[column] = df[column].map(label_mapping[column])



In [12]:
# Print the updated dataset
print("\nUpdated Dataset:")
print(df.head())


Updated Dataset:
   Unnamed: 0  Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  \
0           0  2008      1           3          4   2003.0        1955   
1           1  2008      1           3          4    754.0         735   
2           2  2008      1           3          4    628.0         620   
3           4  2008      1           3          4   1829.0        1755   
4           5  2008      1           3          4   1940.0        1915   

   ArrTime  CRSArrTime  UniqueCarrier  ...  TaxiIn  TaxiOut  Cancelled  \
0   2211.0        2225              0  ...     4.0      8.0          0   
1   1002.0        1000              0  ...     5.0     10.0          0   
2    804.0         750              0  ...     3.0     17.0          0   
3   1959.0        1925              0  ...     3.0     10.0          0   
4   2121.0        2110              0  ...     4.0     10.0          0   

   CancellationCode  Diverted  CarrierDelay  WeatherDelay  NASDelay  \
0                 N  

In [13]:
selected_features = ['DepTime','ArrTime','UniqueCarrier','FlightNum','TailNum','Distance','DayOfWeek','Month','Origin','Dest','TaxiIn','TaxiOut','WeatherDelay','CarrierDelay','NASDelay','LateAircraftDelay','Cancelled']
df_selected = df[selected_features]

In [14]:
#Creating new features
df['TimeOfDay'] = pd.cut(df['DepTime'], bins=[0, 600, 1200, 1800, 2400], labels=['Night', 'Morning', 'Afternoon', 'Evening'], include_lowest=True)


In [15]:
import numpy as np

# Assuming your bool array is named 'bool_array'
numeric_array = TimeOfDay.astype(np.uint8)


NameError: name 'TimeOfDay' is not defined

In [None]:
df['Season'] = df['Month'].map({1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Fall', 10: 'Fall', 11: 'Fall', 12: 'Winter'})

In [None]:
df['IsWeekend'] = (df['DayOfWeek'].isin(['Saturday', 'Sunday'])).astype(int)

In [None]:
df['IsMorningFlight'] = (df['DepTime'] < 1200).astype(int)

In [None]:
df['DepTimeDifference'] = df['DepTime'] - df['CRSDepTime']


In [None]:
# Calculate the delay as the difference between actual departure and scheduled departure
df['DepartureDelay'] = df['DepTime'] - df['CRSDepTime']

# Print the updated dataset
print(df[['DepTime', 'CRSDepTime', 'DepartureDelay']])

In [None]:
# Convert categorical features to one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Updated list of selected features, excluding original categorical columns
selected_features = ['DepTime', 'ArrTime', 'FlightNum', 'Distance', 'DayOfWeek', 'Month',
                      'TaxiIn', 'TaxiOut', 'WeatherDelay', 'CarrierDelay', 'NASDelay',
                      'LateAircraftDelay', 'Cancelled', 'TimeOfDay', 'Season', 'IsWeekend',
                      'IsMorningFlight', 'DepTimeDifference']

# Separate features and target variable
X = df[selected_features].values  # Features
y = df['DepartureDelay'].values  # Target variable

# Separate numeric and categorical features
numeric_features = df.select_dtypes(include=['number']).columns
categorical_features = df.select_dtypes(include=['object']).columns


In [None]:
# Impute numeric features with mean
numeric_imputer = SimpleImputer(strategy='mean')
df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])

# Impute 'DepartureDelay' with mean
departure_delay_imputer = SimpleImputer(strategy='mean')
df['DepartureDelay'] = departure_delay_imputer.fit_transform(df[['DepartureDelay']])

# Impute categorical features with a constant value ('missing')
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
df[categorical_features] = constant_imputer.fit_transform(df[categorical_features])

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize models
linear_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)
gradient_boosting_model = GradientBoostingRegressor(random_state=42)


In [None]:
# Train the models
random_forest_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)