In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Step 2: Load the dataset
# Read the flight records CSV into the DataFrame `data`, which the following
# steps operate on.
file_path = '/content/Data 1 - New York 2023.csv'  # Adjust path if necessary
data = pd.read_csv(file_path)

# Step 3: Explore the dataset
print("Dataset Preview:")
print(data.head())  # Display the first few rows
print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
data.info()  # Check data types and missing values
print("\nSummary Statistics:")
print(data.describe())  # Summary statistics for numerical columns

# Step 4: Data Cleaning and Feature Engineering
# Discard rows that have no DEP_DELAY_NEW value (the prediction target), then
# derive two columns from CRS_DEP_TIME: its quotient by 100 (the hour part)
# and its remainder modulo 100 (the minutes part).
data = (
    data
    .dropna(subset=['DEP_DELAY_NEW'])
    .assign(
        CRS_DEP_HOUR=lambda frame: frame['CRS_DEP_TIME'] // 100,
        CRS_DEP_MINUTES=lambda frame: frame['CRS_DEP_TIME'] % 100,
    )
)

# Step 5: Define Features (X) and Target (y)
# y is the delay column; X is the set of columns the model is trained on.
y = data['DEP_DELAY_NEW']
X = data.loc[:, [
    'OP_UNIQUE_CARRIER',
    'ORIGIN_CITY_NAME',
    'DEST_CITY_NAME',
    'CRS_DEP_HOUR',
    'CRS_DEP_MINUTES',
    'DISTANCE',
]]

# Step 6: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Build the preprocessing pipeline
# Column groups: numerical columns get scaled, categorical columns get
# one-hot encoded.
numerical_features = ['CRS_DEP_HOUR', 'CRS_DEP_MINUTES', 'DISTANCE']
categorical_features = ['OP_UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME']

column_transformers = [
    # Scale numerical features
    ('num', StandardScaler(), numerical_features),
    # One-hot encode categorical features; handle_unknown='ignore' lets the
    # encoder accept categories that were not present when it was fitted.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Step 8: Build the pipeline with a Random Forest Regressor
# Preprocessing and the regressor are chained so that fit/predict apply both.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest_regressor),
])

# Step 9: Train the model
# Fits the whole pipeline (preprocessing + regressor) on the training split.
pipeline.fit(X_train, y_train)
print("Model training complete.")

# Step 10: Evaluate the model
# Score predictions for the held-out split with MAE and MSE.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(
    f"\nEvaluation Metrics:\nMean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    sep="\n",
)

# Step 11: Save the trained model
# Serialize the fitted pipeline (preprocessing + regressor) to disk with joblib.
model_output_path = '/content/flight_delay_model.pkl'
joblib.dump(pipeline, model_output_path)
print("Model saved as 'flight_delay_model.pkl'.")

# Step 12: Predict delays for new flight data
# Describe one flight as a single record whose keys are the feature columns
# the pipeline was trained on.
new_flight = pd.DataFrame([
    {
        'OP_UNIQUE_CARRIER': 'AA',            # Airline code (e.g., 'AA' for American Airlines)
        'ORIGIN_CITY_NAME': 'New York, NY',   # Origin city
        'DEST_CITY_NAME': 'Los Angeles, CA',  # Destination city
        'CRS_DEP_HOUR': 10,                   # Scheduled departure hour (e.g., 10 AM)
        'CRS_DEP_MINUTES': 30,                # Scheduled departure minutes (e.g., 10:30 AM)
        'DISTANCE': 2475,                     # Flight distance in miles
    },
])

# Predict the delay; predict() returns one value per input row, so the single
# flight's prediction is element 0.
predicted_delay = pipeline.predict(new_flight)
print(f"Predicted Delay for the flight: {predicted_delay[0]:.2f} minutes")

# Step 13: Predict for multiple flights
# One tuple per flight, with values in the same order as `flight_columns`.
flight_columns = [
    'OP_UNIQUE_CARRIER',  # Airline codes
    'ORIGIN_CITY_NAME',   # Origin cities
    'DEST_CITY_NAME',     # Destinations
    'CRS_DEP_HOUR',       # Scheduled hours
    'CRS_DEP_MINUTES',    # Scheduled minutes
    'DISTANCE',           # Distances in miles
]
flight_rows = [
    ('AA', 'New York, NY', 'Los Angeles, CA', 10, 30, 2475),
    ('UA', 'Chicago, IL', 'Houston, TX', 14, 45, 925),
    ('DL', 'Atlanta, GA', 'San Francisco, CA', 16, 15, 2130),
]
new_flights = pd.DataFrame(flight_rows, columns=flight_columns)

# Predict delays for multiple flights and attach them as a new column.
# The prediction is computed before the column is added, so the extra column
# is never fed to the pipeline.
predicted_delays = pipeline.predict(new_flights)
new_flights['Predicted Delay (minutes)'] = predicted_delays

# Display the results
print("\nPredicted Delays for Multiple Flights:")
print(new_flights)


Dataset Preview:
   YEAR OP_UNIQUE_CARRIER TAIL_NUM  OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID  \
0  2023                9E   N131EV               4642              14492   
1  2023                9E   N131EV               4647              11057   
2  2023                9E   N131EV               4658              12953   
3  2023                9E   N131EV               4660              12478   
4  2023                9E   N131EV               4670              12953   

     ORIGIN_CITY_NAME  DEST_AIRPORT_ID   DEST_CITY_NAME  CRS_DEP_TIME  \
0  Raleigh/Durham, NC            12478     New York, NY          2020   
1       Charlotte, NC            12953     New York, NY          1356   
2        New York, NY            11193   Cincinnati, OH           835   
3        New York, NY            13487  Minneapolis, MN           800   
4        New York, NY            13342    Milwaukee, WI          1900   

   DEP_TIME  DEP_DELAY  DEP_DELAY_NEW  AIR_TIME  FLIGHTS  DISTANCE  \
0    2032.0      