In [2]:
!pip install requests pandas scikit-learn matplotlib seaborn




In [3]:
import pandas as pd
import numpy as np
import requests
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Load the dataset from the environment
file_name = "/content/Data 1 - New York 2023.csv"  # Modify this with the correct file path if needed
data = pd.read_csv(file_name)

# Preview the data
print("Dataset Preview:")
print(data.head())


Dataset Preview:
   YEAR OP_UNIQUE_CARRIER TAIL_NUM  OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID  \
0  2023                9E   N131EV               4642              14492   
1  2023                9E   N131EV               4647              11057   
2  2023                9E   N131EV               4658              12953   
3  2023                9E   N131EV               4660              12478   
4  2023                9E   N131EV               4670              12953   

     ORIGIN_CITY_NAME  DEST_AIRPORT_ID   DEST_CITY_NAME  CRS_DEP_TIME  \
0  Raleigh/Durham, NC            12478     New York, NY          2020   
1       Charlotte, NC            12953     New York, NY          1356   
2        New York, NY            11193   Cincinnati, OH           835   
3        New York, NY            13487  Minneapolis, MN           800   
4        New York, NY            13342    Milwaukee, WI          1900   

   DEP_TIME  DEP_DELAY  DEP_DELAY_NEW  AIR_TIME  FLIGHTS  DISTANCE  \
0    2032.0      

In [5]:
# Filter relevant columns and drop rows with missing values
data = data[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'DEP_DELAY', 'CRS_DEP_TIME', 'DISTANCE', 'AIR_TIME']].dropna()

# Extract unique cities for weather data
unique_cities = data['ORIGIN_CITY_NAME'].unique()[:5]  # Use the first 5 cities for simplicity
print("Selected Cities for Weather Data:", unique_cities)


Selected Cities for Weather Data: ['Raleigh/Durham, NC' 'Charlotte, NC' 'New York, NY' 'Pittsburgh, PA'
 'State College, PA']


5. Fetch Weather Data Using OpenWeatherMap API

In [6]:
# API key and URL for OpenWeatherMap
API_KEY = "YOUR_API_KEY"
BASE_URL = "http://api.openweathermap.org/data/2.5/weather"

# Function to fetch weather data for a city
def get_weather(city):
    params = {
        "q": city,
        "appid": API_KEY,
        "units": "metric"
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        return {
            "City": city,
            "Temperature": data["main"]["temp"],
            "Humidity": data["main"]["humidity"],
            "Weather": data["weather"][0]["description"]
        }
    else:
        return {"City": city, "Temperature": None, "Humidity": None, "Weather": None}

# Fetch weather data for all selected cities
weather_data = [get_weather(city) for city in unique_cities]
weather_df = pd.DataFrame(weather_data)
print("\nWeather Data:")
print(weather_df)



Weather Data:
                 City Temperature Humidity Weather
0  Raleigh/Durham, NC        None     None    None
1       Charlotte, NC        None     None    None
2        New York, NY        None     None    None
3      Pittsburgh, PA        None     None    None
4   State College, PA        None     None    None


6. Merge Weather Data with the Flight Dataset

In [7]:
# Merge weather data with the main dataset
merged_data = data.merge(weather_df, left_on="ORIGIN_CITY_NAME", right_on="City", how="left")

# Drop unnecessary columns
merged_data = merged_data.drop(columns=['City'])
print("\nMerged Dataset Preview:")
print(merged_data.head())



Merged Dataset Preview:
     ORIGIN_CITY_NAME   DEST_CITY_NAME  DEP_DELAY  CRS_DEP_TIME  DISTANCE  \
0  Raleigh/Durham, NC     New York, NY       12.0          2020       427   
1       Charlotte, NC     New York, NY       -2.0          1356       544   
2        New York, NY   Cincinnati, OH       63.0           835       585   
3        New York, NY  Minneapolis, MN       -1.0           800      1029   
4        New York, NY    Milwaukee, WI        0.0          1900       738   

   AIR_TIME Temperature Humidity Weather  
0      62.0        None     None    None  
1     104.0        None     None    None  
2      91.0        None     None    None  
3     152.0        None     None    None  
4     132.0        None     None    None  


7. Visualize Data

In [None]:
# Correlation Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(merged_data.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Distribution of Departure Delays
plt.figure(figsize=(8, 6))
sns.histplot(merged_data['DEP_DELAY'], kde=True, bins=30, color='blue')
plt.title("Distribution of Departure Delays")
plt.xlabel("Delay (minutes)")
plt.ylabel("Frequency")
plt.show()


8. Train a Random Forest Model

In [9]:
# Select features and target variable
features = ['CRS_DEP_TIME', 'DISTANCE', 'AIR_TIME', 'Temperature', 'Humidity']
target = 'DEP_DELAY'

# Drop rows with missing weather data
final_data = merged_data.dropna(subset=features)

# Split the data
X = final_data[features]
y = final_data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

9. Visualize Predictions

In [10]:
# Plot Actual vs Predicted Values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='purple')
plt.title("Actual vs Predicted Departure Delays")
plt.xlabel("Actual Delay (minutes)")
plt.ylabel("Predicted Delay (minutes)")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.show()


NameError: name 'y_test' is not defined

<Figure size 800x600 with 0 Axes>