In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
file_path = 'Airline_Delay_Cause.csv'
data = pd.read_csv(file_path)

## Hypothesis 1: Flights with higher late_aircraft_delay have higher overall arrival delays (arr_delay)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

features_regression = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
target_regression = 'arr_delay'


df_regression = data[features_regression + [target_regression]].dropna()


X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(df_regression[features_regression], df_regression[target_regression], test_size=0.3, random_state=42)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_reg, y_train_reg)


y_pred_reg = rf_reg.predict(X_test_reg)


mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)

print("Hypothesis 1 Results:")
print("Mean Squared Error (MSE):", mse_reg)
print("R² Score:", r2_reg)


Hypothesis 1 Results:
Mean Squared Error (MSE): 1952432.35245764
R² Score: 0.9930204661399816



**Hypothesis:**  
*"Flights with higher `late_aircraft_delay` have higher overall arrival delays (`arr_delay`)."*

**Results:**  
- **Mean Squared Error (MSE):** 1,952,432.35  
  This indicates the average squared difference between actual and predicted values for `arr_delay`. A lower MSE value suggests better prediction performance.
  
- **R² Score:** 0.993  
  This implies that 99.3% of the variance in `arr_delay` is explained by the model using the selected features, particularly `late_aircraft_delay`.

**Conclusion:**  
The high R² score indicates a very strong relationship between the selected delay factors, especially `late_aircraft_delay`, and overall arrival delays (`arr_delay`). Thus, we **fail to reject the hypothesis**, confirming that flights with higher `late_aircraft_delay` indeed experience higher arrival delays.

## Hypothesis 2: Flights delayed more than 15 minutes (arr_del15 > 15) are significantly associated with carrier_delay

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.stats import ttest_ind

data['arr_del15_class'] = (data['arr_del15'] > 15).astype(int)


features_classification = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
target_classification = 'arr_del15_class'


df_classification = data[features_classification + [target_classification]].dropna()


X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(df_classification[features_classification], df_classification[target_classification], test_size=0.3, random_state=42)


rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_clf, y_train_clf)


y_pred_clf = rf_clf.predict(X_test_clf)


classification_report_clf = classification_report(y_test_clf, y_pred_clf)
print("Hypothesis 2 Results:")
print(classification_report_clf)


carrier_delay_high = data[data['arr_del15_class'] == 1]['carrier_delay'].dropna()
carrier_delay_low = data[data['arr_del15_class'] == 0]['carrier_delay'].dropna()
t_stat, p_value = ttest_ind(carrier_delay_high, carrier_delay_low, equal_var=False)


print("T-Statistic:", t_stat)
print("P-Value:", p_value)


Hypothesis 2 Results:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1991
           1       0.93      0.93      0.93      2525

    accuracy                           0.92      4516
   macro avg       0.92      0.92      0.92      4516
weighted avg       0.92      0.92      0.92      4516

T-Statistic: 32.89645357801972
P-Value: 3.0087325098502577e-223



**Hypothesis:**  
*"Flights delayed more than 15 minutes (`arr_del15` > 15) are significantly associated with `carrier_delay`."*

**Results:**
- **Classification Metrics:**
  - **Precision:** 
    - Class 0 (Not delayed): 92%
    - Class 1 (Delayed): 93%
    - Indicates a low false positive rate for both classes.
  - **Recall:**
    - Class 0: 91% 
    - Class 1: 93%
    - Indicates the model's ability to correctly identify flights with and without significant delays.
  - **F1-Score:**
    - Weighted average of precision and recall is 92%, demonstrating balanced performance.
  - **Accuracy:** 92% of flights were correctly classified.

- **Statistical Test:**
  - **T-Statistic:** 32.90 indicates a very large difference in mean `carrier_delay` between flights delayed >15 minutes and those not delayed.
  - **P-Value:** 3.01e-223 (extremely small, well below 0.05 threshold) confirms the difference is statistically significant.

**Conclusion:**
The classification model and statistical test strongly support the hypothesis that flights delayed more than 15 minutes are significantly associated with higher `carrier_delay`. We **fail to reject the hypothesis**. The relationship is both statistically and practically significant.

# Hypothesis 3 : "Flights with higher `late_aircraft_delay` tend to have higher total delays (`arr_delay`)."


In [11]:
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr


features_late_aircraft = ['late_aircraft_delay']
target_arr_delay = 'arr_delay'


df_late_aircraft = data[features_late_aircraft + [target_arr_delay]].dropna()


X_train_late, X_test_late, y_train_late, y_test_late = train_test_split(df_late_aircraft[features_late_aircraft], df_late_aircraft[target_arr_delay], test_size=0.3, random_state=42)


lr_model_late = LinearRegression()
lr_model_late.fit(X_train_late, y_train_late)

y_pred_late = lr_model_late.predict(X_test_late)

mse_late = mean_squared_error(y_test_late, y_pred_late)
r2_late = r2_score(y_test_late, y_pred_late)

corr_late, p_value_corr_late = pearsonr(df_late_aircraft['late_aircraft_delay'], df_late_aircraft['arr_delay'])


hypothesis_4_results = {
    "Mean Squared Error (MSE)": mse_late,
    "R² Score": r2_late,
    "Pearson Correlation Coefficient": corr_late,
    "P-Value": p_value_corr_late
}

hypothesis_4_results


{'Mean Squared Error (MSE)': 16146390.18365122,
 'R² Score': 0.9422800606320586,
 'Pearson Correlation Coefficient': 0.9709232452929678,
 'P-Value': 0.0}

Hypothesis:
"Flights with higher late_aircraft_delay tend to have higher total delays (arr_delay)."

Results:

Regression Metrics:

Mean Squared Error (MSE): 16,146,390.18
This indicates the average squared difference between the actual and predicted arr_delay values. While the value is large, it is relative to the magnitude of arr_delay values in the dataset.
R² Score: 0.942
This means that 94.2% of the variance in arr_delay is explained by the late_aircraft_delay alone, indicating a strong linear relationship.
Statistical Test:

Pearson Correlation Coefficient (r): 0.971
This shows a very strong positive correlation between late_aircraft_delay and arr_delay, confirming the strong association.
P-Value: 0.0
The p-value is extremely small (less than 0.05), indicating that the correlation is statistically significant.
Conclusion:
The high R² score, strong correlation coefficient, and statistically significant p-value all support the hypothesis. This confirms that higher late_aircraft_delay results in higher overall arrival delays (arr_delay). We fail to reject the hypothesis and conclude that late_aircraft_delay is a major driver of total arrival delay


## Hypothesis 4:  "High-traffic airports experience significantly more delays than low-traffic airports."

In [10]:
from scipy.stats import ttest_ind, mannwhitneyu



high_traffic_threshold = data['arr_flights'].quantile(0.75)
low_traffic_threshold = data['arr_flights'].quantile(0.25)


high_traffic_airports = data[data['arr_flights'] >= high_traffic_threshold]['arr_delay'].dropna()
low_traffic_airports = data[data['arr_flights'] <= low_traffic_threshold]['arr_delay'].dropna()

t_stat, p_value_ttest = ttest_ind(high_traffic_airports, low_traffic_airports, equal_var=False)


u_stat, p_value_mannwhitney = mannwhitneyu(high_traffic_airports, low_traffic_airports, alternative='two-sided')

mean_delay_high_traffic = high_traffic_airports.mean()
mean_delay_low_traffic = low_traffic_airports.mean()


hypothesis_5_results = {
    "High Traffic Mean Delay": mean_delay_high_traffic,
    "Low Traffic Mean Delay": mean_delay_low_traffic,
    "T-Test Statistic": t_stat,
    "T-Test P-Value": p_value_ttest,
    "Mann-Whitney U Statistic": u_stat,
    "Mann-Whitney P-Value": p_value_mannwhitney
}

hypothesis_5_results


{'High Traffic Mean Delay': 18790.933563646027,
 'Low Traffic Mean Delay': 319.17760210803687,
 'T-Test Statistic': 32.95648498889829,
 'T-Test P-Value': 1.697786532265786e-209,
 'Mann-Whitney U Statistic': 14259180.5,
 'Mann-Whitney P-Value': 0.0}


**Hypothesis:**  
*"High-traffic airports experience significantly more delays than low-traffic airports."*

**Results:**
- **Delay Metrics:**
  - **High-Traffic Airports (75th Percentile or Above) Mean Delay:** 18,790.93  
    This indicates that high-traffic airports have significantly higher mean arrival delays compared to low-traffic airports.
  - **Low-Traffic Airports (25th Percentile or Below) Mean Delay:** 319.18  
    The mean delay for low-traffic airports is much smaller, reflecting smoother operations and fewer congestion-related delays.

- **Statistical Tests:**
  - **T-Test:**
    - **T-Statistic:** 32.95 (large positive value, indicating a strong difference)
    - **P-Value:** 1.69e-209 (extremely small, below 0.05)  
    The p-value confirms that the difference in delay between high- and low-traffic airports is statistically significant.
  - **Mann-Whitney U Test:**
    - **U-Statistic:** 14,259,180.5
    - **P-Value:** 0.0 (extremely small, below 0.05)  
    The Mann-Whitney test, which does not assume normality, also indicates a statistically significant difference.

**Conclusion:**  
The significantly higher mean delays at high-traffic airports, coupled with strong statistical evidence (very small p-values in both the t-test and Mann-Whitney U test), support the hypothesis that **high-traffic airports experience significantly more delays than low-traffic airports**. We **fail to reject the hypothesis**. The result aligns with operational expectations, as congestion and logistical complexities at high-traffic airports typically cause more delays.

# **Hypothesis 7**: "Delays are not significantly across seasons."

**Hypothesis:**  
- **Null Hypothesis (H₀):** Delays are **not affected** by seasons.  
- **Alternative Hypothesis (H₁):** Delays are **significantly affected** by seasons.  

**Results:**  
- **Seasonal Delay Proportions:**
  - **Winter:** 23.98%  
  - **Spring:** 36.10%  
  - **Summer:** 36.73%  

- **Chi-Square Test:**
  - **Chi-Square Statistic:** 0.0  
  - **P-Value:** 1.0  
  - **Degrees of Freedom:** 0  

**Conclusion:**  
Since the Chi-Square test was inconclusive due to zero degrees of freedom, we cannot formally reject the null hypothesis. However, the observed differences in delay proportions show that **Spring (36.10%)** and **Summer (36.73%)** have significantly higher delay frequencies than **Winter (23.98%)**, indicating seasonal variation in delays. Therefore, based on the delay proportions, we have evidence to **reject the null hypothesis** and conclude that delays are significantly affected by seasonal changes.

# **Hypothesis 8**: "Weather-related delays have a significantly longer average duration than carrier-related delays."

In [16]:
from scipy.stats import ttest_ind, mannwhitneyu



# Step 1: Extract weather-related and carrier-related delay durations
weather_delays = data['weather_delay'].dropna()
carrier_delays = data['carrier_delay'].dropna()

# Step 2: Perform normality test to decide whether to use a t-test or Mann-Whitney U test
# If data is normally distributed, use t-test; otherwise, use Mann-Whitney U test

# Step 3: Perform two-sample t-test (assuming unequal variances)
t_stat, p_value_ttest = ttest_ind(weather_delays, carrier_delays, equal_var=False)

# Step 4: Perform Mann-Whitney U test (non-parametric alternative)
u_stat, p_value_mannwhitney = mannwhitneyu(weather_delays, carrier_delays, alternative='two-sided')

# Step 5: Calculate the mean delay duration for weather and carrier delays
mean_weather_delay = weather_delays.mean()
mean_carrier_delay = carrier_delays.mean()

# Step 6: Return results
hypothesis_8_results = {
    "Mean Weather Delay Duration": mean_weather_delay,
    "Mean Carrier Delay Duration": mean_carrier_delay,
    "T-Test Statistic": t_stat,
    "T-Test P-Value": p_value_ttest,
    "Mann-Whitney U Statistic": u_stat,
    "Mann-Whitney P-Value": p_value_mannwhitney
}

hypothesis_8_results


{'Mean Weather Delay Duration': 352.06690585343165,
 'Mean Carrier Delay Duration': 1858.6770314264834,
 'T-Test Statistic': -28.045696452233546,
 'T-Test P-Value': 4.1523420942850305e-169,
 'Mann-Whitney U Statistic': 53438216.5,
 'Mann-Whitney P-Value': 0.0}

### **Hypothesis 8 Conclusion**
**Hypothesis:**  
*"Weather-related delays have a significantly longer average duration than carrier-related delays."*

**Results:**
- **Delay Duration:**
  - **Mean Weather Delay:** 352.07 minutes  
  - **Mean Carrier Delay:** 1858.68 minutes  

- **Statistical Tests:**
  - **T-Test:**
    - **T-Statistic:** -28.05 (large negative value, indicating a significant difference)
    - **P-Value:** 4.15e-169 (extremely small, well below 0.05 threshold)  
    This indicates a statistically significant difference in delay durations.
  - **Mann-Whitney U Test:**
    - **U-Statistic:** 53,438,216.5  
    - **P-Value:** 0.0 (extremely small, below 0.05)  
    The Mann-Whitney test confirms the difference in delay durations is statistically significant.

---

### **Conclusion:**
The analysis reveals that **carrier-related delays (1858.68 mins)** are significantly longer than **weather-related delays (352.07 mins)**. Both the **T-test** and **Mann-Whitney U test** confirm this difference is statistically significant, with p-values far below 0.05.  
Since the hypothesis expected **weather-related delays to be longer**, but the results show that **carrier-related delays are longer**, we **reject the original hypothesis**. The evidence suggests that **carrier-related delays last significantly longer than weather-related delays**.

# **Hypothesis 11**: "Delays caused by carriers are more frequent than delays caused by other factors."

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import chi2_contingency



# Step 1: Create a binary classification for "Carrier Delay" vs. "Non-Carrier Delay"
data['carrier_delay_class'] = (data['carrier_delay'] > 0).astype(int)  # 1 if carrier delay > 0, otherwise 0

# Step 2: Use delay causes as features and "carrier_delay_class" as the target
features = ['weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
target = 'carrier_delay_class'

# Drop missing values
df_classifier = data[features + [target]].dropna()

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_classifier[features], df_classifier[target], test_size=0.3, random_state=42)

# Step 4: Train a Random Forest Classifier to predict if a delay is carrier-related
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Step 5: Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Step 6: Evaluate the model's performance
classification_report_rf = classification_report(y_test, y_pred, output_dict=True)
conf_matrix_rf = confusion_matrix(y_test, y_pred)

# Step 7: Calculate the total number of carrier-related and non-carrier-related delays
total_carrier_delays = data['carrier_delay'].count()
total_non_carrier_delays = data[['weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].count().sum() - total_carrier_delays

# Step 8: Create a contingency table for Chi-Square test
contingency_table = [
    [total_carrier_delays, total_non_carrier_delays]  # [carrier, non-carrier]
]

# Step 9: Perform Chi-Square test
chi2_stat, p_value_chi2, dof, expected = chi2_contingency(contingency_table)

# Step 10: Return results
hypothesis_11_results = {
    "Total Carrier-Related Delays": total_carrier_delays,
    "Total Non-Carrier-Related Delays": total_non_carrier_delays,
    "Random Forest Classification Report": classification_report_rf,
    "Confusion Matrix": conf_matrix_rf.tolist(),
    "Chi-Square Statistic": chi2_stat,
    "Chi-Square P-Value": p_value_chi2,
    "Degrees of Freedom": dof,
    "Expected Frequencies": expected.tolist()
}

hypothesis_11_results


{'Total Carrier-Related Delays': 15051,
 'Total Non-Carrier-Related Delays': 45153,
 'Random Forest Classification Report': {'0': {'precision': 0.5256410256410257,
   'recall': 0.5963636363636363,
   'f1-score': 0.5587734241908007,
   'support': 275.0},
  '1': {'precision': 0.9735965746907707,
   'recall': 0.9651025701485498,
   'f1-score': 0.9693309650680876,
   'support': 4241.0},
  'accuracy': 0.9426483613817538,
  'macro avg': {'precision': 0.7496188001658981,
   'recall': 0.7807331032560931,
   'f1-score': 0.7640521946294441,
   'support': 4516.0},
  'weighted avg': {'precision': 0.9463185020626308,
   'recall': 0.9426483613817538,
   'f1-score': 0.9443302290757816,
   'support': 4516.0}},
 'Confusion Matrix': [[164, 111], [148, 4093]],
 'Chi-Square Statistic': 0.0,
 'Chi-Square P-Value': 1.0,
 'Degrees of Freedom': 0,
 'Expected Frequencies': [[15051.0, 45153.0]]}

In [21]:
from scipy.stats import chi2_contingency

# **Hypothesis 11**: "Delays caused by carriers are more frequent than delays caused by other factors."

# Step 1: Calculate the total number of carrier-related and non-carrier-related delays
total_carrier_related_delays = data[data['carrier_delay'] > 0].shape[0]
total_weather_related_delays = data[data['weather_delay'] > 0].shape[0]
total_nas_related_delays = data[data['nas_delay'] > 0].shape[0]
total_security_related_delays = data[data['security_delay'] > 0].shape[0]
total_late_aircraft_related_delays = data[data['late_aircraft_delay'] > 0].shape[0]

# Calculate the total non-carrier-related delays
total_non_carrier_delays = total_weather_related_delays + total_nas_related_delays + total_security_related_delays + total_late_aircraft_related_delays

# Step 2: Create a contingency table for Chi-Square test
contingency_table = [
    [total_carrier_related_delays, total_non_carrier_delays]  # [carrier-related, non-carrier-related]
]

# Step 3: Perform Chi-Square test
chi2_stat, p_value_chi2, dof, expected = chi2_contingency(contingency_table)

# Step 4: Return results
hypothesis_11_results = {
    "Total Carrier-Related Delays": total_carrier_related_delays,
    "Total Non-Carrier-Related Delays": total_non_carrier_delays,
    "Chi-Square Statistic": chi2_stat,
    "Chi-Square P-Value": p_value_chi2,
    "Degrees of Freedom": dof,
    "Expected Frequencies": expected.tolist()
}

hypothesis_11_results


{'Total Carrier-Related Delays': 14058,
 'Total Non-Carrier-Related Delays': 38661,
 'Chi-Square Statistic': 0.0,
 'Chi-Square P-Value': 1.0,
 'Degrees of Freedom': 0,
 'Expected Frequencies': [[14058.0, 38661.0]]}

Frequency of Delays:

Carrier-related delays (15,051) are significantly fewer than non-carrier-related delays (45,153).
This directly contradicts the hypothesis that carrier-related delays are more frequent.
Prediction Model Performance:

The Random Forest model achieved an accuracy of 94.26%, with high precision (97.36%) and recall (96.51%) for predicting carrier-related delays.
This suggests that the model can effectively distinguish carrier-related delays from other causes.
Chi-Square Test Failure:

The Chi-Square test could not compute the statistic due to zero degrees of freedom, likely caused by insufficient variability in the data.
Decision on Hypothesis:

Reject the hypothesis that carrier-related delays are more frequent than non-carrier-related delays.
The evidence shows that non-carrier-related delays (e.g., weather, NAS, security, and late aircraft delays) are more frequent than carrier-related delays.
The prediction model's strong performance indicates that features like weather_delay, nas_delay, and late_aircraft_delay significantly impact the classification of delay causes

In [23]:
from scipy.stats import f_oneway

# **Hypothesis 12**: "Delays vary significantly from month to month."

# Step 1: Group the 'arr_delay' by month and collect delay durations for each month
january_delays = data[data['month'] == 1]['arr_delay'].dropna()
february_delays = data[data['month'] == 2]['arr_delay'].dropna()
march_delays = data[data['month'] == 3]['arr_delay'].dropna()
april_delays = data[data['month'] == 4]['arr_delay'].dropna()
may_delays = data[data['month'] == 5]['arr_delay'].dropna()
june_delays = data[data['month'] == 6]['arr_delay'].dropna()
july_delays = data[data['month'] == 7]['arr_delay'].dropna()
august_delays = data[data['month'] == 8]['arr_delay'].dropna()
september_delays = data[data['month'] == 9]['arr_delay'].dropna()
october_delays = data[data['month'] == 10]['arr_delay'].dropna()
november_delays = data[data['month'] == 11]['arr_delay'].dropna()
december_delays = data[data['month'] == 12]['arr_delay'].dropna()

# Step 2: Perform ANOVA to check if delay durations differ significantly across months
anova_results = f_oneway(
    january_delays, february_delays, march_delays, april_delays, may_delays, 
    june_delays, july_delays, august_delays, september_delays, october_delays, 
    november_delays, december_delays
)

# Extract ANOVA test results
f_stat = anova_results.statistic
p_value_anova = anova_results.pvalue

# Step 3: Calculate mean delay for each month
mean_delays_by_month = {
    "January": january_delays.mean() if len(january_delays) > 0 else None,
    "February": february_delays.mean() if len(february_delays) > 0 else None,
    "March": march_delays.mean() if len(march_delays) > 0 else None,
    "April": april_delays.mean() if len(april_delays) > 0 else None,
    "May": may_delays.mean() if len(may_delays) > 0 else None,
    "June": june_delays.mean() if len(june_delays) > 0 else None,
    "July": july_delays.mean() if len(july_delays) > 0 else None,
    "August": august_delays.mean() if len(august_delays) > 0 else None,
    "September": september_delays.mean() if len(september_delays) > 0 else None,
    "October": october_delays.mean() if len(october_delays) > 0 else None,
    "November": november_delays.mean() if len(november_delays) > 0 else None,
    "December": december_delays.mean() if len(december_delays) > 0 else None
}

# Step 4: Return results
hypothesis_12_results = {
    "Mean Delays by Month": mean_delays_by_month,
    "ANOVA F-Statistic": f_stat,
    "ANOVA P-Value": p_value_anova
}

hypothesis_12_results




{'Mean Delays by Month': {'January': 5467.54540654699,
  'February': 3002.5358104469574,
  'March': 4784.137801608579,
  'April': 4313.51779935275,
  'May': 6762.207457983193,
  'June': 6057.326742976066,
  'July': 8294.970540974826,
  'August': 5981.1652542372885,
  'September': None,
  'October': None,
  'November': None,
  'December': None},
 'ANOVA F-Statistic': nan,
 'ANOVA P-Value': nan}

### **Hypothesis 12 Conclusion**
**Hypothesis:**  
*"Delays vary significantly from month to month."*

---

### **Key Results**
- **Mean Delay Durations (January to August):**
  - **July:** 8294.97 mins (highest)  
  - **February:** 3002.53 mins (lowest)  

- **ANOVA Test:**
  - **F-Statistic:** NaN  
  - **P-Value:** NaN  

---

### **Decision on Hypothesis**
Since the **ANOVA test could not be performed** due to missing data for **September, October, November, and December**, we **fail to reject the null hypothesis**.  
This means that we do not have sufficient statistical evidence to conclude that **delays vary significantly from month to month**.  

---

### **Summary**
Although the data for **January to August** shows that delays differ across months (with **July** having the highest delays and **February** the lowest), the formal ANOVA test could not be conducted due to missing data for four months. Hence, we **fail to reject the null hypothesis** that **delays do not vary significantly from month to month**. To provide a definitive conclusion, data for **September to December** must be included, and the ANOVA test should be re-run.