In [None]:
import functions as f

In [None]:
# Distinct `class` values observed for each airline. Left as the bare last
# expression so the notebook renders the resulting Series.
f.df.groupby('airline')['class'].unique()

In [None]:
# Economy rows with no stops ('stops' is compared against the string '0').
df_nonstop = f.df.loc[f.df['class'].eq('Economy') & f.df['stops'].eq('0')]

In [None]:
# Economy rows with exactly one stop ('stops' is compared against the string '1').
df_1stop = f.df.loc[f.df['class'].eq('Economy') & f.df['stops'].eq('1')]

In [None]:
# Economy rows labelled '2+' in the 'stops' column (two or more stops).
df_2more_stop = f.df.loc[f.df['class'].eq('Economy') & f.df['stops'].eq('2+')]

In [None]:
# Run the project's price-by-airline helper on the non-stop Economy subset.
# NOTE(review): price_airline is defined in functions.py; presumably it plots or
# summarises price per airline — confirm there.
f.price_airline(df_nonstop)

In [None]:
# Run the IQR outlier helper separately inside each airline group of the
# non-stop subset (presumably it returns the outlying rows — confirm in functions.py).
outliers = (
    df_nonstop
    .groupby('airline', group_keys=False)
    .apply(f.detect_outliers_iqr)
)
print(f"Total outliers detected: {len(outliers)}")

In [None]:
# Outlier count relative to the size of the non-stop subset. `outliers` must be
# the result of the non-stop detection cell, so run that cell first.
print(f"Outlier percentage: {len(outliers) / len(df_nonstop):.2%}")

In [None]:
# Per-airline normality check via the project helper (presumably Shapiro-Wilk,
# going by its name — confirm in functions.py).
# NOTE(review): this is fed the full f.df rather than the df_nonstop subset used
# by the surrounding cells — confirm that is intended.
results = f.shapiro_by_airline(f.df)

In [None]:
# One array of prices per airline, taken from the full f.df (not a stops subset).
groups = [prices.values for _, prices in f.df.groupby("airline")["price"]]

# One-way ANOVA across the airline groups.
f_stat, p_value = f.f_oneway(*groups)

print("F-statistic:", f_stat)
print("p-value:", p_value)

In [None]:
# Per-airline price summary for non-stop flights with a two-sided 95% interval
# built from the normal quantile times the standard error of the mean.
summary_df = (
    df_nonstop
    .groupby('airline')['price']
    .agg(['mean', 'std', 'count', 'sem'])
    .reset_index()
    .assign(
        lower_bound=lambda s: s['mean'] - (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        upper_bound=lambda s: s['mean'] + (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        error=lambda s: (f.st.norm.ppf(1-(0.05/2))*s['sem']),
    )
    # Most expensive airline first.
    .sort_values('mean', ascending=False)
)
display(summary_df)

In [None]:
# Plot the per-airline summary built in the previous cell (non-stop subset).
# The module `f` itself is passed as the second argument, as the helper's signature expects.
f.plot_airline_means(summary_df, f)

In [None]:
# Project helper applied to the non-stop subset; its return value is kept in
# `mean_prices` (presumably the per-airline mean prices it plots — confirm in functions.py).
mean_prices = f.plot_mean_price_by_airline(df_nonstop, f)

In [None]:
# Same price-by-airline helper as for non-stop flights, now on the 1-stop Economy subset.
f.price_airline(df_1stop)

In [None]:
# Run the IQR outlier helper inside each airline group of the 1-stop subset.
# This rebinds `outliers`, replacing the non-stop result.
outliers = (
    df_1stop
    .groupby('airline', group_keys=False)
    .apply(f.detect_outliers_iqr)
)
print(f"Total outliers detected: {len(outliers)}")

In [None]:
# Outlier count relative to the 1-stop subset. The denominator has to be the
# same frame the outliers were detected in (df_1stop); dividing by the size of
# the non-stop subset would report a ratio between two unrelated populations.
print(f"Outlier percentage: {len(outliers) / len(df_1stop):.2%}")

In [None]:
# One price Series per airline (airlines in order of first appearance) for the
# 1-stop subset, compared with a Kruskal-Wallis test.
groups = [
    df_1stop.loc[df_1stop['airline'] == carrier, 'price']
    for carrier in df_1stop['airline'].unique()
]

stat, p = f.kruskal(*groups)

print("Kruskal–Wallis H-statistic:", stat)
print("p-value:", p)

In [None]:
# Project helper version of the per-airline Kruskal test on the 1-stop subset.
# NOTE(review): looks like it repeats the manual test in the previous cell — confirm
# whether both are needed.
stat, p_value = f.kruskal_by_airline(df_1stop, f)

In [None]:
# Same helper as for the non-stop subset, now on 1-stop flights; rebinds `mean_prices`.
mean_prices = f.plot_mean_price_by_airline(df_1stop, f)

In [None]:
# Per-airline price summary for 1-stop flights with a two-sided 95% interval
# built from the normal quantile times the standard error of the mean.
summary_df = (
    df_1stop
    .groupby('airline')['price']
    .agg(['mean', 'std', 'count', 'sem'])
    .reset_index()
    .assign(
        lower_bound=lambda s: s['mean'] - (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        upper_bound=lambda s: s['mean'] + (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        error=lambda s: (f.st.norm.ppf(1-(0.05/2))*s['sem']),
    )
    # Most expensive airline first.
    .sort_values('mean', ascending=False)
)
display(summary_df)

In [None]:
# Plot the per-airline summary built in the previous cell (1-stop subset).
f.plot_airline_means(summary_df, f)

In [None]:
# Same price-by-airline helper, now on the 2-or-more-stops Economy subset.
f.price_airline(df_2more_stop)

In [None]:
# Run the IQR outlier helper inside each airline group of the 2+-stops subset.
# This rebinds `outliers`, replacing the 1-stop result.
outliers = (
    df_2more_stop
    .groupby('airline', group_keys=False)
    .apply(f.detect_outliers_iqr)
)
print(f"Total outliers detected: {len(outliers)}")

In [None]:
# Outlier count relative to the 2+-stops subset. The denominator has to be the
# same frame the outliers were detected in (df_2more_stop); dividing by the size
# of the non-stop subset would report a ratio between two unrelated populations.
print(f"Outlier percentage: {len(outliers) / len(df_2more_stop):.2%}")

In [None]:
# Per-airline Kruskal test (project helper) on the 2+-stops subset; rebinds `stat`/`p_value`.
stat, p_value = f.kruskal_by_airline(df_2more_stop, f)

In [None]:
# Same helper as for the other subsets, now on 2+-stops flights; rebinds `mean_prices`.
mean_prices = f.plot_mean_price_by_airline(df_2more_stop, f)

In [None]:
# Per-airline price summary for 2+-stops flights with a two-sided 95% interval
# built from the normal quantile times the standard error of the mean.
summary_df = (
    df_2more_stop
    .groupby('airline')['price']
    .agg(['mean', 'std', 'count', 'sem'])
    .reset_index()
    .assign(
        lower_bound=lambda s: s['mean'] - (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        upper_bound=lambda s: s['mean'] + (f.st.norm.ppf(1-(0.05/2))*s['sem']),
        error=lambda s: (f.st.norm.ppf(1-(0.05/2))*s['sem']),
    )
    # Most expensive airline first.
    .sort_values('mean', ascending=False)
)
display(summary_df)

In [None]:
# Plot the per-airline summary built in the previous cell (2+-stops subset).
f.plot_airline_means(summary_df, f)

In [None]:
# Size of the non-stop Economy subset.
total_rows = len(df_nonstop)

# Count rows whose lead time is under 15 days by summing the boolean mask.
last_minute = int((df_nonstop['lead_time_days'] < 15).sum())

# Express the count as a percentage of the subset.
percentage_last_minute = (last_minute / total_rows) * 100

print(f"Percentage of bookings made less than 15 days before departure: {percentage_last_minute:.2f}%")

In [None]:
# Work only on rows where both lead time and price are present.
df_valid = df_nonstop.loc[:, ['lead_time_days', 'price']].dropna()

# Rank correlation between lead time and price.
corr, pval = f.spearmanr(df_valid['lead_time_days'], df_valid['price'])

print(f"Spearman correlation = {corr:.3f}")
print(f"p-value = {pval:.5f}")

# Decision at the 5% significance level.
print(
    "Reject H0 → There is a statistically significant relationship between lead time and price."
    if pval < 0.05
    else "Fail to reject H0 → No statistically significant relationship found."
)

In [None]:
# Project plotting helper for price against lead time, on the non-stop subset.
f.plot_price_trend_by_lead_time(df_nonstop, f)

In [None]:
# Rebind f.df to an (airline, duration, price) frame with duration in minutes.
#
# Convert duration ("HH:MM:SS") → total minutes, but only while the column is
# still non-numeric. Without this guard, re-running the cell would pass the
# already-converted minutes back to pd.to_timedelta, which reads bare numbers
# as nanoseconds and silently corrupts the values.
duration_minutes = f.df['duration']
if not f.pd.api.types.is_numeric_dtype(duration_minutes):
    duration_minutes = f.pd.to_timedelta(duration_minutes).dt.total_seconds() / 60

# Keep relevant columns. `.assign` writes into the new three-column frame, so
# the frame f.df referred to before this cell is not modified in place.
duration_price_df = f.df[['airline', 'duration', 'price']].assign(duration=duration_minutes)

# Remove invalid rows (zero or null durations/prices); NaN fails the `> 0` test.
# NOTE: later cells read f.df, so the module attribute is still rebound here.
# Cells above that need the dropped columns (class, stops, ...) cannot be
# re-run after this one without reloading the data.
f.df = duration_price_df[(duration_price_df['duration'] > 0) & (duration_price_df['price'] > 0)]

In [None]:
# Project plotting helper for price against duration. Unlike the other plotting
# helpers in this notebook, the module `f` is the FIRST argument and the frame the second.
f.plot_price_vs_duration(f, f.df)

In [None]:
# Both coefficients are computed on the same (duration, price) pair of columns.
duration_and_price = (f.df['duration'], f.df['price'])

# Pearson measures linear association; Spearman measures monotonic association.
pearson_r, pearson_p = f.pearsonr(*duration_and_price)
spearman_r, spearman_p = f.spearmanr(*duration_and_price)

print(f"Pearson r = {pearson_r:.3f} (p = {pearson_p:.3e})")
print(f"Spearman r = {spearman_r:.3f} (p = {spearman_p:.3e})")

In [None]:
# Reload the cleaned dataset from the path stored in the project config
# (semicolon-separated file).
full_clean_path = f.config['data']['clean_data']['full_clean']
df = f.pd.read_csv(full_clean_path, sep=";")

In [None]:
# Collapse the '2+' label to '2' so the column can be parsed as a number further
# down; flights with more than two stops are therefore counted as exactly 2.
# Note: this overwrites the 'stops' column on `df` itself.
df['stops'] = df['stops'].replace('2+', '2')

In [None]:
# Stops-vs-price frame: keep the two relevant columns, coerce 'stops' to numeric
# (unparseable values become NaN), then drop rows missing either value.
df_stops = (
    df[['stops', 'price']]
    .assign(stops=lambda frame: f.pd.to_numeric(frame['stops'], errors='coerce'))
    .dropna(subset=['stops', 'price'])
)

In [None]:
# Project plotting helper for price grouped by number of stops.
f.plot_price_by_stops(df_stops, f)

In [None]:
# Project plotting helper; presumably draws price densities per stops value — confirm in functions.py.
f.plot_price_density_by_stops(df_stops, f)

In [None]:
# Label each row 'Direct' when it has zero stops and 'Connecting' otherwise.
df_stops['flight_type'] = df_stops['stops'].eq(0).map({True: 'Direct', False: 'Connecting'})

In [None]:
# Project plotting helper; presumably shows the average price per stops value — confirm in functions.py.
f.plot_avg_price_by_stops(df_stops, f)

In [None]:
# Mean, standard deviation and row count of price for each number of stops.
price_by_stops = (
    df_stops
    .groupby('stops')['price']
    .describe()
    .loc[:, ['mean', 'std', 'count']]
)
print(price_by_stops)

In [None]:
# Compare prices of direct vs connecting flights with a one-way ANOVA.

# Create the 'flight_type' variable (recomputed here so this cell does not depend
# on the labelling cell having been run).
df_stops['flight_type'] = df_stops['stops'].apply(lambda x: 'Direct' if x == 0 else 'Connecting')

# Split prices into two groups
direct_prices = df_stops[df_stops['flight_type'] == 'Direct']['price']
connecting_prices = df_stops[df_stops['flight_type'] == 'Connecting']['price']

# Run one-way ANOVA through the public scipy.stats entry point. `f.st` appears to
# be scipy.stats (it exposes `norm.ppf` elsewhere in this notebook); reaching the
# function via the nested `stats` submodule relies on a deprecated SciPy namespace.
f_stat, p_value = f.st.f_oneway(direct_prices, connecting_prices)

print(f"ANOVA F-statistic: {f_stat:.3f}, p-value: {p_value:.3e}")

In [None]:
# Class-vs-price frame: normalise the class label (trim whitespace, capitalise
# the first letter and lowercase the rest) and drop rows missing either value.
df_class = (
    df[['class', 'price']]
    .assign(**{'class': lambda frame: frame['class'].str.strip().str.capitalize()})
    .dropna(subset=['class', 'price'])
)

In [None]:
# Project plotting helper for price by class, using the cleaned df_class frame.
f.plot_price_by_class(df_class, f)

In [None]:
# Project plotting helper; presumably shows the average price per class.
# NOTE(review): this receives the raw `df`, not the normalised `df_class` used by
# the neighbouring class plot — confirm the helper does not rely on cleaned labels.
f.plot_avg_price_by_class(df, f)

In [None]:
# Project plotting helper; presumably shows the price distribution per class.
# NOTE(review): this receives the raw `df`, not the normalised `df_class` — confirm
# the helper does not rely on cleaned class labels.
f.plot_price_distribution_by_class(df, f)

In [None]:
# Mean, standard deviation and row count of price for each class label.
price_by_class = (
    df_class
    .groupby('class')['price']
    .describe()
    .loc[:, ['mean', 'std', 'count']]
)
print(price_by_class)

In [None]:
# Compare Economy and Business prices with Welch's t-test (equal_var=False, so
# the two groups are not assumed to share a variance).
economy = df_class.loc[df_class['class'] == 'Economy', 'price']
business = df_class.loc[df_class['class'] == 'Business', 'price']

# Call the public scipy.stats entry point. `f.st` appears to be scipy.stats (it
# exposes `norm.ppf` elsewhere in this notebook); reaching the function via the
# nested `stats` submodule relies on a deprecated SciPy namespace.
t_stat, p_value = f.st.ttest_ind(economy, business, equal_var=False)
print(f"T-statistic: {t_stat:.3f}, p-value: {p_value:.3e}")