# To better understand our data, we first check how it is distributed.

In [3]:
import pandas as pd
import datetime
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Directory that holds the per-city flight CSV files. Defined once so the
# dataset location can be changed without editing every read_csv call.
DATA_DIR = '../dep&ret_flights/dep&ret_flights_0320'

# One dataframe per destination city.
paris = pd.read_csv(f'{DATA_DIR}/paris_0320.csv')
berlin = pd.read_csv(f'{DATA_DIR}/berlin_0320.csv')
madrid = pd.read_csv(f'{DATA_DIR}/madrid_0320.csv')
lisbon = pd.read_csv(f'{DATA_DIR}/lisbon_0320.csv')
rome = pd.read_csv(f'{DATA_DIR}/rome_0320.csv')
bud = pd.read_csv(f'{DATA_DIR}/budapest_0320.csv')
athens = pd.read_csv(f'{DATA_DIR}/athens_0320.csv')

In [3]:
# Every city frame, so the same preprocessing is applied to all of them
dfs = [paris, berlin, madrid, lisbon, rome, bud, athens]

# Timestamp columns that read_csv loaded without date parsing
columns_to_convert = ['depFlight_departure', 'depFlight_arrival', 'retFlight_departure', 'retFlight_arrival']

for flights in dfs:
    # Parse each timestamp column in place
    for timestamp_column in columns_to_convert:
        flights[timestamp_column] = pd.to_datetime(flights[timestamp_column])
    # Month number of the outbound departure
    flights['departure_month'] = flights['depFlight_departure'].dt.month

## First, we check the overall data distribution

In [None]:
from plotnine import *

# Display name -> dataframe for every city
city_dfs = {
    'Paris': paris, 'Berlin': berlin, 'Madrid': madrid, 'Lisbon': lisbon,
    'Rome': rome, 'Budapest': bud, 'Athens': athens,
}

# Tag each frame (in place) with the name of its city
for city_name, city_frame in city_dfs.items():
    city_frame['city'] = city_name

# The tagged frames, in the same order as city_dfs
modified_dfs = list(city_dfs.values())

# Stack all cities into one long dataframe
combined_df = pd.concat(modified_dfs)

# Fill colour used for each city
color_mapping = dict(
    Paris='blue',
    Berlin='green',
    Madrid='red',
    Lisbon='purple',
    Rome='orange',
    Budapest='brown',
    Athens='CadetBlue',
)

# Overlaid price density curves, one per city
distribution = (
    ggplot(combined_df, aes(x='price_amount', fill='factor(city)'))
    + geom_density(alpha=0.4)
    + scale_fill_manual(values=color_mapping, guide=guide_legend(title='City'))
    + labs(x='Price Amount', y='Density', title='Price Amount KDE for Cities')
    + theme_bw()
)

# Write the figure to disk
distribution.save("distribution.png", dpi=300)


## And then check the distribution for each city

In [None]:
from plotnine import (
    ggplot,
    aes,
    after_stat,
    geom_density,
    geom_histogram,
    geom_vline,
    geom_rect,
    labs,
    annotate,
    theme_tufte
)

In [None]:
# Display name -> dataframe; insertion order drives the plotting loop below
city_dfs = {
    'Paris': paris,
    'Berlin': berlin,
    'Madrid': madrid,
    'Lisbon': lisbon,
    'Rome': rome,
    'Budapest': bud,
    'Athens': athens,
}
# One colour per city, matched to city_dfs by position
colors = [
    'blue',
    'green',
    'red',
    'purple',
    'orange',
    'brown',
    'CadetBlue',
]

In [None]:
# One price histogram per city, with the interquartile range shaded and the
# median marked in that city's colour
for position, (city, frame) in enumerate(city_dfs.items()):
    # colors is aligned with city_dfs by position
    city_color = colors[position]

    lower_quartile, middle, upper_quartile = (
        frame['price_amount'].quantile(level) for level in (0.25, 0.5, 0.75)
    )

    plot = (
        ggplot(frame, aes('price_amount'))
        + geom_histogram(fill='gray', color='white', bins=30, alpha=0.5)
        # Shaded band from Q1 to Q3 spanning the full height of the panel
        + annotate(geom_rect, xmin=lower_quartile, xmax=upper_quartile, ymin=0, ymax=np.inf, fill=city_color, alpha=0.5)
        # Vertical line at the median
        + annotate(geom_vline, xintercept=middle, color=city_color, size=1)
        + labs(x='Price Amount', y='Frequency', title=f'Price Amount Distribution for {city}')
        + theme_bw()
        + theme(legend_position='top_right', legend_title=element_blank())
        + guides(fill=False, color=False)
        + scale_fill_manual(values=[city_color] * 3, labels=['Q1', 'Median', 'Q3'])
    )

    # Write the figure to disk, then render it in the notebook
    plot.save(f'distribution_{city}.png')
    print(plot)

## We found heavy tails in every city's distribution, which means there are many extremely high ticket prices. We therefore carried out an outlier analysis.

In [7]:
def get_outliers(df, column, factor=1.5, id_column='id'):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A row is an outlier when its value lies below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``. Rows with a missing value in ``column``
    are never reported, because both comparisons evaluate to False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width.
    id_column : str or None, default 'id'
        Column used to de-duplicate the result (the first row per value is
        kept). Pass ``None`` to keep every outlier row.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the outlier rows, so callers can add columns
        to it without writing into a slice of ``df``.

    Raises
    ------
    KeyError
        If ``column`` or ``id_column`` is not a column of ``df``.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
    outliers = df[is_outlier]

    # Keep one row per identifier unless de-duplication is disabled
    if id_column is not None:
        outliers = outliers.drop_duplicates(subset=[id_column])

    return outliers.copy()

In [8]:
# Price outliers for every city, using get_outliers' default IQR factor
(
    paris_out,
    berlin_out,
    mad_out,
    lis_out,
    rome_out,
    bud_out,
    athens_out,
) = (
    get_outliers(city_frame, 'price_amount')
    for city_frame in (paris, berlin, madrid, lisbon, rome, bud, athens)
)

## Now, we carry out the outlier analysis by each city

In [None]:
from plotnine import (
    ggplot,
    aes,
    geom_bar,
    facet_wrap,
    scale_fill_manual,
    labs,
    theme,
    element_text,
)

# Outlier frames with their display names and colours, matched by position
dfs = [paris_out, berlin_out, mad_out, lis_out, rome_out, bud_out, athens_out]
df_names = ["Paris", "Berlin", "Madrid", "Lisbon", "Rome", "Budapest", "Athens"]
city_colors = ['blue', 'green', 'red', 'purple', 'orange', 'brown', 'CadetBlue']

# Label each outlier frame (in place) with its city
for city_label, outlier_frame in zip(df_names, dfs):
    outlier_frame['city'] = city_label

# Stack every city's outliers into one frame with a fresh index
df_all = pd.concat(dfs, ignore_index=True)

# City name -> fill colour
city_color_dict = {name: shade for name, shade in zip(df_names, city_colors)}

# Outlier counts per departure month, one facet per city
month = (
    ggplot(df_all) +
    aes(x='departure_month', fill='city') +
    geom_bar(position='dodge') +
    # scales='free_y' lets every facet use its own y range
    facet_wrap('~ city', scales='free_y', ncol=2) +
    scale_fill_manual(values=city_color_dict) +
    labs(title="Number of Outliers per Month", x="Month", y="Number of Outliers") +
    theme(
        plot_title=element_text(size=16),
        strip_text_x=element_text(size=11),
    )
)
month.save("outliers_per_month.png", height=25, width=12.5, dpi=96)

In [None]:
# Break each carrier name after its first word so the tick labels fit under
# the bars. The .str accessor leaves missing names as NaN instead of raising,
# and names that already contain a newline are left alone so re-running this
# cell does not wrap a second space.
carrier_names = df_all['depFlight_name']
carrier_wrapped = carrier_names.str.contains('\n', regex=False, na=False)
df_all['depFlight_name'] = carrier_names.where(
    carrier_wrapped,
    carrier_names.str.replace(' ', '\n', n=1, regex=False),
)

# Outlier counts per departure carrier, one facet per city
carrier=(
    ggplot(df_all)
    + aes(x='depFlight_name', fill='city')
    + geom_bar(position='dodge')
    + facet_wrap('~ city', scales='free',ncol=2)  # every facet gets its own x and y range
    + scale_fill_manual(values=city_color_dict)
    + labs(title="Number of Outliers per Carrier", x="Carriers", y="Number of Outliers")
    + theme(
        plot_title=element_text(size=16),
        strip_text_x = element_text(size = 11),
        axis_text_x=element_text(rotation=45)
    )
)
carrier.save("outliers_carrier.png", height= 25, width = 12.5, dpi=96)

In [None]:
# Break each airport name after its first word so the tick labels fit under
# the bars. The .str accessor leaves missing names as NaN instead of raising,
# and names that already contain a newline are left alone so re-running this
# cell does not wrap a second space.
airport_names = df_all['depFlight_origin.name']
airport_wrapped = airport_names.str.contains('\n', regex=False, na=False)
df_all['depFlight_origin.name'] = airport_names.where(
    airport_wrapped,
    airport_names.str.replace(' ', '\n', n=1, regex=False),
)

# Outlier counts per origin airport, one facet per city
airport=(
    ggplot(df_all)
    + aes(x='depFlight_origin.name', fill='city')
    + geom_bar(position='dodge')
    + facet_wrap('~ city', scales='free',ncol=2)  # every facet gets its own x and y range
    + scale_fill_manual(values=city_color_dict)
    + labs(title="Number of Outliers per Airport", x="Airports", y="Number of Outliers")
    + theme(
        plot_title=element_text(size=16),
        strip_text_x = element_text(size = 11),
        axis_text_x=element_text(rotation=45)
    )
)
# Explicit extension, consistent with the other figures saved in this notebook
airport.save("outliers_airport.png", height= 25, width = 12.5, dpi=96)