# First, we load all the data into a single DataFrame so that we can generate the plots we need

In [62]:
import pandas as pd
import plotly.graph_objects as go
from scipy import stats
import numpy as np

# Collection dates (as MMDD codes used in the file/folder names) and the cities covered
dates = ['0320', '0327', '0403', '0410', '0417', '0423', '0501']
cities = ['Paris', 'Berlin', 'Madrid', 'Lisbon', 'Rome', 'Budapest', 'Athens']

# Parse every MMDD code once; the codes carry no year, so it is set to 2023 explicitly
collection_date_for = {
    date: pd.to_datetime(date, format="%m%d").replace(year=2023)
    for date in dates
}

# One dataframe per city, each covering all collection dates
dataframes = []

for city in cities:
    # Read one CSV per collection date and tag its rows with that date
    snapshots = []
    for date in dates:
        csv_path = f'dep&ret_flights/dep&ret_flights_{date}/{city.lower()}_{date}.csv'
        snapshot = pd.read_csv(csv_path)
        snapshots.append(snapshot.assign(collection_date=collection_date_for[date]))

    # Stack the snapshots for this city and label every row with the city name
    city_df = pd.concat(snapshots)
    city_df['city'] = city
    dataframes.append(city_df)

# Stack all cities into the single dataframe used by the rest of the analysis
combined_df = pd.concat(dataframes)


# Then, we pick a color for each city, the same one we used when we plotted the data distribution for the first time. Now we would like to know how the data distribution changes over time, so we draw one KDE curve per collection date for each city and add a slider to step through the dates.

In [32]:
import plotly.graph_objects as go
import numpy as np
from scipy.stats import gaussian_kde
import pandas as pd

# One group of rows per city
grouped = combined_df.groupby('city')

# Fixed line colour for each city
city_color = {'Paris': 'blue', 'Berlin': 'green', 'Madrid': 'red', 'Lisbon': 'purple', 'Rome': 'orange', 'Budapest': 'brown', 'Athens': 'CadetBlue'}

# Build, show and save one slider-driven KDE figure per city
for city, group in grouped:
    # Collection dates in chronological order, plus their YYYY-MM-DD labels
    collection_dates = sorted(group['collection_date'].unique())
    date_labels = [pd.to_datetime(str(raw)).strftime('%Y-%m-%d') for raw in collection_dates]

    fig = go.Figure()

    # One (initially hidden) density curve per collection date
    for raw_date, label in zip(collection_dates, date_labels):
        prices = group.loc[group['collection_date'] == raw_date, 'price_amount']

        # Quartiles of the prices, shown in the hover text
        Q1, median, Q3 = (np.percentile(prices, q) for q in (25, 50, 75))

        # Evaluate the KDE on 500 evenly spaced points spanning the observed price range
        price_grid = np.linspace(min(prices), max(prices), 500)
        density = gaussian_kde(prices)(price_grid)

        curve = go.Scatter(
            x=price_grid,
            y=density,
            mode='lines',
            fill='tozeroy',  # shade the area under the curve
            name=label,
            line=dict(color=city_color[city]),
            visible=False,
            hovertemplate=f"Q1: {Q1}<br>Median: {median}<br>Q3: {Q3}<extra></extra>"
        )
        fig.add_trace(curve)

    # Start with only the earliest collection date on display
    fig.data[0].visible = True

    # One slider step per date; step i shows trace i and hides all the others
    trace_count = len(collection_dates)
    steps = []
    for shown, label in enumerate(date_labels):
        visibility = [position == shown for position in range(trace_count)]
        steps.append(dict(
            method="restyle",
            args=["visible", visibility],
            label=label
        ))

    # Attach the slider and label the figure
    sliders = [dict(
        active=0,
        currentvalue={"prefix": "Collection date: "},
        pad={"t": 50},
        steps=steps
    )]
    fig.update_layout(
        sliders=sliders,
        title_text=f'Animated KDE Plot for {city}',
        xaxis_title='Price Amount',
        yaxis_title='Density',
    )

    # Render the figure inline
    fig.show()

    # Also save it as a standalone HTML file named after the city
    filename = f'output_{city}.html'
    fig.write_html(filename)




# To get a more quantitative sense of how the distributions change over time, we use the Wasserstein distance (`wasserstein_distance`) between consecutive collection dates.

In [63]:
import numpy as np
from scipy.stats import wasserstein_distance
import pandas as pd

# One group of rows per city
grouped = combined_df.groupby('city')

# Rows of [city, date range, distance], filled in below
all_data = []

for city, group in grouped:
    # Collection dates in chronological order
    collection_dates = sorted(group['collection_date'].unique())

    # For each date: its YYYY-MM-DD label paired with the prices collected on that date
    snapshots = [
        (
            pd.to_datetime(str(raw)).strftime('%Y-%m-%d'),
            group[group['collection_date'] == raw]['price_amount'],
        )
        for raw in collection_dates
    ]

    # Compare every collection date with the one immediately after it
    for (earlier_label, earlier_prices), (later_label, later_prices) in zip(snapshots, snapshots[1:]):
        distance = wasserstein_distance(earlier_prices, later_prices)
        date_range = f"{earlier_label} - {later_label}"
        all_data.append([city, date_range, distance])

# Tabulate the distances: one row per city and pair of consecutive dates
df = pd.DataFrame(all_data, columns=['City', 'Date Range', 'Wasserstein Distance'])

# Show the table
print(df)



        City               Date Range  Wasserstein Distance
0     Athens  2023-03-20 - 2023-03-27             10.105596
1     Athens  2023-03-27 - 2023-04-03              9.086664
2     Athens  2023-04-03 - 2023-04-10              8.751244
3     Athens  2023-04-10 - 2023-04-17             12.654902
4     Athens  2023-04-17 - 2023-04-23              4.669020
5     Athens  2023-04-23 - 2023-05-01              7.389467
6     Berlin  2023-03-20 - 2023-03-27             13.182765
7     Berlin  2023-03-27 - 2023-04-03              5.941163
8     Berlin  2023-04-03 - 2023-04-10              9.873239
9     Berlin  2023-04-10 - 2023-04-17             33.887699
10    Berlin  2023-04-17 - 2023-04-23              5.382316
11    Berlin  2023-04-23 - 2023-05-01              7.397256
12  Budapest  2023-03-20 - 2023-03-27             18.592755
13  Budapest  2023-03-27 - 2023-04-03              7.939328
14  Budapest  2023-04-03 - 2023-04-10              7.394751
15  Budapest  2023-04-10 - 2023-04-17   

## We visualise how the Wasserstein distance changes between consecutive collection dates

In [None]:
from plotnine import ggplot, aes, geom_line, facet_wrap, scale_color_manual, theme, element_text,labs,ggsave
# Line colour used for each city
city_color = dict(
    Paris='blue',
    Berlin='green',
    Madrid='red',
    Lisbon='purple',
    Rome='orange',
    Budapest='brown',
    Athens='CadetBlue',
)


def _short_date_range(date_range):
    """Shorten 'YYYY-MM-DD - YYYY-MM-DD' to 'MMDD-MMDD' (month and day only)."""
    month_days = (iso_date[5:].replace('-', '') for iso_date in date_range.split(' - '))
    return '-'.join(month_days)


# Compact axis label for every date range
df['Date Range Short'] = df['Date Range'].apply(_short_date_range)

# Pieces of the chart: one line per city, all drawn on the same axes
line_mapping = aes(x='Date Range Short', y='Wasserstein Distance', group='City', color='City')
rotated_x_labels = theme(axis_text_x=element_text(rotation=90, hjust=1))
axis_titles = labs(x='Date Range', y='Wasserstein Distance', color='City')

Wasserstein = (
    ggplot(df, line_mapping)
    + geom_line()
    + scale_color_manual(values=city_color)
    + rotated_x_labels
    + axis_titles
)

# Write the chart to disk as a PNG
Wasserstein.save(filename="Wasserstein.png", dpi=300)