In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

from order import apps_2024, phase_order, domain_order, color_discrete_map

In [16]:
# Load the feature table from the raw data directory.
df = pd.read_parquet('../data/raw/features.parquet')

# Keep only columns whose 'domain' level label does not contain 'Travel'.
is_travel_column = df.columns.get_level_values('domain').str.contains('Travel')
df = df.loc[:, ~is_travel_column]

# Exclude every column whose 'app' level label is listed in apps_2024.
is_2024_app_column = df.columns.get_level_values('app').isin(apps_2024)
df_filtered = df.loc[:, ~is_2024_app_column]

In [17]:
# Split the filtered table into two row subsets using the 'type' index level.
feature_type = df_filtered.index.get_level_values('type')

# rows whose type is 'Cross-Cutting'
df_cross_cutting = df_filtered.loc[feature_type == 'Cross-Cutting']

# rows whose type is 'Phase-Specific'
df_phase_specific = df_filtered.loc[feature_type == 'Phase-Specific']

In [18]:
# Aggregate rows per 'phase' index level.
# The `axis` keyword is intentionally not passed: grouping along the rows is
# the default, and passing `axis` to DataFrame.groupby is deprecated (it emits
# a FutureWarning and will be removed in a future pandas version).

# phase-specific features: mean per phase
df_grouped_phase = df_phase_specific.groupby(level='phase').mean()

# cross-cutting features: mean per phase
df_grouped_cross = df_cross_cutting.groupby(level='phase').mean()

# all features: summed (not averaged) per phase
df_grouped_all = df_filtered.groupby(level='phase').sum()


The 'axis' keyword in DataFrame.groupby is deprecated and will be removed in a future version.


The 'axis' keyword in DataFrame.groupby is deprecated and will be removed in a future version.


The 'axis' keyword in DataFrame.groupby is deprecated and will be removed in a future version.



In [19]:
def mean_per_phase(grouped):
    """Return the row-wise mean over all columns, ordered by phase_order.

    The result is reindexed to ``phase_order``; NaN entries (including
    phases that are absent from ``grouped``) are replaced by 0.
    """
    row_means = grouped.mean(axis=1)
    ordered = row_means.reindex(phase_order)
    return ordered.fillna(0)


# one averaged series per grouped table, all aligned to phase_order
df_grouped_phase_mean = mean_per_phase(df_grouped_phase)
df_grouped_cross_mean = mean_per_phase(df_grouped_cross)
df_grouped_all_mean = mean_per_phase(df_grouped_all)

In [21]:
# Arrange the columns so that the 'domain' level follows domain_order.
df_grouped_all = df_grouped_all.reindex(columns=domain_order, level='domain')

# Sum every column over all phases and flatten the result into a table with
# one row per column of df_grouped_all, labelled domain / app / feature_count.
df_flat = (
    df_grouped_all
    .sum()
    .reset_index()
    .set_axis(['domain', 'app', 'feature_count'], axis=1)
)



# Box plot of feature_count per domain; points='all' additionally draws every
# row of df_flat as an individual marker next to its box.
fig = px.box(
    df_flat,
    x='domain',
    y='feature_count',
    color='domain',
    points='all',
    color_discrete_map=color_discrete_map,
    template='plotly_white',
)

# Layout: hide the legend and fix the figure height.
fig.update_layout(showlegend=False, height=600)

# y-axis: fixed range and descriptive title.
# NOTE(review): the range is hard-coded to [10, 71]; values outside it would
# not be visible — confirm it still fits the data when the input changes.
fig.update_yaxes(range=[10, 71], title='Number of Features per App')

# x-axis: no title, bold tick labels.
fig.update_xaxes(title='', tickfont={'weight': 'bold'})

# Render the figure in the notebook.
fig.show()

# Export the figure as PDF and PNG (only the figure is written here).
for image_path in ('../images/pdf/12_boxplot.pdf', '../images/png/12_boxplot.png'):
    fig.write_image(image_path)