In [169]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px 
import plotly.graph_objects as go
from ipywidgets import widgets

In [170]:
# Seaborn styling for the notebook: the plotting context plus one sequential
# ('Blues') and one qualitative ('Set2') palette.
sns.set_context('notebook')
my_cont_palette = sns.color_palette('Blues')
my_cat_palette = sns.color_palette('Set2')

In [171]:
# Folder holding the input CSV files and the names of the two datasets.
data_folder = "./data/"
credit_score_file = "credit_score.csv"   # multidimensional dataset
disasters_file = "disasters_data.csv"    # hierarchical dataset

# Part 2: Exploring multidimensional and hierarchical data with interaction

👉 **TODO 2.1:**  Choose a **multidimensional** dataset and explore it by creating interactive visualizations with Plotly. In your exploration, make use of at least one chart type for multidimensional data. See the Plotly overview of chart types [here](https://plotly.com/python/plotly-express/). 

At the end of your exploration, write a short summary that reflects on the interactions you used and how they impacted your exploration (in addition to the reflections per chart, as before). For example, you could mention if they helped you to identify a specific pattern or gain a specific insight (or not).

My multidimensional dataset was downloaded from Kaggle: https://www.kaggle.com/datasets/sujithmandala/credit-score-classification-dataset

It describes the credit score classification of people considering several parameters

In [172]:
# Load the credit score dataset.
mdm_df = pd.read_csv(f"{data_folder}{credit_score_file}")

In [173]:
# Preview the first five rows.
mdm_df.head(n=5)

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,Female,50000,Bachelor's Degree,Single,0,Rented,High
1,30,Male,100000,Master's Degree,Married,2,Owned,High
2,35,Female,75000,Doctorate,Married,1,Owned,High
3,40,Male,125000,High School Diploma,Single,0,Owned,High
4,45,Female,100000,Bachelor's Degree,Married,3,Owned,High


In [174]:
# Convert Credit Score column to numbers
target = 'Credit Score'

# Numeric encoding of the credit score labels. The inverse mapping is derived
# from it so the two dictionaries cannot drift apart.
score_to_num = {'High': 1.0, 'Average': 0.5, 'Low': 0.0}
num_to_score = {num: label for label, num in score_to_num.items()}

# Encode only while the column still holds labels: re-running this cell on the
# already encoded column would otherwise fail with a KeyError.
if not pd.api.types.is_numeric_dtype(mdm_df[target]):
    mdm_df[target] = mdm_df[target].apply(lambda label: score_to_num[label])

# Convert Education column
def convertion(level):
    """Shorten an education level label.

    Labels with more than two words keep their first two words
    (e.g. "High School Diploma" -> "High School"). Shorter labels keep only
    their first word, with a trailing possessive "'s" removed
    (e.g. "Bachelor's Degree" -> "Bachelor").

    Parameters
    ----------
    level : str
        Education label.

    Returns
    -------
    str
        The shortened label. A blank label is returned unchanged.
    """
    words = level.split()
    if not words:
        # Nothing to shorten; also avoids an IndexError on words[0].
        return level
    if len(words) > 2:
        return ' '.join(words[:2])

    first = words[0]
    # str.rstrip("'s") removes any run of trailing "'" and "s" characters
    # (e.g. "Business" -> "Busine"), so drop the exact suffix instead.
    if first.endswith("'s"):
        first = first[:-2]
    return first

# Replace every education label with its shortened form.
mdm_df['Education'] = mdm_df['Education'].map(convertion)

In [175]:
# Columns shown in the parallel categories plot and in the scatter matrix
categorical_columns = ['Home Ownership', 'Gender', 'Marital Status', 'Education']
numerical_columns = ['Number of Children', 'Income', 'Age']

# Colouring shared by both plots: the encoded credit score and a three-stop
# colour scale with stops at 0.0, 0.5 and 1.0
color = mdm_df[target]
scale = [
    [0.0, 'rgb(243, 231, 155)'],
    [0.5, 'rgb(235, 127, 134)'],
    [1.0, 'rgb(92, 83, 165)'],
]

# Create dimensions and set up for interactions
def create_dimension(df, column):
    """Build a parallel-categories dimension from ``df[column]``, labelled with the column name."""
    column_values = df[column]
    return go.parcats.Dimension(label=column, values=column_values)

def create_num_dimension(df, column):
    """Build a scatter-matrix (SPLOM) dimension from ``df[column]``, labelled with the column name."""
    column_values = df[column]
    return go.splom.Dimension(label=column, values=column_values)

def draw_parallel(df):
    """Build a parallel categories figure for ``df``.

    The first dimension is the encoded ``target`` column, displayed with the
    labels from ``num_to_score``; it is followed by one dimension per entry of
    ``categorical_columns``. Ribbons are coloured by the target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to draw; must contain ``target`` and every column listed in
        ``categorical_columns``.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        Figure holding a single Parcats trace.
    """
    target_dim = go.parcats.Dimension(values=df[target], label=target,
                                      ticktext=list(num_to_score.values()),
                                      categoryarray=list(num_to_score.keys()))
    dimensions = [target_dim] + [create_dimension(df, col) for col in categorical_columns]

    line = {
        'color': df[target],
        'colorscale': scale,
        'shape': 'hspline',
        # Pin the colour range to the full score range. Without it the range
        # is taken from the rows passed in, so a filtered subset (e.g. only
        # 'Low' and 'Average') would show the same score in a different colour.
        'cmin': min(num_to_score),
        'cmax': max(num_to_score),
    }

    figure = go.FigureWidget(data=[go.Parcats(dimensions=dimensions,
                                              line=line,
                                              labelfont={'size': 18, 'family': 'Times'},
                                              tickfont={'size': 15, 'family': 'Times'},
                                              arrangement='freeform')])

    return figure


# Scatter matrix (SPLOM) over the numerical columns, coloured by credit score
num_dimensions = [create_num_dimension(mdm_df, column) for column in numerical_columns]

# Per-point text: the credit score label of every row
textd = [num_to_score[score] for score in mdm_df[target]]

fig1 = go.FigureWidget(data=[
    go.Splom(
        dimensions=num_dimensions,
        text=textd,
        marker={
            'color': mdm_df[target],
            'colorscale': scale,
            'size': 6,
            'line': {'width': 0.5, 'color': 'rgb(230,230,230)'},
        },
    )
])

title = "Scatterplot Matrix (SPLOM) for Credit Score Dataset:"
fig1.update_layout(
    title=title,
    width=600,
    height=600,
    dragmode='select',
    hovermode='closest',
)

# Parallel categories plot over the categorical columns, drawn from all rows
fig2 = draw_parallel(mdm_df)

title_parallel = "Credit Score with respect to categorical features"
fig2.update_layout(
    title=title_parallel,
    width=900,
    height=500,
    dragmode='select',
    hovermode='closest',
)


# Bind two plots to extend interaction
unique_idx = []
def update_colors(trace, points, state):
    """Selection callback: redraw the parallel categories plot from the selected points.

    Stores a copy of the selected point indices in the module-level
    ``unique_idx`` and, when the selection is not empty, replaces the traces of
    ``fig2`` with a plot built from the selected rows of ``mdm_df``.

    Parameters
    ----------
    trace : the trace that emitted the event (unused).
    points : event payload; ``points.point_inds`` holds the selected indices.
    state : third callback argument (unused).
    """
    global unique_idx
    selected = points.point_inds
    unique_idx = selected.copy()

    if len(selected) > 0:
        fig2.data = []
        # point_inds are positions within the trace data, so rows are picked
        # by position (.iloc) rather than by index label (.loc); the two only
        # coincide while mdm_df keeps its default 0..n-1 index.
        fig2.add_traces(draw_parallel(mdm_df.iloc[selected]).data)
        
def set_by_default(trace, points):
    """Deselect callback: rebuild the parallel categories plot in ``fig2`` from all rows of ``mdm_df``."""
    fig2.data = []
    rebuilt = draw_parallel(mdm_df)
    fig2.add_traces(rebuilt.data)
    
    
# Hook the scatter-matrix selection events up to the parallel categories plot
splom_trace = fig1.data[0]
splom_trace.on_selection(update_colors)
splom_trace.on_deselect(set_by_default)

# Show both figures stacked vertically
display(widgets.VBox(children=[fig1, fig2]))

VBox(children=(FigureWidget({
    'data': [{'dimensions': [{'label': 'Number of Children',
                   …

The dataset has 7 dimensions, which were split into numerical and categorical ones. 
Credit Score is our Target Value.

As a starting point, the parallel sets plot shows that people with higher education usually have a better credit score than others.

The most powerful categorical feature is home ownership. People who own their homes usually have a high credit score, while people who rent are mostly at the average and low levels.
An interesting fact that can be noticed from the visualization is that gender plays a role if a person does not have higher education, a partner, or their own property. In this case women have a low credit score and men have an average one. We might suspect that the number of children has an impact here. However, using interaction we can see that the inequality is present even for people with 0 children.

The scatter matrix is drawn in order to show the numerical features.

### Available interactions
##### Parallel sets
1) The user is able to switch dimensions and categories in order to have a look at the pairs of categorical features

2) The user can have a look at the particular set in order to get the count of people within set

##### Scatter matrix
1) The user can zoom camera at any particular scatter plot

2) The user can select a region of points, and these points will be highlighted in the other scatter plots
(lasso selection, box selection)

There is a connection between the scatter matrix and the parallel sets. If the user selects a region in a scatter plot, the parallel sets plot is adapted to the selected points. It can be used as a filter.
For example, the user may want to see the situation for people aged up to 35 with a salary <= 75_000. In this case they can select this region in the (Age, Income) scatter plot and the parallel sets plot will be updated.

If the user drops selection (double click on any scatter plot), then the parallel set will be redrawn using all data

The presented visualization works better for larger amounts of data

👉 **TODO 2.2:**  Choose a **hierarchical** dataset and explore it with Plotly. In your exploration, make use of at least one chart type for hierarchical data. Depending on your dataset, we recommend to use a [treemap](https://plotly.com/python/treemaps/) or a [sunburst chart](https://plotly.com/python/sunburst-charts/) or a [tree plot](https://plotly.com/python/tree-plots/).

At the end of your exploration, write a short summary that reflects on the interactions you used and how they impacted your exploration (in addition to the reflections per chart, as before). For example, you could mention if they helped you to identify a specific pattern or gain a specific insight (or not).

The dataset was taken from Kaggle: https://www.kaggle.com/datasets/brsdincer/all-natural-disasters-19002021-eosdis

In [176]:
# Load the natural disasters dataset.
df = pd.read_csv(f"{data_folder}{disasters_file}")

In [177]:
# let's identify interesting columns and rename them
columns = ['Year', 'Continent', 'Country', 'Disaster Subgroup', 'Disaster Type', 'Total Deaths', "Total Damages ('000 US$)"]

new_column_names = ['Year', 'Continent', 'Country', 'Disaster Subgroup','Disaster Type', 'Deaths', "Damage(k.$)"]

# Rename first (rename ignores names that are absent by default), then select
# by the new names. On a freshly loaded frame this gives the same result as
# select-then-rename, but re-running the cell on the already reduced `df`
# no longer fails with a KeyError on the old column names.
df = df.rename(columns=dict(zip(columns, new_column_names)))[new_column_names]
df.head(3)

Unnamed: 0,Year,Continent,Country,Disaster Subgroup,Disaster Type,Deaths,Damage(k.$)
0,1900,Africa,Cabo Verde,Climatological,Drought,11000.0,
1,1900,Asia,India,Climatological,Drought,1250000.0,
2,1902,Americas,Guatemala,Geophysical,Earthquake,2000.0,25000.0


The following hierarchy will be used:
1) Continent
2) Disaster Type 
3) Country

Year will be for interaction

In [178]:
# Let's do some preprocessing
basic_columns = ['Year', 'Continent', 'Country', 'Disaster Subgroup', 'Disaster Type']
target_columns = ['Deaths', 'Damage(k.$)']

# Keep only rows in which every hierarchy column is present
df = df.dropna(subset=basic_columns, how='any')
# ---------------------------------------------

# Drop rows in which all target columns are missing; they carry no value for us
df = df.dropna(subset=target_columns, how='all')
# ----------------------------------------------

# Keep the years from 2000 onwards
df = df.loc[df['Year'] >= 2000]
#--------------------------

# Make some country names shorter
def convert(name):
    """Return ``name`` cut off before its first '(' with trailing spaces removed."""
    head, _, _ = name.partition('(')
    return head.rstrip(' ')
# `df` is the result of boolean filtering here, so assigning a column into it
# can raise pandas' SettingWithCopyWarning; assign() builds a new frame instead.
df = df.assign(Country=df['Country'].apply(convert))
# -------------------------------------------

# Make copies to handle NaNs separately for each target
df_deaths = df[df['Deaths'].notna()].copy()
df_damage = df[df['Damage(k.$)'].notna()].copy()
# ----------------------------------------------

# Aggregate over years: group by every basic column except 'Year'
df_deaths = df_deaths.groupby(basic_columns[1:])[target_columns].sum().reset_index()
df_damage = df_damage.groupby(basic_columns[1:])[target_columns].sum().reset_index()
# ----------------------------------------------------------------------------------

# Convert to more readable units: thousands of deaths, billions of dollars
df_deaths['Deaths thsd'] = df_deaths['Deaths'].apply(lambda x: round(x / 1_000, 3))
df_damage['Damage(bil.$)'] = df_damage['Damage(k.$)'].apply(lambda x: round(x / 1_000_000, 5))
# --------------------------------------------------------------------------------------------

In [179]:
# Treemap of deaths, nested as all -> continent -> disaster type -> country
fig_deaths = px.treemap(
    df_deaths,
    path=[px.Constant('all'), 'Continent', 'Disaster Type', 'Country'],
    values='Deaths thsd',
    color='Deaths thsd',
    color_continuous_scale='YlOrRd',
)

# Hover text limited to the node label and its value
fig_deaths.update_traces(hovertemplate='labels=%{label}<br>Total deaths=%{value}<extra></extra>')
# The returned figure is bound to `tmp` (presumably to keep the cell from echoing it)
tmp = fig_deaths.update_layout(
    title='Data about deaths (thsd.)',
    width=1000,
    height=800,
    margin={'t': 50, 'l': 25, 'r': 25, 'b': 25},
)

In [180]:
# Treemap of damage, nested as all -> continent -> disaster type -> country
fig_damage = px.treemap(
    df_damage,
    path=[px.Constant('all'), 'Continent', 'Disaster Type', 'Country'],
    values='Damage(bil.$)',
    color='Damage(bil.$)',
    color_continuous_scale='RdBu',
)

# Hover text limited to the node label and its value
fig_damage.update_traces(hovertemplate='labels=%{label}<br>Total damage=%{value}<extra></extra>')
# The returned figure is bound to `tmp` (presumably to keep the cell from echoing it)
tmp = fig_damage.update_layout(
    title='Data about damage (bil.$)',
    width=1000,
    height=800,
    margin={'t': 50, 'l': 25, 'r': 25, 'b': 25},
)

In [181]:
# Widget figure that is actually displayed; it starts with the deaths treemap.
# Plotly Express keeps the continuous colour scale (and the margins) in the
# figure layout, and the trace only references the shared colour axis, so the
# layout is carried over together with the trace. Copying the trace alone
# would silently drop the 'YlOrRd' colour scale.
fig_show = go.FigureWidget(data=[fig_deaths.data[0]], layout=fig_deaths.layout)
tmp = fig_show.update_layout(title='Data about deaths (thsd.)', width=1000, height=800)

In [182]:
button_deaths = widgets.Button(description='Deaths')
button_damage = widgets.Button(description='Damage')


def _show_treemap(source_fig, title):
    """Display the treemap trace of ``source_fig`` in ``fig_show`` under ``title``."""
    fig_show.data = []
    fig_show.add_traces(source_fig.data[0])
    # The continuous colour scale is stored in the source figure's layout
    # (coloraxis), not in the trace, so it has to be copied explicitly;
    # otherwise every treemap is drawn with the same colours.
    fig_show.update_layout(title=title, width=1000, height=800,
                           coloraxis=source_fig.layout.coloraxis.to_plotly_json())


def change_data_deaths(callback):
    """Click handler of the 'Deaths' button: show the deaths treemap."""
    _show_treemap(fig_deaths, 'Data about deaths (thsd.)')


def change_data_damage(callback):
    """Click handler of the 'Damage' button: show the damage treemap."""
    _show_treemap(fig_damage, 'Data about damage (bil.$)')


button_deaths.on_click(change_data_deaths)
button_damage.on_click(change_data_damage)

# Both buttons side by side
container = widgets.HBox([button_deaths, button_damage])

In [183]:
# Buttons on top, the switchable treemap below
widgets.VBox(children=[container, fig_show])

VBox(children=(HBox(children=(Button(description='Deaths', style=ButtonStyle()), Button(description='Damage', …

Here we can observe the number of victims and the damage caused by natural disasters around the world since 2000.
There is an opportunity to switch between the treemaps using the buttons. 
The user can also interact with the treemap by clicking on the rectangles of interest. The navigation bar is shown at the top of the rectangle.
The values are highlighted when the user points at a particular rectangle.

The used representation allows us to identify the most horrible disasters for each continent 