In [1]:
import os
import numpy as np
import pandas as pd

import plotly.graph_objects as go

In [2]:
# Enable Plotly's offline notebook mode before any figure is shown.
# NOTE(review): per the Plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it in the notebook — confirm this suits offline viewing.
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
from read_missing_data import read_missing_df

In [4]:
# Load the dataset to analyse. Later cells call .isna(), .shape and .columns on
# it, so it is presumably a pandas DataFrame — confirm against read_missing_df.
data = read_missing_df()

### Missing value by column

### This plot shows
- The number of missing values and the number of non-missing values per column.
- The y axis has ticks on both sides: one side shows the proportion and the other shows the exact count.
- Two buttons switch the view: one shows all columns, and the other shows only the columns with missing values.
- A slider on the x axis lets the user change the number of features shown on the plot. This is helpful when the data has a large number of features.

#### Main trace
This plot uses three traces.<br><br>
Two of them show the missing value information; the third is used only to add ticks on the left side. All of them are vertical bar plots.
- Bar plot (main): missing value count / no missing value count.
```python
trace = go.Bar(
    x=missing_count.index, # no_missing_count.index
    y=missing_count.values, # no_missing_count.values
    orientation='v',
    hovertemplate=...,
    marker=dict(...),
)
```
    * orientation: `"v"` means vertical bar plot. `"h"` means horizontal bar plot.
    * marker: a dictionary of properties that control trace color, opacity, and so on. [go.bar.Marker](https://plotly.com/python-api-reference/generated/plotly.graph_objects.bar.html#plotly.graph_objects.bar.Marker)
    
- Bar plot (auxiliary): add y ticks in proportional format.<br><br>
The purpose of this bar plot is to show the correct y ticks on the side opposite the other traces. Hence, the trace is made transparent by setting `opacity` to 0. It also needs to reference a different y axis.
```python
trace = go.Bar(
    x=total_percent.index,
    y=total_percent.values,
    orientation='v',
    opacity=0, # let trace color transparent
    hoverinfo='skip',
    yaxis='y2',
)
```
    * hoverinfo: determines which trace information appear on hover. If `"skip"`, no information is displayed upon hovering.
    * yaxis: sets a reference between this trace’s y coordinates and a 2D cartesian y axis. `"y2"` means the y coordinates refer to `layout.yaxis2`. The default is `"y"`, meaning the y coordinates refer to `layout.yaxis`/`layout.yaxis1`.

#### Layout: yaxis
Show proportion on the left y axis, and show exact number on the right y axis.
```python
yaxis1={'side': 'right', 'range': [0, len(data)*1.05]}
yaxis2={'side': 'left', 'overlaying': 'y', 'range': [0, 1.05]}
```
- overlaying: this axis is overlaid on top of the corresponding same-letter axis. `yaxis2={'overlaying': 'y'}` means yaxis2 is overlaid on top of yaxis/yaxis1.

#### Layout: xaxis slider
Add `rangeslider` at the x axis to control the number of features shown on the plot.
```python
xaxis={'title': 'Feature name', 
       'tickangle': -90, 
       'rangeslider': {'visible': True}, 
       'type':'category',
      },
```

#### Layout: buttons
One button shows all features; the other shows only the features with missing values, sorted by number of missing values (descending).<br>
Only the trace data needs to be updated, so the button `method` is `"restyle"`.

In [5]:
# Per-column missing-value statistics used by the plot below.
# The null scan over `data` is performed once; every other statistic is
# derived from that single result.
_n_rows = data.shape[0]
_missing_cnt = data.isna().sum()            # missing cells per column
_not_missing_cnt = _n_rows - _missing_cnt   # non-missing cells per column
_missing_pct = _missing_cnt / _n_rows       # fraction of rows missing per column
_not_missing_pct = 1 - _missing_pct         # fraction of rows present per column

# Columns that contain at least one missing value, most-missing first.
sorted_col = list(_missing_cnt[_missing_cnt != 0].sort_values(ascending=False).index)
# A constant 1.0 for every column (full-height values on a 0-1 scale).
total_pct = pd.Series(data=[1.0] * data.shape[1], index=data.columns)

In [6]:
# Empty figure; the bar traces are attached to it further down.
fig = go.Figure()

# Palette: a near-black for columns without missing values, and one red hue
# at two opacities for columns that do contain missing values.
_missing_color = "153,0,0"
basic_color = 'rgba(0,0,0,0.7)'
missing_color = 'rgba({},0.3)'.format(_missing_color)
no_missing_color = 'rgba({},0.7)'.format(_missing_color)

# Colour of each column's "not missing" bar: red by default, near-black when
# the column has no missing values at all.
_no_missing_mask = _missing_cnt == 0
color = pd.Series(data=no_missing_color, index=data.columns)
color.loc[_no_missing_mask] = basic_color

# Column names split by whether they contain missing values (original order).
noms_cols = list(_missing_cnt.index[_no_missing_mask.to_numpy()])
ms_cols = list(_missing_cnt.index[(_missing_cnt != 0).to_numpy()])

# Hover text for the two visible traces; <extra></extra> hides the trace-name box.
hovertemplate1 = "No missing: %{y} <extra></extra>"
hovertemplate2 = "Missing: %{y} <extra></extra>"


# Three vertical bar traces, added in this order:
#   1. non-missing counts, coloured per column,
#   2. missing counts,
#   3. a fully transparent bar on the second y axis (hover disabled).
fig.add_traces([
    go.Bar(
        x=_not_missing_cnt.index,
        y=_not_missing_cnt.values,
        orientation='v',
        hovertemplate=hovertemplate1,
        marker=dict(color=color),
    ),
    go.Bar(
        x=_missing_cnt.index,
        y=_missing_cnt.values,
        orientation='v',
        hovertemplate=hovertemplate2,
        marker=dict(color=missing_color),
    ),
    go.Bar(
        x=total_pct.index,
        y=total_pct.values,
        orientation='v',
        opacity=0,
        hoverinfo='skip',
        yaxis='y2',
    ),
])



# Figure-level layout: title placement, stacked bars, a category x axis with a
# range slider, and two overlaid y axes (right side scaled to the row count,
# left side scaled to 0-1).
main_layout = {
    'title': {
        'text': 'Missing value by column',
        'font': {'size': 22},
        'x': 0.00,
        'y': 0.99,
        'xanchor': 'left',
        'yanchor': 'top',
    },
    'barmode': 'stack',
    'showlegend': False,
    'xaxis': dict(
        title='Feature name',
        tickangle=-90,
        rangeslider=dict(visible=True),
        type='category',
    ),
    # 5% headroom above the tallest possible bar on both axes.
    'yaxis1': dict(side='right', range=[0, len(data) * 1.05]),
    'yaxis2': dict(side='left', overlaying='y', range=[0, 1.05]),
}

fig.update_layout(**main_layout)


def _restyle_button(label, cols=None):
    """Build one update-menu button that restyles all three bar traces.

    Parameters
    ----------
    label : str
        Text shown on the button.
    cols : list or None
        Ordered column labels to display. ``None`` keeps every column in its
        original order.

    Returns
    -------
    dict
        A button spec using the ``restyle`` method. Each list in ``args``
        holds one entry per trace, in the order the traces were added.
    """
    def pick(series):
        # Apply the same column selection (and order) to every series so the
        # three traces always share identical x values.
        return series if cols is None else series[cols]

    per_trace = [pick(_not_missing_cnt), pick(_missing_cnt), pick(total_pct)]
    return {
        'label': label,
        'method': 'restyle',
        'args': [{
            'type': 'bar',
            'x': [s.index for s in per_trace],
            'y': [s.values for s in per_trace],
            # The third trace carries no marker settings.
            'marker': [{'color': pick(color)}, {'color': missing_color}, None],
        }],
    }


fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                _restyle_button('All'),
                # Only columns with missing values, most-missing first.
                _restyle_button('Column with missing', sorted_col),
            ],
            active=0,
            x=0.0,
            xanchor="left",
            y=1,
            yanchor="top",
            # Negative top padding shifts the buttons upward from their anchor.
            pad={"r": 0, "t": -50},
            bgcolor='rgba(255,255,255,1)',
        ),
    ]
)


# Render the interactive figure in the notebook output.
fig.show()
# Optional export of the figure to an HTML file (left disabled):
# fig.write_html('./automl_plot/missing_value_by_column.html', config={'displaylogo': False}, include_plotlyjs='cdn', full_html=False)