In [1]:
import pandas as pd 
import numpy as np
import json

import panel as pn

import altair as alt
# Initialise Panel once with every extension the notebook needs:
# 'vega' is required by the pn.pane.Vega heatmap pane, 'mathjax' was already requested.
pn.extension('mathjax', 'vega')

In [2]:
# Input table whose columns are analysed in the rest of the notebook.
original_df = pd.read_csv('../data/use_case1/dou.csv')

# Schema description table, re-indexed by its 'column name' field.
gdc_description = (
    pd.read_csv('../table-union/data/gdc_schema_description.csv')
    .set_index('column name')
)

# Precomputed recommendations (one JSON entry per candidate column).
with open('../table-union/arpa_result/top_20_Dou_UCEC_CPTAC3_meta_table_V2.json') as rec_file:
    recommendations = json.load(rec_file)

In [3]:
def _value_type_name(series):
    """Return the type name of the first non-missing value of ``series``.

    Looking only at the first row would report ``float`` for any column whose
    first entry is NaN. When every value is missing, the type of the first
    element is used instead (``'NoneType'`` for an empty column).
    """
    present = series.dropna()
    if not present.empty:
        return type(present.iloc[0]).__name__
    if len(series) > 0:
        return type(series.iloc[0]).__name__
    return type(None).__name__


column_matches = []
percent_missing = original_df.isnull().sum() * 100 / len(original_df)

# NOTE(review): recommendations are paired with the columns by position, which
# assumes the JSON entries follow the CSV column order -- confirm.
for i, c in enumerate(original_df.columns):
    top_k = recommendations[i]['Top k columns']
    column_matches.append({'Original name': c,
                           'Type': _value_type_name(original_df[c]),
                           'Missing': percent_missing[c],
                           # Score of the first recommendation; None when the list is empty.
                           'Top score': top_k[0][1] if top_k else None,
                           'Recommendations': top_k,
                           'Column matched': False,
                           'Value matched': False,
                           'Column Match': None,
                           })

# One row per original column, with its recommendations and match state.
df_matches = pd.DataFrame(column_matches)

In [4]:
# Sorted list of every distinct recommended column name.
rec_cols = sorted({match[0]
                   for entry in recommendations
                   for match in entry['Top k columns']})

# Wide form: one record per candidate column, one key per recommended name.
rec_table = [
    {'Column': entry['Candidate column'],
     **{match[0]: match[1] for match in entry['Top k columns']}}
    for entry in recommendations
]

# Long form: one record per (candidate column, recommendation) pair.
rec_list = [
    {'Column': entry['Candidate column'],
     'Recommendation': match[0],
     'Value': match[1]}
    for entry in recommendations
    for match in entry['Top k columns']
]

rec_table_df = pd.DataFrame(rec_table)
rec_list_df = pd.DataFrame(rec_list)
# Scores are converted to a numeric dtype so they can be thresholded and colour-encoded.
rec_list_df['Value'] = pd.to_numeric(rec_list_df['Value'])

## Heatmap of all columns

This heatmap shows every column against all of its recommendations; with this many cells it is far too large to read.

In [5]:
# Heatmap of every (column, recommendation) pair, coloured by score.
# The y channel uses alt.Y (it was previously built with the x-channel class alt.X).
alt.Chart(rec_list_df).mark_rect().encode(
    y=alt.Y('Column:O', sort=None),
    x=alt.X('Recommendation:O', sort=None),
    color='Value:Q',
    tooltip=[
        alt.Tooltip("Column", title="Column"),
        alt.Tooltip("Recommendation", title="Recommendation"),
        alt.Tooltip("Value", title="Value"),
    ],
)

## Cluster columns based on name

- Cluster columns based on the Levenshtein (edit) distance between their names.
- Use [AffinityPropagation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn-cluster-affinitypropagation) for the clustering
- The cluster "name" is the column name closest to its centroid (the cluster's exemplar)

[Reference](https://stats.stackexchange.com/questions/123060/clustering-a-long-list-of-strings-words-into-similarity-groups)

In [6]:
from sklearn.cluster import AffinityPropagation
from Levenshtein import distance

words = rec_table_df['Column'].to_numpy()
# The clustering is fed similarities, so the edit distance is negated
# (identical names get 0, the largest possible value).
lev_similarity = -1*np.array([[distance(w1,w2) for w1 in words] for w2 in words])
lev_similarity = lev_similarity.astype(np.float32)

# A fixed random_state keeps the clusters reproducible across kernel restarts.
affprop = AffinityPropagation(affinity="precomputed", max_iter=1000, damping=0.7,
                              random_state=0)
affprop.fit(lev_similarity)

# Iterate over the exemplars rather than the labels: when the fit yields no
# exemplars the labels are all -1 and indexing the (empty) exemplar array
# with them would raise an IndexError.
exemplar_indices = affprop.cluster_centers_indices_
print(f'Number of clusters: {len(exemplar_indices)}\n')
if len(exemplar_indices) == 0:
    print('No exemplars found (the clustering may not have converged); '
          'try a different damping or max_iter.')

cluster_names = []   # exemplar names, in cluster-id order
clusters = {}        # exemplar name -> sorted unique member column names
for cluster_id, exemplar_index in enumerate(exemplar_indices):
    exemplar = words[exemplar_index]
    cluster = np.unique(words[affprop.labels_ == cluster_id])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))
    cluster_names.append(exemplar)
    clusters[exemplar] = cluster

Number of clusters: 29

 - *Proteomics_TMT_plex:* Genomics_subtype, Proteomics_Aliquot_ID, Proteomics_OCT, Proteomics_Parent_Sample_IDs, Proteomics_Participant_ID, Proteomics_TMT_batch, Proteomics_TMT_channel, Proteomics_TMT_plex, Proteomics_Tumor_Normal
 - *Path_Stage_Dist_Mets-pM:* Clin_Stage_Dist_Mets-cM, Path_Stage_Dist_Mets-pM, Path_Stage_Primary_Tumor-pT, Path_Stage_Reg_Lymph_Nodes-pN
 - *Race:* Age, BMI, Case_excluded, Country, Diabetes, ER_ESR1, Gender, LVSI, MLH1, MLH2, MSH6, PMS2, PR_PGR, Race, idx, p53
 - *Tumor_Site:* Ethnicity, Histologic_type, POLE_subtype, Stemness_score, Treatment_naive, Tumor_Focality, Tumor_Site, Tumor_Site_Other, Tumor_Size_cm, Tumor_purity, tumor_Stage-Pathological
 - *Estrogen_Receptor:* Estrogen_Receptor, Estrogen_Receptor_%, Histologic_Grade_FIGO, Progesterone_Receptor, Progesterone_Receptor_%
 - *MLH1_Promoter_Hypermethylation:* MLH1_Promoter_Hypermethylation
 - *EPIC_Bcells:* EPIC_Bcells, EPIC_CAFs, EPIC_CD4_Tcells, EPIC_CD8_Tcells, EPIC_Endoth



#### To-do

- [ ] Improve clustering
- [ ] Use column values for clustering
- [ ] Use recommendations for clustering

These different clustering strategies could be offered as selectable options in the heatmap view.

## Heatmap by cluster

In [7]:
def plot_heatmap_cluster(cluster_list, threshold):
    """Build the interactive recommendation heatmap for the selected clusters.

    Parameters
    ----------
    cluster_list : list of str
        Cluster names (keys of the global ``clusters`` dict) whose member
        columns should be displayed.
    threshold : float
        Minimum recommendation ``Value`` a cell needs in order to be drawn.

    Returns
    -------
    pn.pane.Vega or None
        The heatmap pane, or ``None`` when no cluster is selected.

    Notes
    -----
    The pane is also stored in the global ``heatmap`` so that other cells can
    read ``heatmap.selection``. The global is left untouched when
    ``cluster_list`` is empty.
    """
    global heatmap

    if len(cluster_list) == 0:
        return None

    # Every column that belongs to one of the selected clusters.
    cols = [column for name in cluster_list for column in clusters[name]]

    keep = rec_list_df['Column'].isin(cols) & (rec_list_df['Value'] >= threshold)
    cluster_list_df = rec_list_df[keep]

    # Point selection on a single (x, y) cell; nothing is highlighted until a click.
    select = alt.selection_point(encodings=['x', 'y'],
                                 empty="none")

    base = alt.Chart(cluster_list_df).mark_rect().encode(
                y=alt.Y('Column:O', sort=None),
                x=alt.X('Recommendation:O', sort=None),
                color='Value:Q',
                tooltip=[
                    alt.Tooltip("Column", title="Column"),
                    alt.Tooltip("Recommendation", title="Recommendation"),
                    alt.Tooltip("Value", title="Value"),
                ],
                # Outline the selected cell in black.
                stroke=alt.condition(select, alt.value('black'), alt.value('white')),
                strokeWidth=alt.condition(select, alt.value(2), alt.value(0)),
            ).add_params(select)  # add_selection is deprecated in Altair 5

    heatmap = pn.pane.Vega(base, debounce=10)

    return heatmap

In [8]:
# All sidebar widgets share the same width.
_widget_size = dict(width=220)

# Which column clusters to display in the heatmap.
select_cluster = pn.widgets.MultiChoice(
    name='Column cluster', options=cluster_names, **_widget_size
)
# Not in use
select_rec_groups = pn.widgets.MultiChoice(
    name='Recommendation group', options=cluster_names, **_widget_size
)
# Minimum score a recommendation needs in order to be drawn.
thresh_slider = pn.widgets.EditableFloatSlider(
    name='Threshold', start=0, end=1.0, step=0.01, value=0.5, **_widget_size
)

# Re-render the heatmap whenever the cluster selection or the threshold changes.
heatmap_bind = pn.bind(plot_heatmap_cluster, select_cluster, thresh_slider)

# Left-hand options column (a '## Details' heading is currently disabled).
_options_column = pn.Column(
    '## Options',
    select_cluster,
    select_rec_groups,
    thresh_slider,
    width=250,
    height=700,
)
_heatmap_column = pn.Column('## Heatmap', heatmap_bind)

pn.Row(_options_column, _heatmap_column)


#### To-do

- [x] Add column cluster selection
- [ ] Add recommendation group selection (need groups)
- [x] Add threshold selection
- [ ] Show columns without recommendations
- [ ] Add histogram (see below)
- [ ] Add recommendation info (description, type, etc)
- [ ] Add "Accept Match" button
- [ ] Add "Reject Match" button

## Add histogram

In [9]:
def _selected_fields(selector):
    """Flatten a heatmap selection into a single ``{field: value}`` dict.

    The selection stores the picked cell as a list of one-key dicts, e.g.
    ``[{'Recommendation': ...}, {'Column': ...}]``. Merging them by key avoids
    depending on the order of the entries or on the parameter's name.
    """
    fields = {}
    for name, value in selector.param.values().items():
        if name == 'name' or not isinstance(value, (list, tuple)):
            continue
        for entry in value:
            if isinstance(entry, dict):
                fields.update(entry)
    return fields


def plot_column_hist(selector):
    """Plot the value distribution of the column picked in the heatmap.

    Parameters
    ----------
    selector
        The heatmap pane's ``selection`` object (``heatmap.selection``).

    Returns
    -------
    alt.Chart or str
        A binned histogram for float columns, a bar chart of value counts for
        other columns, or a short message when no cell is selected or when
        every value of the column is unique.
    """
    column = _selected_fields(selector).get('Column')
    if column is None:
        return 'Select a cell in the heatmap to see the histogram of its column.'

    series = original_df[column]

    if pd.api.types.is_float_dtype(series):
        # Bin only the observed values: filling NaN with the string 'Null'
        # would turn the numeric column into mixed types.
        return alt.Chart(series.dropna().to_frame(), height=300).mark_bar().encode(
                    alt.X(column+":Q", bin=True),
                    y='count()',
                ).properties(
                    width="container",
                    title='Histogram of '+column
                )

    values = list(series.unique())
    if len(values) == len(series):
        # Nothing to count when every row has a distinct value.
        return f'Values are unique. \n            Some samples: {values[:5]}'

    # Missing values are drawn as a separate 'Null' bar, so keep them out of
    # the explicit sort order.
    values = [v for v in values if not pd.isna(v)]
    try:
        values.sort()
    except TypeError:
        # Mixed types (e.g. strings and numbers) cannot be compared directly.
        values.sort(key=str)

    # Only the plotted column is passed on, so the chart does not embed the whole table.
    return alt.Chart(original_df[[column]].fillna('Null'), height=300).mark_bar().encode(
                x=alt.X(
                    column+":N",
                    sort=values,
                ),
                y="count()",
            ).properties(
                width="container",
                title='Histogram of '+column
            )

**Only run the following after selecting a cell in the heatmap**

In [17]:
# Inspect the current selection of the heatmap pane (`heatmap` is the global set by plot_heatmap_cluster).
heatmap.selection

Selection(name='Selection00198', param_1=[{'Recommendation': 'tissue_type'}, {'Column': 'Proteomics_Tumor_Normal'}])

In [18]:
# Dump the selection's parameters as (name, value) pairs to see how the picked cell is stored.
heatmap.selection.param.get_param_values()

[('name', 'Selection00198'),
 ('param_1',
  [{'Recommendation': 'tissue_type'}, {'Column': 'Proteomics_Tumor_Normal'}])]

In [19]:
# Show the value distribution (or sample values) of the column in the selected heatmap cell.
plot_column_hist(heatmap.selection)

#### To-do

- [ ] Connect histogram with heatmap selection
- [ ] Improve example list when the values are unique for the table rows
- [ ] Make it prettier