# Analyze Dataset Meta Data

Example screenshot

<img src="images/SciCat-Analysis.png" alt="Drawing" style="width: 600px;"/>


In [None]:
# This notebook requires a notebook server with the following packages:
# qgrid,ipypivot,ipyfilechooser
# It is also useful to have the voila backend support to run the GUI in a standalone mode
# This notebook has only been tested in a standard notebook (i.e. non-lab) environment

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import requests
import json
import collections
from pandas.io.json import json_normalize
import qgrid
import ipypivot as pt
import time
from ipyfilechooser import FileChooser
import os
import socket
import pandas as pd

In [None]:
# Tell readers of the rendered view how to reach the editable notebook.
print(
    "Find editable version of this notebook by replacing 'voila/render' with 'notebooks' in this notebooks URL"
)


In [None]:
# Build the GUI, which consists of four accordion sections ("tabs")

class build_widgets():
    """Accordion-based GUI for browsing and analysing dataset metadata.

    The accordion has four sections: login, field selection, a tabular view
    of the fetched datasets and a pivot-table analysis. Creating an instance
    builds the widgets and displays the accordion immediately.
    """

    def __init__(self):
        """Build the login form and the accordion, then display it."""
        ly = widgets.Layout(width='90%')
        self.w = interactive(
            self.login,
            {'manual': True},
            apiserver=widgets.Text(value='dacat.psi.ch', placeholder='SciCat API server, e.g. "dacat.psi.ch"', description='Server:', layout=ly),
            user=widgets.Text(value='', placeholder='Users account or functional account', description='Account:', layout=ly),
            pw=widgets.Password(value='', placeholder='add your password', description='Password:', layout=ly),
            token=widgets.Text(value='', placeholder='Generated token', description='Token:', layout=ly)
        )
        # Change the default text of the button created by interactive().
        self.w.children[4].description = "Login"

        self.bePatient = widgets.Label(value="No data available yet - please be patient, data is being fetched...")

        # Sections 1-3 show a placeholder label until data has been fetched.
        self.widget_list = [self.w,
                            self.bePatient,
                            self.bePatient,
                            self.bePatient]
        self.accordion = widgets.Accordion(children=self.widget_list)
        self.accordion.set_title(0, 'Login')
        self.accordion.set_title(1, 'Select Fields')
        self.accordion.set_title(2, 'Browse Tabular Result Datasets')
        self.accordion.set_title(3, 'Analysis/Aggregation')
        self.accordion.selected_index = 0
        display(self.accordion)

    def login(self, apiserver, user, pw, token):
        """Log in to the API server and load the initial data.

        A login via ``/Users/login`` is tried first; if that does not return
        status 200, the ``/auth/msad`` endpoint is tried. The resulting token
        (empty string on failure) is stored in ``self.token`` and written to
        the token text field. Afterwards the field-selection section is
        filled and a first data fetch with a predefined field list is started.

        Args:
            apiserver: Host name of the API server (without scheme).
            user: Account name.
            pw: Password of the account.
            token: Value of the token text field; it is not read here, the
                field is overwritten with the token obtained by the login.
        """
        print("Automatically switching to resulting dataset as soon as data is available ...")
        # API endpoints
        self.API = "https://" + apiserver + "/api/v3"
        self.MSAD = "https://" + apiserver + "/auth/msad"
        access_token = ''
        data = {'username': user, 'password': pw}
        r = requests.post(url=self.API + '/Users/login', data=data)
        if r.status_code == 200:
            access_token = r.json()['id']
        else:
            r = requests.post(url=self.MSAD, data=data)
            if r.status_code == 200:
                access_token = r.json()['access_token']
            else:
                print("Could not login, access_token undefined")
        self.token = access_token
        self.w.children[3].value = access_token
        # Fill the next accordion section with the selectable metadata fields.
        self.widget_list[1] = self.multi_checkbox_widget(self.getKeys())
        self.accordion.children = self.widget_list
        # Predefined list of fields used for the first, automatic fetch.
        self.selected_options = ['ownerEmail',
                                 'size',
                                 'ownerGroup',
                                 'updatedAt',
                                 'datasetlifecycle.archiveStatusMessage',
                                 'scientificMetadata.beamlineParameters.Beam energy.v',
                                 'scientificMetadata.detectorParameters.Microscope',
                                 'scientificMetadata.detectorParameters.Scintillator',
                                 'scientificMetadata.detectorParameters.Objective']
        # and fetch the data
        self.get_datasets(0)
        return

    def flatten(self, d, parent_key='', sep='.'):
        """Flatten a nested mapping into a single-level dict.

        Keys of nested mappings are joined with ``sep``, e.g.
        ``{'a': {'b': 1}}`` becomes ``{'a.b': 1}``.

        Args:
            d: Mapping to flatten.
            parent_key: Prefix prepended to every key (used for recursion).
            sep: Separator placed between the key levels.

        Returns:
            A new dict without nested mappings.
        """
        # collections.MutableMapping was removed in Python 3.10; the ABC
        # lives in collections.abc.
        from collections.abc import MutableMapping

        items = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                items.extend(self.flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def getKeys(self):
        """Return the flattened field names of one example dataset.

        A single dataset is requested from the API and its (nested) keys are
        flattened with :meth:`flatten`.

        Returns:
            List of field names; empty if the server did not return a
            non-empty list of datasets.
        """
        # Read the first dataset to extract keys.
        where = {"creationLocation": "/PSI/SLS/TOMCAT"}
        params = {"where": where, "limit": 1}
        # Parameters to be sent to the API.
        PARAMS = {'filter': json.dumps(params), 'access_token': self.token}
        r = requests.get(url=self.API + "/Datasets", params=PARAMS)
        data = r.json()
        # An error response is a JSON object rather than a list, and the
        # list is empty if nothing matches; neither can be indexed with [0].
        if not isinstance(data, list) or not data:
            print("No dataset available to extract field names from")
            return []
        flattened_dataset = self.flatten(data[0])
        # TODO create shortened names and example values
        return list(flattened_dataset.keys())

    def multi_checkbox_widget(self, descriptions):
        """Build a widget with a search field and one checkbox per field name.

        Ticking checkboxes updates ``self.selected_options``; the
        "Read data" button triggers :meth:`get_datasets`.

        Args:
            descriptions: Iterable of field names to offer as checkboxes.

        Returns:
            A ``VBox`` holding the button, the help text, the search field
            with the current selection, and the list of checkboxes.
        """
        # Materialise once: the names are iterated several times below.
        descriptions = list(descriptions)

        textout = widgets.Output()
        with textout:
            print("Select the fields you want to study by using the checkboxes.")
            print("Use the search field to find specific field names")
            print("When finished press the button")
            print("This will lead you to the results tab")

        self.button = widgets.Button(
            description='Read data',
            disabled=False,
            button_style='primary',
            tooltip='Click me when fields are selected',
        )
        self.button.on_click(self.get_datasets)

        search_widget = widgets.Text()
        options_dict = {description: widgets.Checkbox(description=description, value=False, layout=widgets.Layout(width='90%')) for description in descriptions}
        option_checkboxes = [options_dict[description] for description in descriptions]

        options_widget = widgets.VBox(option_checkboxes, layout={'overflow': 'scroll'})

        # TODO display as tree, also allow selection of full subtree

        # Wire the search field to the checkboxes.
        def on_text_change(change):
            search_input = change['new']
            if search_input == '':
                # Empty search field: show all checkboxes again.
                new_option_checkboxes = option_checkboxes
            else:
                # Keep the fields whose name contains the search text.
                new_option_checkboxes = [options_dict[description]
                                         for description in descriptions
                                         if search_input in description]
            options_widget.children = new_option_checkboxes

        search_widget.observe(on_text_change, names='value')
        out = widgets.Output(layout={'border': '1px solid black'})

        @out.capture()
        def function_with_captured_output():
            self.selected_options = [checkbox.description for checkbox in option_checkboxes if checkbox.value]
            print(self.selected_options)

        def on_check_change(change):
            out.clear_output()
            function_with_captured_output()

        # Only react to the checked state, not to every trait of the widget.
        for checkbox in option_checkboxes:
            checkbox.observe(on_check_change, names='value')

        return widgets.VBox([self.button, textout,
                             widgets.HBox([search_widget, out]),
                             options_widget])

    def store_csv(self, b):
        """Write the result table as CSV to the file picked in the chooser.

        Args:
            b: Argument passed by the file chooser callback (unused).
        """
        # `selected` is the full path; `selected_filename` is only the file
        # name and would always write into the current working directory.
        target = self.fc.selected
        if not target:
            return
        self.result.to_csv(target)

    def get_datasets(self, b):
        """Fetch the selected fields of all matching datasets and show them.

        The number of matching datasets is printed, the fields listed in
        ``self.selected_options`` are fetched and normalised into a
        DataFrame, and the table and pivot sections of the accordion are
        rebuilt. The accordion then switches to the table section.

        Args:
            b: Argument passed by the button click handler (unused).

        Returns:
            The DataFrame with the fetched data (also kept in ``self.result``).
        """
        where = {"creationLocation": "/PSI/SLS/TOMCAT"}
        # Get the count first.
        PARAMS = {'where': json.dumps(where), 'access_token': self.token}
        r = requests.get(url=self.API + "/Datasets/count", params=PARAMS)
        data = r.json()
        print("Number of datasets:", data)
        fields = {key: 1 for key in self.selected_options}
        print("Waiting for results:")
        par = {"fields": fields, "where": where}
        # Parameters to be sent to the API.
        PARAMS = {'filter': json.dumps(par), 'access_token': self.token}
        r = requests.get(url=self.API + "/Datasets", params=PARAMS)
        # Extract the data in JSON format and flatten the tree.
        data = r.json()
        df = json_normalize(data)
        self.result = df

        print("Finished")
        col_opts = {
            'editable': False,
        }
        self.table = qgrid.show_grid(self.result, show_toolbar=False, column_options=col_opts)
        self.table.layout = widgets.Layout(width='80%')

        self.fc = FileChooser(
            os.getcwd(),
            filename='dataset-result-table.csv',
            title='<b>Optional: choose a file to save table data</b>'
        )
        self.fc.register_callback(self.store_csv)

        self.pivot = pt.PivotUI(df_data=self.result)
        opts = self.pivot.table.options
        opts.vals = ['size']
        opts.aggregatorName = 'Sum'
        opts.rendererName = 'Stacked Bar Chart'
        opts.rows = ['datasetlifecycle.archiveStatusMessage']
        # 'Year' stays available as a derived attribute in the pivot UI.
        opts.cols = ['Year-Month']
        opts.derivedAttributes = {
            'Year': '$.pivotUtilities.derivers.dateFormat("updatedAt", "%y")',
            'Year-Month': '$.pivotUtilities.derivers.dateFormat("updatedAt", "%y/%m")'
        }

        self.widget_list[2] = widgets.VBox([self.fc, self.table])
        self.widget_list[3] = self.pivot
        self.accordion.children = self.widget_list
        # Switch to the result section.
        self.accordion.selected_index = 2
        return self.result


In [None]:
# Create the GUI; the constructor builds the accordion and displays it.
x = build_widgets()

