# PubMed Topic Tracker
## 3. Interactive data exploration

This tool allows fully interactive exploration of the datasets preprocessed with the Content Explorer notebook. You can select a dataset to work with, select a set of entities to explore, and plot any entity or combination of entities.

Dependencies:
- pandas 1.2.1
- IPython 7.19.0
- tqdm 4.55.1
- matplotlib 3.3.3
- bokeh 2.2.3
- numpy 1.19.2
- IPython 7.19.2
- ipywidgets 7.6.3


## *Caveat* : normalization and lemmas
Normalization is performed in the same way for each entity, i.e.: normalized entity = count of entity / number of papers. So a normalized value of 0.1 for, e.g., a keyword means that the keyword is present in 10% of the entries in the subset. This holds for keywords, MeSH terms and authors, but not for anything that is based on lemmas.

The reason is: any keyword, MeSH term or author can appear at most once per entry (an author or a keyword is never listed twice). But lemmas can appear more than once per paper, so the normalized values of lemmas should not be read as percentages.

In [None]:
import re
import collections
import pandas as pd
import os
import ipywidgets as widgets
from ipywidgets import interactive
from IPython.display import display, Markdown
import numpy as np
import bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.palettes import Category20
from bokeh.models import Panel, Tabs, HoverTool, ColumnDataSource
# Route Bokeh output to the notebook
output_notebook()
# 20-colour Category20 palette; the plotting code indexes it to colour each series
palette = Category20[20]

# print markdown to style fonts in output
def printmd(string, color=None):
    """Display `string` as Markdown wrapped in a coloured ``<span>``.

    Args:
        string: Markdown text to show.
        color: Value formatted into the span's ``color`` style as-is
            (including the default ``None``).
    """
    styled = f"<span style='color:{color}'>{string}</span>"
    display(Markdown(styled))
    
# Create list of datasets
# A dataset is a non-hidden directory inside "export" (its log.txt is read
# below), so plain files lying around in "export" are skipped.
dircontent = os.listdir("export")
dirlist = [
    x for x in dircontent
    if not x.startswith(".") and os.path.isdir(os.path.join("export", x))
]

# Columns of the overview table mapped to the regular expressions that pull
# their values out of each dataset's log.txt.
log_patterns = {
    "year0": r"(?<=year0\s=\s\")\d*",
    "year1": r"(?<=year1\s=\s\")\d*",
    "query": r"(?<=keywords\s=\s\").*(?=\")",
    "paper_count": r"(?<=paper_count_no_duplicates\s=\s\")\d*",
    "querydatetime": r"(?<=executed\sat:\s).*",
}
datalist = collections.defaultdict(list)
for x in dirlist:
    filepath = "export/" + x + "/" + "log.txt"
    with open(filepath, "r") as f:
        log_text = f.read()
    datalist["dirname"].append(x)
    for field, pattern in log_patterns.items():
        found = re.search(pattern, log_text)
        if found is None:
            # Name the offending file instead of failing with an opaque
            # AttributeError on `None.group`.
            raise ValueError("{}: no entry matching {!r}".format(filepath, pattern))
        datalist[field].append(found.group(0))

# Passing the column list keeps the table well-formed (and displayable) even
# when no dataset has been exported yet.
df_data = pd.DataFrame(datalist, columns=["dirname"] + list(log_patterns))
# Number the datasets from 1 in the displayed table
df_data.index = pd.RangeIndex(1, len(df_data) + 1)
printmd("\n\n## Datasets available: ", color = "black")
# Widen the column display only while showing the table; the previous setting
# is restored afterwards even if `display` fails.
with pd.option_context('display.max_colwidth', 500):
    display(df_data)

In [None]:
# Function for organizing data for plotting
def organize_data(dataframe, column):
    """Map each entry of `column` to its values in the ``df_*`` columns.

    Args:
        dataframe: Table holding `column` plus columns whose names start
            with ``df_``.
        column: Name of the column whose entries become the dict keys.

    Returns:
        dict: ``{entry: [value in each ``df_*`` column, in column order]}``.
    """
    value_columns = [name for name in dataframe.columns if name.startswith('df_')]
    entries = list(dataframe[column])
    indexed = dataframe.set_index(column)
    return {
        entry: [indexed.at[entry, name] for name in value_columns]
        for entry in entries
    }

# Function for retrieving the wordlist from organized data
def get_entitylist(dataframe):
    """Return the values of the first column of `dataframe` as a list."""
    first_column = dataframe.iloc[:, 0]
    return first_column.tolist()

def simple_grid(df):
    """Wrap `df` in a `Grid` widget with one 100-wide column per df column.

    NOTE(review): `Grid` is not imported in the visible part of this file --
    presumably ipyaggrid's Grid; confirm the import exists before calling.

    Args:
        df: DataFrame handed to the grid as ``grid_data``.

    Returns:
        The constructed `Grid` instance.
    """
    grid_options = dict(
        columnDefs=[dict(headername=name, field=name, width=100) for name in df.columns],
        enableSorting=True,
        enableFilter=True,
        enableColResize=True,
        enableRangeSelection=True,
        rowSelection='multiple',
    )
    return Grid(
        grid_data=df,
        grid_options=grid_options,
        quick_filter=True,
        show_toggle_edit=False,
        sync_on_edit=False,
        export_csv=True,
        export_excel=False,
        theme='ag-theme-balham',
        show_toggle_delete=False,
        columns_fit='auto',
        index=False,
    )

### Dataframe selection ###
# Create list of datasets: every non-hidden directory inside "export".
# Plain files are skipped: selecting a dataset opens "export/<name>/log.txt",
# so a non-directory entry could only fail on selection.
dircontent = os.listdir("export")
dirlist = [
    x for x in dircontent
    if not x.startswith(".") and os.path.isdir(os.path.join("export", x))
]

# Selection widgets
selectmenu = widgets.Combobox(options=dirlist, description='Dataset:')
button_select = widgets.Button(description="Select")
output_select = widgets.Output()
output_status = widgets.Output()

# Analysis widgets
button_analyse = widgets.Button(description="Analyse")
output_analysis = widgets.Output()
output_analysis_status = widgets.Output()

# Display selection widgets
display(selectmenu, button_select, output_select, output_status)

# Display analysis widgets
display(button_analyse, output_analysis_status, output_analysis)

# Helper: extract one value from the text of a dataset's log.txt
def _read_log_field(log_text, pattern):
    """Return the text matched by `pattern` in `log_text`.

    Raises:
        ValueError: if `pattern` does not match anywhere in the log.
    """
    found = re.search(pattern, log_text)
    if found is None:
        raise ValueError("log.txt has no entry matching {!r}".format(pattern))
    return found.group(0)


# Helper: load one exported CSV table and the entities listed in it
def _load_entity_table(data_dir, filename, strings_only=False):
    """Read ``data_dir/filename`` and return ``(table, entities)``.

    The first CSV column is dropped and the row index is shifted to start
    at 1. `entities` is the first remaining column as a list; with
    `strings_only` set, non-string entries are removed from that list.
    """
    table = pd.read_csv(data_dir + "/" + filename, sep=';', low_memory=False)
    table = table.drop(table.columns[0], axis=1)
    table.index += 1
    entities = get_entitylist(table)
    if strings_only:
        entities = [entity for entity in entities if isinstance(entity, str)]
    return table, entities


# Function for retrieving the appropriate data from the metadata of the dataset
@output_select.capture(clear_output=False,wait=True)
def button_select_f(b):
    """Load the dataset chosen in `selectmenu` into module-level globals.

    Reads the dataset's log.txt for the year range, query keywords, paper
    count and export directory, prints a summary into `output_select`, and
    loads every CSV table from ``<exportdir>/data``. The final status message
    reports whether loading completed.

    Args:
        b: The clicked button (unused).
    """
    # `file` keeps holding the log text, as it did before this rewrite.
    global file, year0, year1, years, exportdir
    global df_k, wordlist_k_o, df_k_n, wordlist_k_n_o
    global df_m, wordlist_m_o, df_m_n, wordlist_m_n_o
    global df_a, wordlist_a_o, df_a_n, wordlist_a_n_o
    global df_l, wordlist_l_o, df_l_n, wordlist_l_n_o
    global df_acoi, wordlist_acoi_o
    global df_lcoi, wordlist_lcoi_o, df_lcoi_n, wordlist_lcoi_n_o
    global df_j, wordlist_j_o, df_j_n, wordlist_j_n_o

    with output_status:
        output_status.clear_output()
        printmd("\n\n**Loading dataset. Wait...**", color = "red")
    with output_analysis_status:
        output_analysis_status.clear_output()
    with output_analysis:
        output_analysis.clear_output()

    # Under IPython the Output context manager shows an exception inside the
    # widget and suppresses it, so execution continues after the `with`
    # block. The flag keeps a failed load from being reported as ready.
    loaded = False
    with output_select:
        output_select.clear_output()
        dataset = selectmenu.value
        with open("export/" + dataset + "/" + "log.txt", "r") as f:
            file = f.read()
        year0 = _read_log_field(file, r"(?<=year0\s=\s\")\d*")
        year1 = _read_log_field(file, r"(?<=year1\s=\s\")\d*")
        years = list(range(int(year0), int(year1) + 1))
        keywords = _read_log_field(file, r"(?<=keywords\s=\s\").*(?=\")")
        paper_count = _read_log_field(file, r"(?<=paper_count_no_duplicates\s=\s\")\d*")
        exportdir = _read_log_field(file, r"(?<=exportdir\s=\s\").*(?=\")")
        print("\nUsing dataset: \"" + dataset + "\"\n")
        print("Time interval: " + year0 + " - " + year1 + "\n" + "Keywords: " + keywords + "\n" + paper_count + " entries")

        data_dir = exportdir + "/data"

        # Keywords (counts and normalized)
        df_k, wordlist_k_o = _load_entity_table(data_dir, "Keywords.csv")
        df_k_n, wordlist_k_n_o = _load_entity_table(data_dir, "Keywords_norm.csv")

        # MeSH terms (counts and normalized)
        df_m, wordlist_m_o = _load_entity_table(data_dir, "Meshterms.csv")
        df_m_n, wordlist_m_n_o = _load_entity_table(data_dir, "Meshterms_norm.csv")

        # Authors (counts and normalized)
        df_a, wordlist_a_o = _load_entity_table(data_dir, "Authors.csv")
        df_a_n, wordlist_a_n_o = _load_entity_table(data_dir, "Authors_norm.csv")

        # Lemmas (counts and normalized)
        df_l, wordlist_l_o = _load_entity_table(data_dir, "Lemmas.csv", strings_only=True)
        df_l_n, wordlist_l_n_o = _load_entity_table(data_dir, "Lemmas_norm.csv")

        # Amount of COI statements
        df_acoi, wordlist_acoi_o = _load_entity_table(data_dir, "Amount of COI statements.csv", strings_only=True)

        # COI lemmas (counts and normalized)
        df_lcoi, wordlist_lcoi_o = _load_entity_table(data_dir, "Coi_lemmas.csv", strings_only=True)
        df_lcoi_n, wordlist_lcoi_n_o = _load_entity_table(data_dir, "Coi_lemmas_norm.csv")

        # Journals (counts and normalized)
        df_j, wordlist_j_o = _load_entity_table(data_dir, "Journal.csv", strings_only=True)
        df_j_n, wordlist_j_n_o = _load_entity_table(data_dir, "Journal_norm.csv")

        loaded = True

    with output_status:
        output_status.clear_output()
        if loaded:
            printmd("\n\n**Dataset ready for analysis.**", color ="green")
        else:
            printmd("\n\n**Loading failed. See the error above.**", color = "red")

button_select.on_click(button_select_f)

### Analysis ###

# Function to analyse dataset
@output_analysis.capture(clear_output=False,wait=True)
def button_analyse_d(b):
    with output_analysis_status:
        output_analysis_status.clear_output()
        printmd("\n\n**Analysis in progress...**", color = "red")
    with output_analysis:
        output_analysis.clear_output()

        ########### Keyword ###########
        # Function to sort df
        def sort_by_k(column):
            """Display the 20 rows of `df_k` with the largest `column` values."""
            ranked = df_k.sort_values(by=column, ascending=False)
            display(ranked.head(20))
        sort_by_options_k = widgets.Dropdown(
            options=["total", "min" , "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_k = interactive(sort_by_k, column = sort_by_options_k)

        # Entity picker, its three action buttons and their output areas
        w1_k = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_k_o,
            ensure_option=True,
            disabled=False,
            description = "Keywords:",
        )
        button1_k = widgets.Button(description="Add")
        button2_k = widgets.Button(description="Generate")
        button3_k = widgets.Button(description="Reset")
        output1_k = widgets.Output()
        output2_k = widgets.Output()
        # The selection is a module-level list shared by the handlers below
        global entitylist_k
        entitylist_k = []

        def add_button_k(b):
            """Append the current Combobox value to the selection and print it."""
            global entitylist_k
            entitylist_k.append(w1_k.value)
            with output1_k:
                output1_k.clear_output()
                print("Entities to plot:\n")
                print(entitylist_k)
        button1_k.on_click(add_button_k)

        def generate_button_k(b):
            """Plot the current selection into `output2_k`."""
            with output2_k:
                plot_stuff_k(entitylist_k)
        button2_k.on_click(generate_button_k)

        def reset_button_k(b):
            """Empty the selection and clear both output areas."""
            global entitylist_k
            entitylist_k = []
            with output1_k:
                output1_k.clear_output()
                print("List cleared")
            with output2_k:
                output2_k.clear_output()
        button3_k.on_click(reset_button_k)

        #Function to plot stuff
        def plot_stuff_k(entitylist_k):
            """Plot yearly trends of the chosen keywords in two tabs.

            The first tab shows the values from `df_k`, the second those
            from `df_k_n`.

            Args:
                entitylist_k: Keywords to plot. A single string is accepted
                    too, because `interactive` passes the current Combobox
                    value rather than a list.
            """
            # `Index.isin` rejects a bare string, so wrap a lone keyword.
            if isinstance(entitylist_k, str):
                entitylist_k = [entitylist_k]
            stat_columns = ["total", "min", "std", "mean", "max"]

            def trend_panel(table, y_label, tab_title, tooltips):
                """Build one tab holding a line per selected entity."""
                # After dropping the statistics, the first column names the
                # entities and the remaining ones are relabelled with `years`.
                trend = table.drop(columns=stat_columns)
                entity_title = str(trend.columns[0])
                trend = trend.set_index(trend.columns[0])
                trend.columns = years
                trend = trend[trend.index.isin(entitylist_k)]
                fig = figure(plot_width=1000, plot_height=400)
                for slot, (entity, yearly) in enumerate(trend.iterrows(), start=1):
                    name_for_display = np.tile(entity, [len(trend.columns),1])
                    # Using the iterated row (not a `.loc` lookup) also
                    # works when an entity label occurs more than once.
                    source = ColumnDataSource({'x': years, 'y': yearly.tolist(), 'series_name': name_for_display})
                    # Wrap around instead of running off the end of the
                    # palette when many entities are selected.
                    color = palette[slot % len(palette)]
                    fig.line("x", "y", source = source, line_width=2, color=color, alpha=1, legend_label=str(entity))
                    # alpha=0 makes these markers invisible; presumably they
                    # only serve as hover targets.
                    fig.scatter("x", "y", source = source, size=10, color=color, alpha=0)
                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy="hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = "Trends of " + entity_title + ", " + str(year0) + " - " + str(year1)
                return Panel(child=fig, title=tab_title)

            base_tooltips = [("Series", "@series_name"), ("Year", "@x"),  ("Value", "@y")]
            tab1 = trend_panel(df_k, "Count", "Frequency", base_tooltips)
            ### normalized data
            tab2 = trend_panel(df_k_n, "Normalized frequency", "Normalized Frequency",
                               base_tooltips + [("", "@y{0.00%}")])
            show(Tabs(tabs=[tab1, tab2]))
        plot_k = interactive(plot_stuff_k, entitylist_k = w1_k)

        ########### MeSH ###########
        # Function to sort df
        def sort_by_m(column):
            """Display the 20 rows of `df_m` with the largest `column` values."""
            ranked = df_m.sort_values(by=column, ascending=False)
            display(ranked.head(20))
        sort_by_options_m = widgets.Dropdown(
            options=["total", "min" , "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_m = interactive(sort_by_m, column = sort_by_options_m)

        # Entity picker, its three action buttons and their output areas
        w1_m = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_m_o,
            ensure_option=True,
            disabled=False,
            description = "MeSH terms:",
        )
        button1_m = widgets.Button(description="Add")
        button2_m = widgets.Button(description="Generate")
        button3_m = widgets.Button(description="Reset")
        output1_m = widgets.Output()
        output2_m = widgets.Output()
        # The selection is a module-level list shared by the handlers below
        global entitylist_m
        entitylist_m = []

        def add_button_m(b):
            """Append the current Combobox value to the selection and print it."""
            global entitylist_m
            entitylist_m.append(w1_m.value)
            with output1_m:
                output1_m.clear_output()
                print("Entities to plot:\n")
                print(entitylist_m)
        button1_m.on_click(add_button_m)

        def generate_button_m(b):
            """Plot the current selection into `output2_m`."""
            with output2_m:
                plot_stuff_m(entitylist_m)
        button2_m.on_click(generate_button_m)

        def reset_button_m(b):
            """Empty the selection and clear both output areas."""
            global entitylist_m
            entitylist_m = []
            with output1_m:
                output1_m.clear_output()
                print("List cleared")
            with output2_m:
                output2_m.clear_output()
        button3_m.on_click(reset_button_m)

        #Function to plot stuff
        def plot_stuff_m(entitylist_m):
            """Plot yearly trends of the chosen MeSH terms in two tabs.

            The first tab shows the values from `df_m`, the second those
            from `df_m_n`.

            Args:
                entitylist_m: MeSH terms to plot. A single string is accepted
                    too, because `interactive` passes the current Combobox
                    value rather than a list.
            """
            # `Index.isin` rejects a bare string, so wrap a lone term.
            if isinstance(entitylist_m, str):
                entitylist_m = [entitylist_m]
            stat_columns = ["total", "min", "std", "mean", "max"]

            def trend_panel(table, y_label, tab_title, tooltips):
                """Build one tab holding a line per selected entity."""
                # After dropping the statistics, the first column names the
                # entities and the remaining ones are relabelled with `years`.
                trend = table.drop(columns=stat_columns)
                entity_title = str(trend.columns[0])
                trend = trend.set_index(trend.columns[0])
                trend.columns = years
                trend = trend[trend.index.isin(entitylist_m)]
                fig = figure(plot_width=1000, plot_height=400)
                for slot, (entity, yearly) in enumerate(trend.iterrows(), start=1):
                    name_for_display = np.tile(entity, [len(trend.columns),1])
                    # Using the iterated row (not a `.loc` lookup) also
                    # works when an entity label occurs more than once.
                    source = ColumnDataSource({'x': years, 'y': yearly.tolist(), 'series_name': name_for_display})
                    # Wrap around instead of running off the end of the
                    # palette when many entities are selected.
                    color = palette[slot % len(palette)]
                    fig.line("x", "y", source = source, line_width=2, color=color, alpha=1, legend_label=str(entity))
                    # alpha=0 makes these markers invisible; presumably they
                    # only serve as hover targets.
                    fig.scatter("x", "y", source = source, size=10, color=color, alpha=0)
                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy="hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = "Trends of " + entity_title + ", " + str(year0) + " - " + str(year1)
                return Panel(child=fig, title=tab_title)

            base_tooltips = [("Series", "@series_name"), ("Year", "@x"),  ("Value", "@y")]
            tab1 = trend_panel(df_m, "Count", "Frequency", base_tooltips)
            ### normalized data
            tab2 = trend_panel(df_m_n, "Normalized frequency", "Normalized Frequency",
                               base_tooltips + [("", "@y{0.00%}")])
            show(Tabs(tabs=[tab1, tab2]))
        plot_m = interactive(plot_stuff_m, entitylist_m = w1_m)

        ########### Authors ###########
        # function to sort df
        def sort_by_a(column):
            """Display the 20 rows of `df_a` with the largest `column` values."""
            ranked = df_a.sort_values(by=column, ascending=False)
            display(ranked.head(20))
        sort_by_options_a = widgets.Dropdown(
            options=["total", "min" , "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_a = interactive(sort_by_a, column = sort_by_options_a)

        # Entity picker, its three action buttons and their output areas
        w1_a = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_a_o,
            ensure_option=True,
            disabled=False,
            description = "Authors:",
        )
        button1_a = widgets.Button(description="Add")
        button2_a = widgets.Button(description="Generate")
        button3_a = widgets.Button(description="Reset")
        output1_a = widgets.Output()
        output2_a = widgets.Output()
        # The selection is a module-level list shared by the handlers below
        global entitylist_a
        entitylist_a = []

        def add_button_a(b):
            """Append the current Combobox value to the selection and print it."""
            global entitylist_a
            entitylist_a.append(w1_a.value)
            with output1_a:
                output1_a.clear_output()
                print("Entities to plot:\n")
                print(entitylist_a)
        button1_a.on_click(add_button_a)

        def generate_button_a(b):
            """Plot the current selection into `output2_a`."""
            with output2_a:
                plot_stuff_a(entitylist_a)
        button2_a.on_click(generate_button_a)

        def reset_button_a(b):
            """Empty the selection and clear both output areas."""
            global entitylist_a
            entitylist_a = []
            with output1_a:
                output1_a.clear_output()
                print("List cleared")
            with output2_a:
                output2_a.clear_output()
        button3_a.on_click(reset_button_a)

        #Function to plot stuff
        def plot_stuff_a(entitylist_a):
            """Plot yearly trends of the chosen authors in two tabs.

            The first tab shows the values from `df_a`, the second those
            from `df_a_n`.

            Args:
                entitylist_a: Authors to plot. A single string is accepted
                    too, because `interactive` passes the current Combobox
                    value rather than a list.
            """
            # `Index.isin` rejects a bare string, so wrap a lone author.
            if isinstance(entitylist_a, str):
                entitylist_a = [entitylist_a]
            stat_columns = ["total", "min", "std", "mean", "max"]

            def trend_panel(table, y_label, tab_title, tooltips):
                """Build one tab holding a line per selected entity."""
                # After dropping the statistics, the first column names the
                # entities and the remaining ones are relabelled with `years`.
                trend = table.drop(columns=stat_columns)
                entity_title = str(trend.columns[0])
                trend = trend.set_index(trend.columns[0])
                trend.columns = years
                trend = trend[trend.index.isin(entitylist_a)]
                fig = figure(plot_width=1000, plot_height=400)
                for slot, (entity, yearly) in enumerate(trend.iterrows(), start=1):
                    name_for_display = np.tile(entity, [len(trend.columns),1])
                    # Using the iterated row (not a `.loc` lookup) also
                    # works when an entity label occurs more than once.
                    source = ColumnDataSource({'x': years, 'y': yearly.tolist(), 'series_name': name_for_display})
                    # Wrap around instead of running off the end of the
                    # palette when many entities are selected.
                    color = palette[slot % len(palette)]
                    fig.line("x", "y", source = source, line_width=2, color=color, alpha=1, legend_label=str(entity))
                    # alpha=0 makes these markers invisible; presumably they
                    # only serve as hover targets.
                    fig.scatter("x", "y", source = source, size=10, color=color, alpha=0)
                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy="hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = "Trends of " + entity_title + ", " + str(year0) + " - " + str(year1)
                return Panel(child=fig, title=tab_title)

            base_tooltips = [("Series", "@series_name"), ("Year", "@x"),  ("Value", "@y")]
            tab1 = trend_panel(df_a, "Count", "Frequency", base_tooltips)
            ### normalized data
            tab2 = trend_panel(df_a_n, "Normalized frequency", "Normalized Frequency",
                               base_tooltips + [("", "@y{0.00%}")])
            show(Tabs(tabs=[tab1, tab2]))
        plot_a = interactive(plot_stuff_a, entitylist_a = w1_a)
        
        
        ########### Lemmas in TiAb ###########
        # function to sort df
        def sort_by_l(column):
            """Display the 20 rows of `df_l` with the largest `column` values."""
            ranked = df_l.sort_values(by=column, ascending=False)
            display(ranked.head(20))
        sort_by_options_l = widgets.Dropdown(
            options=["total", "min" , "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_l = interactive(sort_by_l, column = sort_by_options_l)

        # Entity picker, its three action buttons and their output areas
        w1_l = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_l_o,
            ensure_option=True,
            disabled=False,
            description = "Lemmas:",
        )
        button1_l = widgets.Button(description="Add")
        button2_l = widgets.Button(description="Generate")
        button3_l = widgets.Button(description="Reset")
        output1_l = widgets.Output()
        output2_l = widgets.Output()
        # The selection is a module-level list shared by the handlers below
        global entitylist_l
        entitylist_l = []

        def add_button_l(b):
            """Append the current Combobox value to the selection and print it."""
            global entitylist_l
            entitylist_l.append(w1_l.value)
            with output1_l:
                output1_l.clear_output()
                print("Entities to plot:\n")
                print(entitylist_l)
        button1_l.on_click(add_button_l)

        def generate_button_l(b):
            """Plot the current selection into `output2_l`."""
            with output2_l:
                plot_stuff_l(entitylist_l)
        button2_l.on_click(generate_button_l)

        def reset_button_l(b):
            """Empty the selection and clear both output areas."""
            global entitylist_l
            entitylist_l = []
            with output1_l:
                output1_l.clear_output()
                print("List cleared")
            with output2_l:
                output2_l.clear_output()
        button3_l.on_click(reset_button_l)

        #Function to plot stuff
        def plot_stuff_l(entitylist_l):
            """Show count and normalized yearly trends for the selected lemmas.

            Builds one Bokeh figure from `df_l` (tab "Frequency") and one from
            `df_l_n` (tab "Normalized Frequency"), each restricted to the rows
            whose entity name is in `entitylist_l`, and shows them as two tabs.

            Args:
                entitylist_l: list-like of entity names to plot.  It is handed
                    to `Index.isin`, which rejects a bare string.
            """
            summary_columns = ["total", "min", "std", "mean", "max"]
            base_tooltips = [("Series", "@series_name"), ("Year", "@x"), ("Value", "@y")]

            def _trend_panel(table, y_label, tab_title, tooltips):
                # Build one tab: a line per selected entity plotted over `years`.
                table = table.drop(columns=summary_columns)
                entity_column = table.columns[0]
                table = table.set_index(entity_column)
                table.columns = years
                table = table[table.index.isin(entitylist_l)]

                fig = figure(plot_width=1000, plot_height=400)
                for position, (entity, row) in enumerate(table.iterrows(), start=1):
                    source = ColumnDataSource({
                        'x': years,
                        'y': row.tolist(),
                        'series_name': np.tile(entity, [len(table.columns), 1]),
                    })
                    # Wrap around the palette: without the modulo, selecting
                    # more entities than there are colours raises IndexError.
                    color = palette[position % len(palette)]
                    fig.line("x", "y", source=source, line_width=2, color=color,
                             alpha=1, legend_label=entity)
                    # Fully transparent markers on every data point
                    # (presumably there as hover targets).
                    fig.scatter("x", "y", source=source, size=10, color=color, alpha=0)

                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy = "hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = ("Trends of " + str(entity_column) + ", "
                                  + str(year0) + " - " + str(year1))
                return Panel(child=fig, title=tab_title)

            count_tab = _trend_panel(df_l, "Count", "Frequency", base_tooltips)
            # Lemmas can occur several times per paper, hence the extra warning.
            normalized_tab = _trend_panel(
                df_l_n, "Normalized frequency", "Normalized Frequency",
                base_tooltips + [("", "important: this is NOT a percentage.")],
            )
            show(Tabs(tabs=[count_tab, normalized_tab]))
        # NOTE(review): plot_l is not placed in box_l below, and the combobox
        # supplies a single string rather than a list - confirm it is still needed.
        plot_l = interactive(plot_stuff_l, entitylist_l=w1_l)
        
        ########### COI lemmas ###########
        def sort_by_lcoi(column):
            """Display the first 20 rows of `df_lcoi` sorted by `column`, descending."""
            top_rows = df_lcoi.sort_values(by=column, ascending=False).head(20)
            display(top_rows)
        # Dropdown selecting the summary column that orders the preview table.
        sort_by_options_lcoi = widgets.Dropdown(
            options=["total", "min", "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_lcoi = interactive(sort_by_lcoi, column=sort_by_options_lcoi)
        # Entity picker; ensure_option=True limits the value to wordlist_lcoi_o entries.
        w1_lcoi = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_lcoi_o,
            ensure_option=True,
            disabled=False,
            description="COI lemmas:",
        )
        # Action buttons, wired to their handlers further down.
        button1_lcoi = widgets.Button(description="Add")
        button2_lcoi = widgets.Button(description="Generate")
        button3_lcoi = widgets.Button(description="Reset")
        # output1_lcoi echoes the selection; output2_lcoi receives the plots.
        output1_lcoi = widgets.Output()
        output2_lcoi = widgets.Output()
        # Queue of entities to plot; global so the handlers below can rebind it.
        global entitylist_lcoi
        entitylist_lcoi = []
        def add_button_lcoi(b):
            """Queue the entity currently chosen in `w1_lcoi` and echo the queue.

            Click handler for the "Add" button.  The combobox value is added to
            the global `entitylist_lcoi` only when it is non-empty and not
            queued yet, then the whole queue is re-printed in `output1_lcoi`.

            Args:
                b: the clicked button (unused).
            """
            global entitylist_lcoi
            chosen = w1_lcoi.value
            # An untouched combobox yields an empty value, and a repeated click
            # would queue the same name twice; neither is useful to plot.
            if chosen and chosen not in entitylist_lcoi:
                entitylist_lcoi.append(chosen)
            with output1_lcoi:
                output1_lcoi.clear_output()
                print("Entities to plot:\n")
                print(entitylist_lcoi)
        button1_lcoi.on_click(add_button_lcoi)

        def generate_button_lcoi(b):
            """Plot the queued entities inside `output2_lcoi`.

            Click handler for the "Generate" button; `b` is unused.  The output
            area is not cleared here, only by reset_button_lcoi.
            """
            queued = entitylist_lcoi
            with output2_lcoi:
                plot_stuff_lcoi(queued)
        button2_lcoi.on_click(generate_button_lcoi)

        def reset_button_lcoi(b):
            """Empty the entity queue and wipe both output areas.

            Click handler for the "Reset" button; `b` is unused.
            """
            global entitylist_lcoi
            # Rebind the global to a brand-new list rather than clearing in place.
            entitylist_lcoi = list()
            with output1_lcoi:
                output1_lcoi.clear_output()
                print("List cleared")
            # Drop any plots produced by earlier "Generate" clicks.
            with output2_lcoi:
                output2_lcoi.clear_output()
        button3_lcoi.on_click(reset_button_lcoi)

        #Function to plot stuff
        def plot_stuff_lcoi(entitylist_lcoi):
            """Show count and normalized yearly trends for the selected COI lemmas.

            Builds one Bokeh figure from `df_lcoi` (tab "Frequency") and one
            from `df_lcoi_n` (tab "Normalized Frequency"), each restricted to
            the rows whose entity name is in `entitylist_lcoi`, and shows them
            as two tabs.

            Args:
                entitylist_lcoi: list-like of entity names to plot.  It is
                    handed to `Index.isin`, which rejects a bare string.
            """
            summary_columns = ["total", "min", "std", "mean", "max"]
            base_tooltips = [("Series", "@series_name"), ("Year", "@x"), ("Value", "@y")]

            def _trend_panel(table, y_label, tab_title, tooltips):
                # Build one tab: a line per selected entity plotted over `years`.
                table = table.drop(columns=summary_columns)
                entity_column = table.columns[0]
                table = table.set_index(entity_column)
                table.columns = years
                table = table[table.index.isin(entitylist_lcoi)]

                fig = figure(plot_width=1000, plot_height=400)
                for position, (entity, row) in enumerate(table.iterrows(), start=1):
                    source = ColumnDataSource({
                        'x': years,
                        'y': row.tolist(),
                        'series_name': np.tile(entity, [len(table.columns), 1]),
                    })
                    # Wrap around the palette: without the modulo, selecting
                    # more entities than there are colours raises IndexError.
                    color = palette[position % len(palette)]
                    fig.line("x", "y", source=source, line_width=2, color=color,
                             alpha=1, legend_label=entity)
                    # Fully transparent markers on every data point
                    # (presumably there as hover targets).
                    fig.scatter("x", "y", source=source, size=10, color=color, alpha=0)

                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy = "hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = ("Trends of " + str(entity_column) + ", "
                                  + str(year0) + " - " + str(year1))
                return Panel(child=fig, title=tab_title)

            count_tab = _trend_panel(df_lcoi, "Count", "Frequency", base_tooltips)
            # Lemmas can occur several times per paper, hence the extra warning.
            normalized_tab = _trend_panel(
                df_lcoi_n, "Normalized frequency", "Normalized Frequency",
                base_tooltips + [("", "important: this is NOT a percentage.")],
            )
            show(Tabs(tabs=[count_tab, normalized_tab]))
        # NOTE(review): plot_lcoi is not placed in box_lcoi below, and the combobox
        # supplies a single string rather than a list - confirm it is still needed.
        plot_lcoi = interactive(plot_stuff_lcoi, entitylist_lcoi=w1_lcoi)
        
        ########### Journals ###########
        def sort_by_j(column):
            """Display the first 20 rows of `df_j` sorted by `column`, descending."""
            top_rows = df_j.sort_values(by=column, ascending=False).head(20)
            display(top_rows)
        # Dropdown selecting the summary column that orders the preview table.
        sort_by_options_j = widgets.Dropdown(
            options=["total", "min", "std", "mean", "max"],
            description='Sort by:',
        )
        sorted_data_j = interactive(sort_by_j, column=sort_by_options_j)
        # Entity picker; ensure_option=True limits the value to wordlist_j_o entries.
        w1_j = widgets.Combobox(
            placeholder='Choose...',
            options=wordlist_j_o,
            ensure_option=True,
            disabled=False,
            description="Journals:",
        )
        # Action buttons, wired to their handlers further down.
        button1_j = widgets.Button(description="Add")
        button2_j = widgets.Button(description="Generate")
        button3_j = widgets.Button(description="Reset")
        # output1_j echoes the selection; output2_j receives the plots.
        output1_j = widgets.Output()
        output2_j = widgets.Output()
        # Queue of entities to plot; global so the handlers below can rebind it.
        global entitylist_j
        entitylist_j = []
        def add_button_j(b):
            """Queue the entity currently chosen in `w1_j` and echo the queue.

            Click handler for the "Add" button.  The combobox value is added to
            the global `entitylist_j` only when it is non-empty and not queued
            yet, then the whole queue is re-printed in `output1_j`.

            Args:
                b: the clicked button (unused).
            """
            global entitylist_j
            chosen = w1_j.value
            # An untouched combobox yields an empty value, and a repeated click
            # would queue the same name twice; neither is useful to plot.
            if chosen and chosen not in entitylist_j:
                entitylist_j.append(chosen)
            with output1_j:
                output1_j.clear_output()
                print("Entities to plot:\n")
                print(entitylist_j)
        button1_j.on_click(add_button_j)

        def generate_button_j(b):
            """Plot the queued entities inside `output2_j`.

            Click handler for the "Generate" button; `b` is unused.  The output
            area is not cleared here, only by reset_button_j.
            """
            queued = entitylist_j
            with output2_j:
                plot_stuff_j(queued)
        button2_j.on_click(generate_button_j)

        def reset_button_j(b):
            """Empty the entity queue and wipe both output areas.

            Click handler for the "Reset" button; `b` is unused.
            """
            global entitylist_j
            # Rebind the global to a brand-new list rather than clearing in place.
            entitylist_j = list()
            with output1_j:
                output1_j.clear_output()
                print("List cleared")
            # Drop any plots produced by earlier "Generate" clicks.
            with output2_j:
                output2_j.clear_output()
        button3_j.on_click(reset_button_j)

        #Function to plot stuff
        def plot_stuff_j(entitylist_j):
            """Show count and normalized yearly trends for the selected journals.

            Builds one Bokeh figure from `df_j` (tab "Frequency") and one from
            `df_j_n` (tab "Normalized Frequency"), each restricted to the rows
            whose entity name is in `entitylist_j`, and shows them as two tabs.

            Args:
                entitylist_j: list-like of entity names to plot.  It is handed
                    to `Index.isin`, which rejects a bare string.
            """
            summary_columns = ["total", "min", "std", "mean", "max"]
            base_tooltips = [("Series", "@series_name"), ("Year", "@x"), ("Value", "@y")]

            def _trend_panel(table, y_label, tab_title, tooltips):
                # Build one tab: a line per selected entity plotted over `years`.
                table = table.drop(columns=summary_columns)
                entity_column = table.columns[0]
                table = table.set_index(entity_column)
                table.columns = years
                table = table[table.index.isin(entitylist_j)]

                fig = figure(plot_width=1000, plot_height=400)
                for position, (entity, row) in enumerate(table.iterrows(), start=1):
                    source = ColumnDataSource({
                        'x': years,
                        'y': row.tolist(),
                        'series_name': np.tile(entity, [len(table.columns), 1]),
                    })
                    # Wrap around the palette: without the modulo, selecting
                    # more entities than there are colours raises IndexError.
                    color = palette[position % len(palette)]
                    fig.line("x", "y", source=source, line_width=2, color=color,
                             alpha=1, legend_label=entity)
                    # Fully transparent markers on every data point
                    # (presumably there as hover targets).
                    fig.scatter("x", "y", source=source, size=10, color=color, alpha=0)

                fig.xaxis.axis_label = "Year"
                fig.yaxis.axis_label = y_label
                fig.yaxis.formatter.use_scientific = False
                fig.title.text_font_size = "20px"
                fig.legend.location = "top_left"
                fig.legend.click_policy = "hide"
                fig.outline_line_width = 5
                fig.outline_line_alpha = 0.3
                fig.outline_line_color = "navy"
                fig.add_tools(HoverTool(tooltips=tooltips, mode='mouse'))
                fig.title.text = ("Trends of " + str(entity_column) + ", "
                                  + str(year0) + " - " + str(year1))
                return Panel(child=fig, title=tab_title)

            count_tab = _trend_panel(df_j, "Count", "Frequency", base_tooltips)
            # The normalized tab repeats the value formatted as a percentage.
            normalized_tab = _trend_panel(
                df_j_n, "Normalized frequency", "Normalized Frequency",
                base_tooltips + [("", "@y{0.00%}")],
            )
            show(Tabs(tabs=[count_tab, normalized_tab]))
        # NOTE(review): plot_j is not placed in box_j below, and the combobox
        # supplies a single string rather than a list - confirm it is still needed.
        plot_j = interactive(plot_stuff_j, entitylist_j=w1_j)
        
        ########### All together ###########
        # tab boxes
        # One vertical stack per entity type, all with the same widget order:
        # sort preview, picker, button1, selection echo, button2, button3, plots.
        box_k = widgets.VBox([sorted_data_k, w1_k, button1_k, output1_k, button2_k, button3_k, output2_k])
        box_m = widgets.VBox([sorted_data_m, w1_m, button1_m, output1_m, button2_m, button3_m, output2_m])
        box_a = widgets.VBox([sorted_data_a, w1_a, button1_a, output1_a, button2_a, button3_a, output2_a])
        box_l = widgets.VBox([sorted_data_l, w1_l, button1_l, output1_l, button2_l, button3_l, output2_l])
        box_lcoi = widgets.VBox([sorted_data_lcoi, w1_lcoi, button1_lcoi, output1_lcoi, button2_lcoi, button3_lcoi, output2_lcoi])
        box_j = widgets.VBox([sorted_data_j, w1_j, button1_j, output1_j, button2_j, button3_j, output2_j])
        # Titles and boxes are paired by position.
        tab_contents = ['Keyword', 'MeSH', 'Authors', 'Lemmas in TiAb', "Lemmas in COI", "Journals"]
        children = [box_k, box_m, box_a, box_l, box_lcoi, box_j]
        tab = widgets.Tab(children=children)
        for position, label in enumerate(tab_contents):
            tab.set_title(position, label)

        display(tab)
    # Clear anything previously shown in the status output widget.
    with output_analysis_status:
        output_analysis_status.clear_output()

# Run button_analyse_d whenever button_analyse is clicked.
button_analyse.on_click(button_analyse_d)