In [1]:
import pandas as pd
import numpy as np 
import time
from collections import Counter
import re
import math
from gensim.models import KeyedVectors
from pprint import pprint
import pyspark

MAX_LEAFS = 128

class MeshCode:
    def __init__(self, ID, name, treeNumber, child_mesh_code=[]):
        self.id = ID
        self.name = name
        self.treeNumber = treeNumber
        self.children = child_mesh_code
    
    def __repr__(self):
        return "id: {}, name: {}".format(self.id, self.name)
    
    
MESH_HIERARCHY = MeshCode(
    "D003920", "Diabetes Mellitus", "C19.246" , [
        MeshCode("D048909", "Diabetes Complications", "C19.246.099", [
               MeshCode("D003925", "Diabetic Angiopathies", "C19.246.099.500", [
                       #MeshCode("D017719", "Diabetic Foot", "C19.246.099.500.191")  # to prevent double foot 
                      MeshCode("D003930", "Diabetic Retinopathy", "C19.246.099.500.382") 
               ]) 
              , MeshCode("D058065", "Diabetic Cardiomyopathies", "C19.246.099.625") 
              , MeshCode("D003926", "Diabetic Coma", "C19.246.099.750", [
                       MeshCode("D006944", "Hyperglycemic Hyperosmolar Nonketotic Coma", "C19.246.099.750.490", []) 
               ]) 
              , MeshCode("D016883", "Diabetic Ketoacidosis", "C19.246.099.812") 
              , MeshCode("D003928", "Diabetic Nephropathies", "C19.246.099.875") 
              , MeshCode("D003929", "Diabetic Neuropathies", "C19.246.099.937", [
                       MeshCode("D017719", "Diabetic Foot", "C19.246.099.937.250") 
               ]) 
              , MeshCode("D005320", "Fetal Macrosomia", "C19.246.099.968") 
        ])
       , MeshCode("D016640", "Diabetes, Gestational", "C19.246.200")
       , MeshCode("D003921", "Diabetes Mellitus, Experimental", "C19.246.240")
       , MeshCode("D003922", "Diabetes Mellitus, Type 1", "C19.246.267", [
                MeshCode("D014929", "Wolfram Syndrome", "C19.246.267.960")
        ])
       , MeshCode("D003924", "Diabetes Mellitus, Type 2", "C19.246.300", [
                MeshCode("D003923", "Diabetes Mellitus, Lipoatrophic", "C19.246.300.500")
        ])
       , MeshCode("D056731", "Donohue Syndrome", "C19.246.537")
       , MeshCode("D000071698", "Latent Autoimmune Diabetes in Adults", "C19.246.656")
       , MeshCode("D011236", "Prediabetic State", "C19.246.774")
    ]
)



class Tree(object):
    
    def __init__(self, tree_hierarchy, clusters_predict=[], mode="sklearn", sentences_all_classes=None, true_classes_all=None):
        """
        @param mode : Two possible values
            - "FBE" : Tree object for Feedback Explorer output
            - "sklearn" : Tree object for scikit learn output
        
        @param sentences_all_classes : List of all possible classes occuring in the sentences file (only for mode FBE)
        @param true_labels_all : All occuring true labels (mesh codes) of the documents/abstracts
        """
        self.tree = None
        if mode in ["sklearn", "FBE"]:
            self.mode = mode
        else:
            raise ValueError("Provided mode '{}' is not supported".format(mode))
        self.tree_hierarchy = tree_hierarchy # pandas dataframe with tree structure coming from hierarchical clustering
        self.n_nodes = 0 # updated by calling self.count_nodes()
        self.n_leafs = 0 # updated by calling self.count_leafs()
        self.temp_n_leafs = 1 # In mode 'FBE' helps to construct the tree with the right number of nodes
        self.clusters_predict = clusters_predict # predicted cluster for each document
        self.unique_cluster_predict = list(set(clusters_predict)) # list of all classes to calculate performance metrices
        self.leaf_nodes = [] # list of all leaf nodes
        self.sentences_all_classes = sentences_all_classes # List of all classes occuring in sentences (phrases.parquet)
        self.true_classes_documents = true_classes_all.values.tolist() # list of true labels (mesh codes) in the abstracts
        self.true_classes_documents_unique = list(set(true_classes_all)) # all possible occuring true labels (mesh codes) in the abstracts
        self.precision_all_nodes = [] # macro
        self.precision_all_nodes_weighted = []
        self.precision_all_nodes_weights = 0
        self.precision_macro = None 
        self.precision_micro = None
        self.recall_all_classes = []
        self.recall_all_classes_weighted = []
        self.recall_macro = None
        self.recall_micro = None
        self.F1_macro = None
        self.F1_micro = None
        self.maxDepth = 0
        self.temp_max_occ_class_in_cluster = 0
        self.temp_max_doc_perClass_inCluster = 0
        self.temp_mesh_and_its_childs = [] # list of a given mesh code and its children mesh codes
        
        
    def _build_tree(self, node, current_depth=None):
        if self.mode == "sklearn":
            if node.node_id in self.tree_hierarchy["node_id"].values: # if node not leaf
                treeChildren = self.tree_hierarchy[self.tree_hierarchy["node_id"] == node.node_id]
                node.add_child(Node(Id=treeChildren["left"].values[0], depth=node.depth + 1, parent=node))
                node.add_child(Node(Id=treeChildren["right"].values[0], depth=node.depth + 1, parent=node))
                self._build_tree(node.children[0])
                self._build_tree(node.children[1])
            else:
                return node
            return node
        elif self.mode == "FBE": 
            # Only create node if node is in current depth level
            if node.depth == current_depth and self.temp_n_leafs < MAX_LEAFS:
                treeChildren = self.tree_hierarchy.iloc[node.node_id].children
                #print("\t{}".format(node))
                #print("tree children:")
                #print(treeChildren)
                #print()
                # FBE tree is not a perfect binary tree, some nodes don't create children any more
                if len(treeChildren) > 0:
                    cluster_child_one = self.tree_hierarchy.iloc[treeChildren[0]].filterValue[0]
                    cluster_child_two = self.tree_hierarchy.iloc[treeChildren[1]].filterValue[0]
                #    print("c1: {}, c2: {}".format(cluster_child_one, cluster_child_two))
                    # Some nodes from nodes.json are empty: no sentences is going through them
                    # Only create node in tree when there is a sentence running through it
                    if cluster_child_one in self.sentences_all_classes:
                #        print("\c1 in class")
                        self.temp_n_leafs -= 1 # lose one leaf because it is split into two new leafs
                        node.add_child(Node(Id=treeChildren[0], depth=node.depth + 1, parent=node, cluster_label=cluster_child_one))
                        self.temp_n_leafs += 1
                        if cluster_child_two in sentences_all_classes:
                #            print("\tc1 and c2 in class")
                            node.add_child(Node(Id=treeChildren[1], depth=node.depth + 1, parent=node, cluster_label=cluster_child_two))
                            self.temp_n_leafs += 1
                    elif cluster_child_two in sentences_all_classes:
                #        print("\tc2 in class")
                        self.temp_n_leafs -= 1 # lose one leaf because it is split into two new leafs
                        node.add_child(Node(Id=treeChildren[1], depth=node.depth + 1, parent=node, cluster_label=cluster_child_two))
                        self.temp_n_leafs += 1          
                #    else:
                #        print("\tno class for c1 and c2")
            else:
                if len(node.children) == 1 and self.temp_n_leafs < MAX_LEAFS:
                    self._build_tree(node.children[0], current_depth)
                elif len(node.children) == 2 and self.temp_n_leafs < MAX_LEAFS:
                    self._build_tree(node.children[0], current_depth)
                    self._build_tree(node.children[1], current_depth)
            return node

    def _update_leaf_to_root(self, node, abstract_id, class_predict):
        """ Updates node and all its ancestors up to the root with the abstract's id and the predicted class"""
        node.update_node(abstract_id, class_predict)
        if node.parent != None: # Root has no parent
            self._update_leaf_to_root(node.parent, abstract_id, class_predict)
    

    def set_build_tree(self,node):
        """ Builds the tree and sets the variable tree."""  

        # tree with MAX_LEAFS leafs is constructed. 
        # For sklearn add to each leaf its cluster label based on the children in the tree object from sklearn AgglomerativeClustering
        self.leaf_nodes = []
        if self.mode == "sklearn":
            tree = self._build_tree(node) # construct whole tree
            tree = self._get_cluster_labels_for_leafs(tree) # get labels for leafs
            tree = self._cut_nodes_from_leafs(tree) # cut nodes from bottom of the tree until only leafs with a unique cluster_label exist (Number leaves = MAX_LEAFS)
        elif self.mode == "FBE":
            self.temp_n_leafs = 1
            self.maxDepth = 0
            self._get_maxDepth(0, 0)
            depth = 0
            #print("maxDepth: {}".format(self.maxDepth))
            # build tree by level: create first all children for level 1, then level 2... 
            # Prevents that a tree creates children just in one branch and always goes deeper in case of a max number of leavese
            while self.temp_n_leafs < MAX_LEAFS and depth <= self.maxDepth:
            #    print("\n\ndepth: {}, temp_n_leafs: {}".format(depth, self.temp_n_leafs))
                tree = self._build_tree(node, depth)
                depth += 1

        assert isinstance(tree, Node)
        self.tree = tree
        print("Count nodes: {}; leafs: {}".format(self.count_nodes(), self.count_leafs()))


    def _get_maxDepth(self, i, depth):
        """ get max depth of tree"""
        if depth > self.maxDepth:
            self.maxDepth = depth        
        node = self.tree_hierarchy.iloc[i]
        if len(node.children) == 1:
            self._get_maxDepth(node.children[0], depth+1)
        elif len(node.children) == 2:
            self._get_maxDepth(node.children[0], depth+1)
            self._get_maxDepth(node.children[1], depth+1)


        
    def _get_cluster_labels_for_leafs(self, node):
        """ 
            Get's the cluster labels for each leafs using the cluster labels assigned by
            the output of the sklearn agglomerative clustering algorithm.
        """        
        if len(node.children) == 0: #leaf
            cluster_label = self.clusters_predict[node.node_id]
            node.set_clusterLabel(cluster_label)
        else: # no leaf
            self._get_cluster_labels_for_leafs(node.children[0])
            self._get_cluster_labels_for_leafs(node.children[1])
        return node
    
    def _cut_nodes_from_leafs(self, node):
        """ 
            self.mode == sklearn:
            Children of nodes, who are leafs and have the same cluster_label, are cut off
            and the parent node takes the cluster label of its children.
            This is done recursively until there are only leafs with unique cluster_labels 
            Number of leaves = MAX_LEAFS
        
            self.mode == FBE:
            Towards the bottom of the tree, it may happen that a node has only child, which has only one child, 
            and this child also has only one child, etc. Several nodes following of each other with only one child.
            In this case keep only the child C whose parent has two children and cut the child of C.
        """
        if self.mode == "sklearn":
            if len(node.children) > 0: 
                left_child = node.children[0]
                right_child = node.children[1]
                if left_child.cluster_label is None: # left child is not leaf 
                    self._cut_nodes_from_leafs(left_child)
                if right_child.cluster_label is None: # right child is not leaf 
                    self._cut_nodes_from_leafs(right_child)

                # should be updated now
                left_child = node.children[0]
                right_child = node.children[1]
                if left_child.cluster_label == right_child.cluster_label and left_child.cluster_label is not None:
                    node.children = []
                    node.cluster_label = left_child.cluster_label
                    return node
        elif self.mode == "FBE":
            if len(node.children) == 1: # node has only one child
                temp = node
                while len(temp.children) == 1: # check if several nodes following of each other have only one child two
                    temp = temp.children[0]
                if len(temp.children) == 2: # if at some point a node has two children, continue to search
                    self._cut_nodes_from_leafs(temp.children[0])    
                    self._cut_nodes_from_leafs(temp.children[1])
                else: # if we reached a leaf, cut the node's children
                    node.children = []
            elif len(node.children) == 2:
                self._cut_nodes_from_leafs(node.children[0])    
                self._cut_nodes_from_leafs(node.children[1])             
        return node    

    
        
    def fitTree(self, node, data):
        """ Updates all the nodes of the tree according to the clustering from bottom to top """

        assert isinstance(node, Node)
        if len(node.children) > 0: # no leaf
            for child in node.children:
                self.fitTree(child, data)
        else: # leaf
            if self.mode == "sklearn": 
                leaf_cluster_label = node.cluster_label
                abstract_hits = data[data["class_predict"] == leaf_cluster_label]
                for i, row in abstract_hits.iterrows():
                    leaf_abstract_id = row.name
                    leaf_abstract_class_true = row.mesh_ui_diab # true class 
                    self._update_leaf_to_root(node, leaf_abstract_id, leaf_abstract_class_true)
            elif self.mode == "FBE": # several documents per leaf
                leaf_cluster_label = node.cluster_label
                abstract_hits = data[data["uniqueCluster"] == leaf_cluster_label]
                for i, row in abstract_hits.iterrows():
                    leaf_abstract_id = row["id"]
                    leaf_abstract_class_true = row["mesh_ui_diab"]
                    self._update_leaf_to_root(node, leaf_abstract_id, leaf_abstract_class_true)
            else: 
                print("ERROR: mode should be one of ['sklearn', 'FBE']")
        return node
         
            
    def count_nodes(self, tree=None):
        self.n_nodes = 0
        def _walk_count_nodes(node):
            self.n_nodes += 1
            for child in node.children:
                _walk_count_nodes(child)   
                
        if tree == None:
            _walk_count_nodes(self.tree)
        else:
            _walk_count_nodes(tree)
        return self.n_nodes

                
    def count_leafs(self, tree=None):

        def _walk_count_leafs(node):
            if node.children == []:
                self.n_leafs += 1
                self.leaf_nodes.append(node)
            else:
                for child in node.children:
                    _walk_count_leafs(child)
        
        self.n_leafs = 0
        self.leaf_nodes = []
        if tree == None:
            _walk_count_leafs(self.tree)
        else:
            _walk_count_leafs(tree)
        return self.n_leafs
    
    
    def get_leaf_nodes(self):
        def _walk_leaf_nodes(node):
            if node.children == []:
                self.leaf_nodes.append(node)
            else:
                for child in node.children:
                    _walk_leaf_nodes(child)
        
        self.leaf_nodes = []
        _walk_leaf_nodes(self.tree)
        return self.leaf_nodes
    
    def _walk_precision(self, node):
        node_precision = node.get_precision()
        self.precision_all_nodes.append(node_precision)
        self.precision_all_nodes_weighted.append(node_precision * node.counts)
        self.precision_all_nodes_weights += node.counts
        for child in node.children:
            self._walk_precision(child)
            
    def get_precision(self):
        self.precision_all_nodes = []
        self.precision_all_nodes_weighted = []
        self.precision_all_nodes_weights = 0
        self._walk_precision(self.tree)
        self.precision_macro = np.mean(self.precision_all_nodes)
        self.precision_micro = np.sum(self.precision_all_nodes_weighted) / self.precision_all_nodes_weights
        return {"prec_macro" : self.precision_macro
                , "prec_micro" : self.precision_micro}

        
    def get_recall(self):
        
        self.recall_all_classes = []
        self.recall_all_classes_weighted = []
        def _walk_recall(node, c):
            """ Get cluster with max documents of class c in which class c is the majority class """
            class_counts = Counter(node.true_classes).most_common()
            majority_classes = [c for c, occ in class_counts  if occ == class_counts[0][1]] # there can be several majority classes in a node
            #majority_class = Counter(node.true_classes).most_common()[0][0]
            occ = node.true_classes.count(c)
            #print()
            #print(node)
            #print("\t{}".format(node.true_classes))
            #print("\tmajority_classe: {}, occ({}): {}".format(majority_classes, c, occ))
            if c in majority_classes and occ > self.temp_max_occ_class_in_cluster:
                self.temp_max_occ_class_in_cluster = occ
            #    print("\t updatetemp_max_occ_class_in_cluster: {}".format(self.temp_max_occ_class_in_cluster))
            #if (occ > self.temp_max_occ_class_in_cluster 
            #    and (c in majority_classes or node.children == [])
            #   ): # if we found a cluster with higher occ of documents for class c and the class c is the majority class in the cluster or leaf node
            #    self.temp_max_occ_class_in_cluster = occ
            #    print("\tupdatetemp_max_occ_class_in_cluster: {}".format(self.temp_max_occ_class_in_cluster))
            
            #if (occ > self.temp_max_occ_class_in_cluster and c in majority_classes):
            # self.temp_max_occ_class_in_cluster = occ
            #    print("\MAJ: tupdatetemp_max_occ_class_in_cluster: {}".format(self.temp_max_occ_class_in_cluster))
            #elif (occ > self.temp_max_occ_class_in_cluster and node.children == []):
            #    self.temp_max_occ_class_in_cluster = occ
            #    print("\tLEAF: updatetemp_max_occ_class_in_cluster: {}".format(self.temp_max_occ_class_in_cluster))
    
            for child in node.children:
                _walk_recall(child, c)
        
        weights_sum = 0
        for c in self.true_classes_documents_unique:
            N_c = self.true_classes_documents.count(c)
            #print("\nc: {}, N_c: {}".format(c, N_c))
            self.temp_max_occ_class_in_cluster = 0
#            _walk_recall(self.tree, c)
            # TODO: check if it is right!
            # # start with children; otherwise recalls for all classes will be highest in root
            _walk_recall(self.tree.children[0], c) 
            _walk_recall(self.tree.children[1], c)
            recall = self.temp_max_occ_class_in_cluster / N_c
            #print("c: {}, recall: {}".format(c, recall))

            self.recall_all_classes.append(recall) #len(self.unique_cluster_predict))
            self.recall_all_classes_weighted.append(recall * N_c)
            weights_sum += N_c
        self.recall_macro = np.mean(self.recall_all_classes)
        self.recall_micro = np.sum(self.recall_all_classes_weighted) / weights_sum
        return {"recall_macro" : self.recall_macro
                ,"recall_micro" : self.recall_micro}
    
    def get_F1(self):
        precision = self.get_precision()
        recall = self.get_recall()        
        
        self.F1_macro = 2*precision["prec_macro"]*recall["recall_macro"] / (precision["prec_macro"] + recall["recall_macro"])
        self.F1_micro = 2*precision["prec_micro"]*recall["recall_micro"] / (precision["prec_micro"] + recall["recall_micro"])
        return {"F1_macro":self.F1_macro
               ,"F1_micro":self.F1_micro}


    def _get_child_mesh_classes(self, meshId, currentMesh, foundMeshInHierarchy=False): 
        """ For a given meshId, get all its child meshId's from meshHierarchy """
        if meshId == currentMesh.id:
            foundMeshInHierarchy = True
        if foundMeshInHierarchy:
            self.temp_mesh_and_its_child_classes.append(currentMesh)
        for mesh_child in currentMesh.children:
            self._get_child_mesh_classes(meshId, mesh_child, foundMeshInHierarchy)


    def F1_zhao(self, evaluateOnlyOnLeafs=False):
        """ F1 score like in Evaluation of Hierarchical Clustering Algorithms forDocument Datasets from Zhao & Karypis """
        
        def _walk_F1_zhao(node, mesh_and_child_classes, N_c, evaluateOnlyOnLeafs):
            """ 
                Calculates F1 Score for a given list of mesh codes and its children mesh_and_child_classes (N_c = total number of documents of class c) 
                evaluateOnlyOnLeafs [True, False] : calculate F1 score only on leafs or on all nodes
            """
            #print("\t{}".format(node))
            #print("abstracts in node:")
            #print(node.true_classes)
            #for m in mesh_and_child_classes:
            #    print("\t\t mesh: {}; count mesh in node: {}".format(m, node.true_classes.count(m.id)))
            class_count = np.sum([node.true_classes.count(m.id) for m in mesh_and_child_classes])# + node.true_classes.count(childs of class c)
            prec =  class_count / node.counts
            recall = class_count / N_c #+ all documents from all children of c
            if prec > 1e-10 or recall > 1e-10: # if prec or recall == 0 
                F1 = 2 * prec * recall / (prec+recall)
            else:
                F1 = 0
            #print("\tclass_count: {}, prec: {}, recall: {}, F1: {}".format(class_count, prec, recall, F1))            
            if F1 > self.temp_max_doc_perClass_inCluster:
                self.temp_max_doc_perClass_inCluster = F1

            if not evaluateOnlyOnLeafs:
                for child in node.children:
                    _walk_F1_zhao(child, mesh_and_child_classes, N_c, evaluateOnlyOnLeafs)        
        
        if evaluateOnlyOnLeafs:
            leafs = self.get_leaf_nodes()
        
        FScore_sum = 0
        for meshid in self.true_classes_documents_unique:
            self.temp_mesh_and_its_child_classes = [] # reset 
            self._get_child_mesh_classes(meshid, MESH_HIERARCHY) 
            mesh_and_child_classes = self.temp_mesh_and_its_child_classes
            #N_c = self.true_classes_documents.count(c)
            N_c = np.sum([self.true_classes_documents.count(m.id) for m in mesh_and_child_classes]) #+ all documents from all children of c
            N = len(self.true_classes_documents)
            #print("\nc: {}, N_c: {}, N: {}".format(meshid, N_c, N, N_c/N))
            #print("\t, mesh_childs: {}".format( mesh_and_child_classes))
            self.temp_max_doc_perClass_inCluster = 0
            if evaluateOnlyOnLeafs == False: # evaluate on all nodes
                _walk_F1_zhao(self.tree.children[0], mesh_and_child_classes, N_c, evaluateOnlyOnLeafs) 
                _walk_F1_zhao(self.tree.children[1], mesh_and_child_classes, N_c, evaluateOnlyOnLeafs)
            else: # only leafs
                for leaf in leafs:
                    _walk_F1_zhao(leaf, mesh_and_child_classes, N_c, evaluateOnlyOnLeafs)
            #print("Best F1: {}".format(self.temp_max_doc_perClass_inCluster))
            FScore_sum += (N_c / N ) * self.temp_max_doc_perClass_inCluster
            #print("Score: {}".format((N_c / N ) * self.temp_max_doc_perClass_inCluster))
            
        return FScore_sum
    
    def get_isim(self, data):
        """ Internal similarity """
        
        I_sum = 0 
        def _walk_isim(node):
            
            print("Node: {}".format(node))
            print("abstracts: {}".node.abstracts)
            for child in node.children:
                _walk_isim(child)
            

    
    def get_performances(self, evaluateOnlyOnLeafs=False):
        precision = self.get_precision()
        recall = self.get_recall()
        F1 = self.get_F1()
        return({
            "prec_micro" : precision["prec_micro"]
            ,"prec_macro" : precision["prec_macro"]            
            ,"recall_micro" : recall["recall_micro"]
            ,"recall_macro" : recall["recall_macro"]
            ,"F1_micro" : F1["F1_micro"]
            ,"F1_macro" : F1["F1_macro"]
            ,"F1_zhao" : self.F1_zhao(evaluateOnlyOnLeafs=evaluateOnlyOnLeafs)
        })
 


class Node(object):
    "Generic tree node."
    def __init__(self, Id, depth, parent=None, cluster_label=None, children=[]):
        self.node_id = Id
        self.parent = parent
        self.children = []
        self.depth = depth
        self.cluster_label = cluster_label # In case FBE: this is the filterValue in the leafs
        self.abstracts = [] # PMID's of abstracts 
        self.true_classes = [] # True classes for each abstract
        self.counts = 0
        self.recall = None
        self.precision = None 
        self.F1 = None
        if children is not None:
            for child in children:
                self.add_child(child)
                
    def __repr__(self):
        return "Node id: {} (depth: {}, cluster_label: {}, children: {})".format(
            self.node_id
            , self.depth
            , self.cluster_label
            , [child.node_id for child in self.children])
    
    def add_child(self, node):
        assert isinstance(node, Node)
        self.children.append(node)
        
    def set_clusterLabel(self, clusterLabel):
        self.cluster_label = clusterLabel
        
    def pretty_print(self, depth=0):
        
        if self.depth == depth: 
            print("Node: {}, Parent: {} (Depth: {}, counts: {}, cluster_label: {}) | Children: {}".format(self.node_id, self.parent, self.depth, self.counts, self.cluster_label, self.children))
            print("\tAbstracts: {}".format(Counter(self.abstracts)))
            print("\ttrue_classes: {}".format(Counter(self.true_classes)))
        else:
            for child in self.children:
                child.pretty_print(depth)
            
            
    def update_node(self, abstract_id, true_class):
        """ Updates the abstracts and its true class label running through this node """
        self.abstracts.append(abstract_id)
        self.true_classes.append(true_class)
        self.counts += 1
        
        
    def get_precision(self):
        count = Counter(self.true_classes)
        mostFrequent = max(self.true_classes, key=count.get)
        prec = self.true_classes.count(mostFrequent) / self.counts
        return prec

    def count_class_occurrence(self, c):
        return self.true_classes.count(c)
    

#

In [2]:
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs_03082020/diabetes_abstracts_HC_output.parquet")
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/output_withoutRootClassDiabetesMellitus_stopWordRemoval_K48/diabetes_abstracts_HC_output.parquet")
data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/output_withoutRootClassDiabetesMellitus_K48/diabetes_abstracts_HC_output.parquet")
#data.index = data.index.get_level_values("PMID")
data.reset_index(drop=True, inplace=True)
#data["PMID"] = pd.to_numeric(data["PMID"])
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs_03082020/diabetes_abstracts_tree_output.parquet')
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/output_withoutRootClassDiabetesMellitus_stopWordRemoval_K48/diabetes_abstracts_tree_output.parquet')
HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/output_withoutRootClassDiabetesMellitus_K48/diabetes_abstracts_tree_output.parquet')

# Ex. 10 samples
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_HC_output_10Examples.parquet")
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_tree_output_10Examples.parquet')

# Ex. 30 samples
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_HC_output_30Examples.parquet")
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_tree_output_30Examples.parquet')

## TEST TREE
#data = pd.DataFrame({"PMID": [0, 1, 2, 3, 4, 5]
#                    , "class_predict": [3, 0, 0, 0, 1, 2]}
#                   , columns=["PMID", "class_predict"]).set_index("PMID")

#HC_tree = pd.DataFrame({"node_id":[6, 7, 8, 9, 10]
#                    , "left" : [1, 2, 0, 5, 8]
#                    , "right" :[3, 6, 4, 7, 9]}
#                   , columns=["node_id", "left", "right"])

print("Tree nodes: {}".format(HC_tree.shape))
print(HC_tree.head())
print("data size: {}".format(data.shape))
print(list(set(data["class_predict"])))
#print(list(set(data["mesh_ui_diab"])))
#print(data["mesh_ui_diab"])

#df_vec = (data.title + " " + data.abstract).map(lambda abstract: avg_feature_vector(abstract))
#print(type(df_vec))
#df_vec = np.stack(df_vec.values, axis = 0)
#print(df_vec.shape)
#print(type(df_vec))

print(data.mesh_mh_diab.value_counts())
data.head(2)



Tree nodes: (50910, 3)
   node_id   left  right
0    50911  34157  45500
1    50912  18450  47237
2    50913   2323  43884
3    50914   1011  12815
4    50915  11145  19489
data size: (50911, 10)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]
Diabetic Retinopathy                          5000
Diabetes Mellitus, Type 1                     5000
Diabetes Mellitus, Experimental               5000
Diabetes Mellitus, Type 2                     5000
Diabetes Complications                        5000
Diabetic Nephropathies                        5000
Diabetes, Gestational                         5000
Diabetic Foot                                 4424
Diabetic Neuropathies                         3662
Diabetic Angiopathies                         3026
Diabetic Ketoacidosis                         1308
Fetal Macrosomia                              1282
Prediabet

Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,class_predict
0,28800712,Outcomes Achieved With Use of a Prefabricated ...,BACKGROUND\nThe total contact cast (TCC) is co...,2017-10,"D000328,D000367,D000368,D000369,D002370,D01533...","Adult,Age Factors,Aged,Aged, 80 and over,Casts...",D017719,Diabetic Foot,outcomes achieved with use of a prefabricated ...,10
1,6989594,Investigation of insulin sensitivity in early ...,Twenty-three normal weight subjects without an...,1980-01,"D001786,D001835,D005230,D005951,D006801,D00732...","Blood Glucose,Body Weight,Fatty Acids, Noneste...",D011236,Prediabetic State,investigation of insulin sensitivity in early ...,10


In [99]:
# ADD vectors to abstracts

#model = KeyedVectors.load_word2vec_format("/home/adrian/PhD/Data/Word2Vec/BioASQvectors2018/pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin", binary=True)
#index2word_set = set(model.wv.index2word)
# clean for BioASQ
#bioclean = lambda t: re.sub('[.,?;*!%^&_+():-\[\]{}]', '', t.replace('"', '').replace('/', '').replace('\\', '').replace("'",'').strip().lower()).split()

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def avg_feature_vector(sentence, model=model, num_features=200, index2word_set=index2word_set):
    #words = sentence.split()
    try:
        words = bioclean(sentence)
    except:
        print("bioclean did not work for: {}".format(sentence))
        print(type(sentence))
        print(math.isnan(sentence))
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
        else:
            if hasNumbers(word):
                print("word not in vocabulary: {}".format(word))
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

#data["vec"] = (data.title + " " + data.abstract).map(lambda abstract: avg_feature_vector(abstract))
#data.head()

NameError: name 'model' is not defined

In [31]:
# initialise
#MAX_LEAFS=8
treeClass = Tree(HC_tree, data["class_predict"], mode="sklearn", true_classes_all=data["mesh_ui_diab"])

# define root node
root = Node(Id=HC_tree["node_id"].max() # In scikit learn, the root node is the one with maximum node id
          , depth=0
          , parent=None
          , children=[])

# build tree
treeClass.set_build_tree(root)

print("N nodes: {}".format(treeClass.count_nodes()))
print("N leafs: {}".format(treeClass.count_leafs()))

treeClass.leaf_nodes

Count nodes: 95; leafs: 48
N nodes: 95
N leafs: 48


[Node id: 98267 (depth: 2, cluster_label: 35, children: []),
 Node id: 28821 (depth: 3, cluster_label: 26, children: []),
 Node id: 100873 (depth: 3, cluster_label: 23, children: []),
 Node id: 48342 (depth: 2, cluster_label: 28, children: []),
 Node id: 12265 (depth: 3, cluster_label: 31, children: []),
 Node id: 101634 (depth: 5, cluster_label: 34, children: []),
 Node id: 101557 (depth: 6, cluster_label: 45, children: []),
 Node id: 101725 (depth: 6, cluster_label: 12, children: []),
 Node id: 23755 (depth: 5, cluster_label: 24, children: []),
 Node id: 18883 (depth: 7, cluster_label: 47, children: []),
 Node id: 101525 (depth: 7, cluster_label: 37, children: []),
 Node id: 101749 (depth: 7, cluster_label: 15, children: []),
 Node id: 90529 (depth: 8, cluster_label: 39, children: []),
 Node id: 78360 (depth: 10, cluster_label: 25, children: []),
 Node id: 101329 (depth: 11, cluster_label: 36, children: []),
 Node id: 101746 (depth: 11, cluster_label: 14, children: []),
 Node id: 101

In [32]:
# fit tree with abstracts 
tree_fit = treeClass.fitTree(treeClass.tree, data)

In [33]:
#treeClass.tree.pretty_print(depth=8)

In [35]:
#print(treeClass.get_precision())
#print(treeClass.get_recall())
#print(treeClass.get_F1())
#pprint(treeClass.get_performances())
evaluateonlyleafs=True
print("evaluate only leafs: {}".format(evaluateonlyleafs))
#pprint(treeClass.get_performances(evaluateOnlyOnLeafs=evaluateonlyleafs))
print("F1 score zhao: {}".format(treeClass.F1_zhao(evaluateOnlyOnLeafs=evaluateonlyleafs)))


evaluate only leafs: True
F1 score zhao: 0.6253834269576436


In [61]:
np.sum([3,4,5])


12

# load FeedbackExplorer output

In [3]:
#fbe_path = "/home/adrian/tmp/Test_FBE"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_6_maxClasses1024_Nall"
#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_Nall"
MAX_LEAFS = 32

fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_affectOnlyHighScoreTokens_tryAsPoint_option0"


spark = pyspark.sql.SparkSession.builder.getOrCreate()
sentences = spark.read.load(fbe_path+"/phrases/")
print(len(sentences.columns))
print(sentences.count())

df_short = sentences.select("id", "tokens", "index")
#df_short.printSchema()
df_short.show(2)

515
5000
+--------+--------------------+--------------------+
|      id|              tokens|               index|
+--------+--------------------+--------------------+
|28800712|[outcomes,  , ach...|[8 -> [442 -> 442...|
| 6989594|[investigation,  ...|[164 -> [92 -> 92...|
+--------+--------------------+--------------------+
only showing top 2 rows



In [4]:
nodes = pd.read_json(fbe_path+"/nodes.json", orient="records")
print(nodes.shape)
nodes.head(5)

(256, 21)


Unnamed: 0,name,tagId,color,annotations,algo,strLinks,strClassPath,names,filterMode,filterValue,...,windowSize,classCenters,cError,childSplitSize,children,hits,metrics,rocCurve,externalClassesFreq,purity
0,In Scope,0.0,,"[{'tokens': ['aggregate'], 'tag': 1, 'from': N...",{'value': 'supervised'},{'0': [1]},{'1': [0]},{},{'value': 'allIn'},[0],...,0.0,,,,[1],5000,{},{},{},{}
1,Explorer,1.0,,"[{'tokens': ['parentsand'], 'tag': 2, 'from': ...",{'value': 'clustering'},"{'1': [2, 3]}","{'2': [0, 1], '3': [0, 1]}",{},{'value': 'anyIn'},[1],...,,"{'2': 0, '3': 1}","[0.30347155211811905, 0.20720606913059703]",50.0,"[2, 161]",5000,{},{},{},{}
2,Explorer,,,"[{'tokens': ['predictionprevention'], 'tag': 4...",{'value': 'clustering'},"{'1': [4, 5]}","{'4': [0, 1, 2], '5': [0, 1, 2]}",{},{'value': 'anyIn'},[2],...,,"{'4': 0, '5': 1}","[0.30016739899043904, 0.220015458512949]",50.0,"[3, 90]",3906,{},{},{},{}
3,Explorer,,,"[{'tokens': ['diabetes'], 'tag': 8, 'from': No...",{'value': 'clustering'},"{'1': [8, 9]}","{'8': [0, 1, 2, 4], '9': [0, 1, 2, 4]}",{},{'value': 'anyIn'},[4],...,,"{'8': 0, '9': 1}","[0.290029150982945, 0.264503354541825]",50.0,"[4, 61]",2980,{},{},{},{}
4,Explorer,,,"[{'tokens': ['diabetes'], 'tag': 20, 'from': N...",{'value': 'clustering'},"{'1': [20, 21]}","{'20': [0, 1, 2, 8, 4], '21': [0, 1, 2, 8, 4]}",{},{'value': 'anyIn'},[8],...,,"{'20': 0, '21': 1}","[0.262124534816262, 0.28412328531006303]",50.0,"[5, 40]",2832,{},{},{},{}


In [5]:
# Get list with all possible classes in the sentences file
import pyspark
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *


def get_list_all_possible_classes(sentences, data ):
    """ Get the list of all possible occuring classes in the sentences file """
    join_udf = udf(lambda x: ";".join(x))
    sentences_classes_udf = udf(lambda x: ";".join([str(v) for v in x.keys()]))

    sentences_transformed = sentences.select("id"
                                            , "tokens"
                                            , sentences_classes_udf('index').alias("all_classes")) \
                                    .withColumn("tokens", join_udf(col("tokens"))) 

    sentences_pdf = sentences_transformed.toPandas()
    sentences_pdf["id"] = pd.to_numeric(sentences_pdf["id"])

    # add true class labels to sentences from data by merge/join 
    sentences_pdf["PMID"] = sentences_pdf["id"]
    sentences_pdf["PMID"] = pd.to_numeric(sentences_pdf["PMID"])
    meshDiab = data[["PMID", "mesh_ui_diab"]]
    meshDiab["PMID"] = pd.to_numeric(meshDiab["PMID"])
    sentences_pd_with_classes = pd.merge(sentences_pdf, meshDiab, on='PMID', how="left")

    print("\tsentences_pdf: {}".format(sentences_pdf.shape))
    print("\tmeshDiab: {}".format(meshDiab.shape))
    print("\tmerged: {}".format(sentences_pd_with_classes.shape))

    
    # list of all classes in the sentences file
    return (set(pd.to_numeric(sentences_pdf["all_classes"].map(lambda sentence: sentence.split(";")).explode()).values)
            , sentences_pd_with_classes)


In [6]:
sentences_all_classes, sentences_pd_with_classes = get_list_all_possible_classes(sentences, data)
print("Number of classes in sentences file: {}".format(len(sentences_all_classes)))
print("Merged dataset with true classes: {}".format(sentences_pd_with_classes.shape))
sentences_pd_with_classes.head()

	sentences_pdf: (5000, 4)
	meshDiab: (50911, 2)
	merged: (5000, 5)
Number of classes in sentences file: 238
Merged dataset with true classes: (5000, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,tokens,all_classes,PMID,mesh_ui_diab
0,28800712,outcomes; ;achieved; ;with; ;use; ;of; ;a; ;pr...,0;1;2;66;51;420;4;485;21;8;31,28800712,D017719
1,6989594,investigation; ;of; ;insulin; ;sensitivity; ;i...,0;1;2;164;20;4;439;8;44;28;429,6989594,D011236
2,524360,ultrastructural; ;pathology; ;of; ;peripheral;...,256;16;0;1;2;5;69;10;94,524360,D003929
3,21199315,evidence;-;based; ;interventional; ;pain; ;med...,0;1;2;66;51;420;4;485;21;8;31,21199315,D003929
4,24607755,delivery; ;timing; ;and; ;cesarean; ;delivery;...,0;1;2;435;20;4;167;8;443;28;45,24607755,D016640


In [7]:
# initialise
treeFBE = Tree(nodes
            #, list(set(data["class_predict"]))
            , mode="FBE"
            , sentences_all_classes=sentences_all_classes
            , true_classes_all=sentences_pd_with_classes["mesh_ui_diab"])

# define root node
root = Node(Id=1, depth=0, parent=None, children=[]) # Id = 1 because start at Explorer 

# build tree
#maxDepth = 10
treeFBE.set_build_tree(root)

print("Number leafs: {}".format(treeFBE.count_leafs()))

Count nodes: 68; leafs: 32
Number leafs: 32


In [8]:
# Associate cluster to each sentence
from pyspark.sql.functions import isnan, when, count, col

from pyspark.sql.functions import udf, col
from pyspark.sql.types import *

def matchCluster(index_map, cluster): 
    """ gets for each abstract its unique cluster (filterValue) from the index"""
    return list(set(list(index_map.keys())).intersection(set(cluster)))[0]

def associate_unique_cluster_to_documents(sentences, tree):
    """ Associates unique cluster to each document """
    leafs = tree.get_leaf_nodes()
    print("N leafs: {}".format(len(leafs)))
    cluster = [leaf.cluster_label for leaf in leafs]
    print("N clusters: {}".format(len(set(cluster))))

    matchCluster_udf = udf(lambda y: matchCluster(y, cluster))
    join_udf = udf(lambda x: ";".join(x))

    sentences_transformed = sentences.select("id", "tokens", matchCluster_udf('index').alias("uniqueCluster")) \
                        .withColumn("tokens", join_udf(col("tokens"))) 


    sentences_transformed.select([count(when(isnan(c), c)).alias(c) for c in sentences_transformed.columns]).show()
    #sentences.select('index', matchClass_udf('index').atlias("uniqueCluster")).groupby("uniqueCluster").count().show()
    sentences_pd = sentences_transformed.toPandas()
    sentences_pd["id"] = pd.to_numeric(sentences_pd["id"])
    sentences_pd["uniqueCluster"] = pd.to_numeric(sentences_pd["uniqueCluster"])
    
    # add true class labels to data by merge/join 
    sentences_pd["PMID"] = sentences_pd["id"]
    sentences_pd["PMID"] = pd.to_numeric(sentences_pd["PMID"])
    meshDiab = data[["PMID", "mesh_ui_diab"]]
    meshDiab["PMID"] = pd.to_numeric(meshDiab["PMID"])
    sentences_pd_with_classes_uniqueCluster = pd.merge(sentences_pd, meshDiab, on='PMID', how="left")
    #print("sentences_pd: {}".format(sentences_pd.shape))
    #print("meshDiab: {}".format(meshDiab.shape))
    #print("sentences_pd_with_classes_uniqueCluster: {}".format(sentences_pd_with_classes.shape))
    
    return sentences_pd_with_classes_uniqueCluster 


In [10]:
sentences_pd_with_classes_uniqueCluster = associate_unique_cluster_to_documents(sentences, treeFBE)
print("Unique clusters in sentences: {}".format(sentences_pd_with_classes_uniqueCluster["uniqueCluster"].nunique())) #####

sentences_pd_with_classes_uniqueCluster.head()

N leafs: 32
N clusters: 32


PythonException: 
  An exception was thrown from Python worker in the executor. The below is the Python worker stacktrace.
Traceback (most recent call last):
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 223, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 141, in dump_stream
    for obj in iterator:
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 212, in _batched
    for item in iterator:
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 90, in <lambda>
    return lambda *a: f(*a)
  File "/home/adrian/miniconda3/envs/deepscience/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-8-109febaf9a1a>", line 18, in <lambda>
  File "<ipython-input-8-109febaf9a1a>", line 9, in matchCluster
IndexError: list index out of range


In [29]:
treeFBE.fitTree(treeFBE.tree, sentences_pd_with_classes_uniqueCluster)

Node id: 1 (depth: 0, cluster_label: None, children: [2, 151])

In [30]:
#print(treeFBE.get_precision_macro())
#print(treeFBE.get_recall_macro())
#print(treeFBE.get_F1())
print(treeFBE.get_performances())

{'prec_micro': 0.2099545142183972, 'prec_macro': 0.428851540150752, 'recall_micro': 0.5092, 'recall_macro': 0.28042934619697757, 'F1_micro': 0.29731813268585316, 'F1_macro': 0.3391112303598232, 'F1_zhao': 0.928483697139779}


In [15]:
from pprint import pprint
print("Performance FBE:")
pprint(treeFBE.get_performances())
print()
print("Performance scikit learn: ")
pprint(treeClass.get_performances())

Performance FBE:
{'F1_macro': 0.2506777108451753,
 'F1_micro': 0.2435437686351443,
 'F1_zhao': 0.6847747661652462,
 'prec_macro': 0.32597387918191484,
 'prec_micro': 0.17975198509950005,
 'recall_macro': 0.2036393853315089,
 'recall_micro': 0.3775215572273183}

Performance scikit learn: 
{'F1_macro': 0.4313541702730686,
 'F1_micro': 0.18398582428756732,
 'F1_zhao': 0.7470408180623018,
 'prec_macro': 0.5557336418367507,
 'prec_micro': 0.10626421882440082,
 'recall_macro': 0.35246787518737727,
 'recall_micro': 0.6849796704052169}


In [106]:
sentences.select('index', matchClass_udf('index').alias("uniqueCluster")).groupby("uniqueCluster").count().show()



+-----------+-----+
|uniqueClass|count|
+-----------+-----+
|       1159|   21|
|       1090|  234|
|        296|   51|
|        691|   33|
|        125|    3|
|        666|  256|
|       1280|  334|
|        124| 1199|
|        718|  312|
|        740| 1173|
|        169|   41|
|        747|   46|
|       1425|   19|
|        577|    5|
|        272|   25|
|         54|  968|
|        282|    7|
|        232|    1|
|        483|   27|
|       1158|    5|
+-----------+-----+
only showing top 20 rows



### Calculate Performance for different FBE configurations. 
Limit number of abstracts to N; top words: 4,6,8 and other configurations

In [11]:
MAX_LEAFS = 32

#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000"
#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_Nall"
#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_shuffleRepartition"
#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_affectonlyclassifhighestscore"
#fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_affectOnlyHighScoreTokens_tryAsPoint_option0"
fbe_path = "/home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_tryAsPoint_option0"

#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_4_N5000"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_6_Nall"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_6_N5000"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_4_N2000"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_6_parallelism1_N5000"
#fbe_path = "/home/adrian/workspace/FBE output/maxTopwords_6_N5000_shuffleRepartition"

print("Load data file: {}".format(fbe_path))
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sentences = spark.read.load(fbe_path+"/phrases/")
df_short = sentences.select("id", "tokens", "index")
print("N sentences: {}".format(sentences.count()))

print("Load tree..")
nodes = pd.read_json(fbe_path+"/nodes.json", orient="records")
print("nodes: {}".format(nodes.shape))

print("Get list with all possible classes in the sentences file..")
sentences_all_classes, sentences_pd_with_classes = get_list_all_possible_classes(sentences, data)
print("Number of classes in sentences file: {}".format(len(sentences_all_classes)))
print("Merged dataset with true classes: {}".format(sentences_pd_with_classes.shape))

print("initialise tree..")
treeFBE = Tree(nodes
            , mode="FBE"
            , sentences_all_classes=sentences_all_classes
            , true_classes_all=sentences_pd_with_classes["mesh_ui_diab"])

root = Node(Id=1, depth=0, parent=None, children=[]) # Id = 1 because start at Explorer 
treeFBE.set_build_tree(root)

print("Associate cluster to each sentence..")
sentences_pd_with_classes_uniqueCluster = associate_unique_cluster_to_documents(sentences, treeFBE)
print("Unique clusters in sentences: {}".format(sentences_pd_with_classes_uniqueCluster["uniqueCluster"].nunique())) #####

print("Fit..")
treeFBE.fitTree(treeFBE.tree, sentences_pd_with_classes_uniqueCluster)
evaluateonlyleafs=False
print("evaluate only leafs: {}".format(evaluateonlyleafs))
pprint(treeFBE.get_performances(evaluateOnlyOnLeafs=evaluateonlyleafs))

Load data file: /home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/maxTopwords_6_N5000_tryAsPoint_option0
N sentences: 5000
Load tree..
nodes: (190, 21)
Get list with all possible classes in the sentences file..


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


	sentences_pdf: (5000, 4)
	meshDiab: (50911, 2)
	merged: (5000, 5)
Number of classes in sentences file: 158
Merged dataset with true classes: (5000, 5)
initialise tree..
Count nodes: 78; leafs: 32
Associate cluster to each sentence..
N leafs: 32
N clusters: 32
+---+------+-------------+
| id|tokens|uniqueCluster|
+---+------+-------------+
|  0|     0|            0|
+---+------+-------------+



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unique clusters in sentences: 32
Fit..
evaluate only leafs: False
{'F1_macro': 0.2283475803834915,
 'F1_micro': 0.17171571625710172,
 'F1_zhao': 0.6591743123046492,
 'prec_macro': 0.5006499677522831,
 'prec_micro': 0.1266880583577645,
 'recall_macro': 0.14790330606781005,
 'recall_micro': 0.2664}


In [None]:
### Performances for directories in : /home/adrian/workspace/FBE output
### old metrics not taking the hierarchy of the mesh classes into consideration


In [None]:
### Performances for directories in : /home/adrian/workspace/FBE output_without_root_class_diabetesMellitus/
### metrics taking the hierarchy of the mesh classes into consideration
### excluded abstract having the root class diabetesMellitus

Topwords 6; N = 5000 , N leafs: 32, MAX_LEAFS = 32 - 13-08-2020
{'F1_macro': 0.2283475803834915,
 'F1_micro': 0.17171571625710172,
 'F1_zhao': 0.6591743123046493,
 'prec_macro': 0.5006499677522831,
 'prec_micro': 0.1266880583577645,
 'recall_macro': 0.14790330606781005,
 'recall_micro': 0.2664}

Topwords 4; N = 5000, N_leafs 10, MAX_LEAFS = 32 - 13-08-2020
{'F1_macro': 0.14595941787205505,
 'F1_micro': 0.17322179714375205,
 'F1_zhao': 0.6548679412028475,
 'prec_macro': 0.37766171374779717,
 'prec_micro': 0.1681580344278456,
 'recall_macro': 0.09046035377974807,
 'recall_micro': 0.1786}

Topwords 8; N = 5000, N leafs: 32, MAX_LEAFS = 32 - 13-08-2020
{'F1_macro': 0.22784874417496395,
 'F1_micro': 0.2766414682309211,
 'F1_zhao': 0.7375354163826922,
 'prec_macro': 0.30292190695722104,
 'prec_micro': 0.23253333333333334,
 'recall_macro': 0.1825959691243019,
 'recall_micro': 0.3414}

Topwords 10; N = 5000, N leafs: 32, MAX_LEAFS = 32 - 13-08-2020
{'F1_macro': 0.21299064343968985,
 'F1_micro': 0.2603354180581699,
 'F1_zhao': 0.6163431147102084,
 'prec_macro': 0.2732769434400456,
 'prec_micro': 0.21216666666666667,
 'recall_macro': 0.17449594092020146,
 'recall_micro': 0.3368}


Topwords 6; N = 5000 ; parallelism 1, N leafs: 32, MAX_LEAFS = 32 - 13-08-2020
{'F1_macro': 0.2283475803834915,
 'F1_micro': 0.17171571625710172,
 'F1_zhao': 0.6591743123046493,
 'prec_macro': 0.5006499677522831,
 'prec_micro': 0.1266880583577645,
 'recall_macro': 0.14790330606781005,
 'recall_micro': 0.2664}

Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS: 32, Shuffle+repartition  - 13-08-2020
{'F1_macro': 0.18794525138269813,
 'F1_micro': 0.21350317737968985,
 'F1_zhao': 0.6500299953630889,
 'prec_macro': 0.2794831542835792,
 'prec_micro': 0.17513448894202033,
 'recall_macro': 0.1415756078311793,
 'recall_micro': 0.2734}

Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS: 32, tryAsPoint option 0 - 20-08-2020
{'F1_macro': 0.2283475803834915,
 'F1_micro': 0.17171571625710172,
 'F1_zhao': 0.6591743123046492,
 'prec_macro': 0.5006499677522831,
 'prec_micro': 0.1266880583577645,
 'recall_macro': 0.14790330606781005,
 'recall_micro': 0.2664}


Topwords 6; N = 5000 ; N leafs: 19, MAX_LEAFS: 32, affect class only if higher score - 13-08-2020
{'F1_macro': 0.22375833207416565,
 'F1_micro': 0.25032671503105836,
 'F1_zhao': 0.7382670537548556,
 'prec_macro': 0.3453166161736336,
 'prec_micro': 0.20733993102414155,
 'recall_macro': 0.1654993019057332,
 'recall_micro': 0.3158}

    Topwords 6; N = 5000 ; N leafs: 16, MAX_LEAFS: 16
    F1 zhao: 0.6121491555899314
    Topwords 6; N = 5000 ; N leafs: 16, MAX_LEAFS: 16, affect class only if higher score
    F1 zhao: 0.6840435816852406
        
Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS: 32, affect class only if higher score  , tryAsPoint option 1 20-08-2020  
{'F1_macro': 0.21133315283887605,
 'F1_micro': 0.23013195516973853,
 'F1_zhao': 0.7340422296618938,
 'prec_macro': 0.428851540150752,
 'prec_micro': 0.2099545142183972,
 'recall_macro': 0.14021467309848876,
 'recall_micro': 0.2546}

Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS: 32, affect class only if higher score  , tryAsPoint option 3  - 20-08-2020  
{'F1_macro': 0.26438731620281275,
 'F1_micro': 0.2752757846646198,
 'F1_zhao': 0.7260631440042862,
 'prec_macro': 0.42729562484854666,
 'prec_micro': 0.22132764765784113,
 'recall_macro': 0.19141103111608776,
 'recall_micro': 0.364}
    
Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS = 32 only evaluation on leafs !!!!
{'F1_macro': 0.2283475803834915,
 'F1_micro': 0.17171571625710172,
 'F1_zhao': 0.6099679983351846,
 'prec_macro': 0.5006499677522831,
 'prec_micro': 0.1266880583577645,
 'recall_macro': 0.14790330606781005,
 'recall_micro': 0.2664}


Spark HC Bisecting Kmeans, N = 5000, F1_Zhao
K=64 : 0.31938839779134753
K=32 : 0.4867501249189368
K=20 : 0.531422305528106
 

####
ALL TWEETS, F1_Zhao scores

FBE 
K64 => leafs 64 : 0.6839356043600647
K32 => leafs 32 : 0.6837704144246328
K20 => leafs 20 : 0.6836459020792343
    
scikitlearn
K128 => leafs 128 : 0.7470408180623019
K64 => leafs 64 : 0.6325404570785932
K48 => leafs 48 : 0.6323785789287327
K32 => leafs 32 : 0.6319947764196064
K20 => leafs 20 : 0.6319261108094082

scikitlearn stop words removed
K128 => leafs 128: 0.7905416892568666
K64 => leafs 64 : 0.7797430966034341
K48 => leafs 48 : 0.6335563625752151
K32 => leafs 32 : 0.632090319013169
K20 => leafs 20 : 0.6319716282425101

    


####
ALL TWEETS, F1_Zhao scores only leafs K=32

FBE 
K64 => leafs 64 : 0.47186540814593
K32 => leafs 32 : 0.4923950663139429
K20 => leafs 20 : 0.4976217764396761
    
spark
K64 : 0.28628270701459085
K32 : 0.4646152670187616
K20 : 0.49505558398827937

scikitlearn
K128 => leafs 128 : 0.7196017046951705
K64 : 0.6307555786658395
K48 : 0.6309833346053897
K32 : 0.6315189572838107
K20 : 0.6315643667948513
    
scikitlearn stop words removed
K128 => leafs 128 : 0.7241706562939704
K64 => leafs 64 : 0.7171002919301714
K48 => leafs 48 : 0.6253834269576436
K32 => leafs 32 : 0.6309985009823309
K20 => leafs 20 : 0.6315492474421875



### Evaluate Spark HC BisectingKmeans

In [119]:
sparkHC = spark.read.parquet("/home/adrian/workspace/Spark_BisectingKmeans_without_root_class_diabetesMellitus/bisectingKmeans_out_K20_N5000.parquet") # 256, 32, 21
#sparkHC = spark.read.parquet("/home/adrian/workspace/Spark_BisectingKmeans/bisectingKmeans_out_K32_N5000.parquet") # 256, 32, 21
#sparkHC = spark.read.parquet("/home/adrian/workspace/Spark_BisectingKmeans/bisectingKmeans_out_K256.parquet") # 256, 32, 21
#sparkHC = spark.read.parquet("/home/adrian/workspace/Spark_BisectingKmeans/bisectingKmeans_out_K128.parquet") # 256, 32, 21
sparkHC_pd = sparkHC.select("PMID", "mesh_ui_diab", "prediction").toPandas()
print(sparkHC_pd.shape)
print(sparkHC_pd["mesh_ui_diab"].value_counts())
sparkHC_pd.head(2)


(5000, 3)
D003930       514
D016640       498
D003921       495
D003924       493
D003922       486
D003928       486
D048909       461
D017719       444
D003929       347
D003925       290
D011236       144
D016883       131
D005320       125
D058065        29
D014929        19
D006944        13
D003926        11
D003923        10
D056731         3
D000071698      1
Name: mesh_ui_diab, dtype: int64


Unnamed: 0,PMID,mesh_ui_diab,prediction
0,28800712,D017719,0
1,6989594,D011236,13


In [120]:
# taking class hierarchy into account

def _get_child_mesh_classes(meshId, currentMesh, foundMeshInHierarchy=False): 
    """ For a given meshId, get all its child meshId's from meshHierarchy """
    global temp_mesh_and_its_child_classes
    if meshId == currentMesh.id:
        foundMeshInHierarchy = True
    if foundMeshInHierarchy:
        temp_mesh_and_its_child_classes.append(currentMesh)
    for mesh_child in currentMesh.children:
        _get_child_mesh_classes(meshId, mesh_child, foundMeshInHierarchy)


true_classes_unique = list(set(sparkHC_pd["mesh_ui_diab"].values.tolist()))

N = sparkHC_pd.shape[0]

FScore_sum = 0
for meshid in true_classes_unique: # For each class class_count = np.sum([node.true_classes.count(m.id) for m in mesh_and_child_classes])# + node.true_classes.count(childs of class c)
    temp_mesh_and_its_child_classes = [] # reset 
    _get_child_mesh_classes(meshid, MESH_HIERARCHY) 
    mesh_and_child_classes = temp_mesh_and_its_child_classes    
    N_c = np.sum([(sparkHC_pd.mesh_ui_diab == m.id).sum() for m in mesh_and_child_classes]) #+ all documents from all children of c
    #print("\nc: {}, N_c: {}, N: {}".format(meshid, N_c, N))
    #print("\t, mesh_childs: {}".format( mesh_and_child_classes))    
    temp_max_F1 = 0
    for current_cluster in range(0, 32): # 21 # For each cluster
        docs_for_current_cluster = sparkHC_pd[sparkHC_pd["prediction"] == current_cluster]
        class_count = np.sum([docs_for_current_cluster[docs_for_current_cluster["mesh_ui_diab"] == m.id].shape[0] for m in mesh_and_child_classes])# + node.true_classes.count(childs of class c)
        prec =  class_count / docs_for_current_cluster.shape[0] #node.counts
        recall = class_count / N_c
        #print("\t\t cluster: {} :: class_count: {}, self.count: {}, p: {:.2f}, r: {:.2f}".format(current_cluster, class_count, docs_for_current_cluster.shape[0], prec, recall))
        if prec > 1e-10 or recall > 1e-10: # if prec or recall == 0 
            F1 = 2 * prec * recall / (prec+recall)
        else:
            F1 = 0

        if F1 > temp_max_F1:
            temp_max_F1 = F1        
            #print("\t\tJIP")
    #print("Best F1: {}".format(temp_max_F1))
    #print("Nc/N + F1 : {}".format((N_c / N ) * temp_max_F1))
    FScore_sum += (N_c / N ) * temp_max_F1
    
FScore_sum



0.531422305528106

In [49]:
# without taking class hierarchy into account
true_classes_unique = list(set(sparkHC_pd["mesh_ui_diab"].values.tolist()))

N = sparkHC_pd.shape[0]

FScore_sum = 0
for meshid in true_classes_unique: # For each class 
    N_c = (sparkHC_pd.mesh_ui_diab == meshid).sum() #true_classes_documents.count(c)
    #print("\nc: {}, N_c: {}, N: {}, N_c/N : {}".format(c, N_c, N, N_c/N))
    temp_max_F1 = 0
    for current_cluster in range(0, 32): # 21 # For each cluster
        docs_for_current_cluster = sparkHC_pd[sparkHC_pd["prediction"] == current_cluster]
        class_count = docs_for_current_cluster[docs_for_current_cluster["mesh_ui_diab"] == meshid].shape[0]#node.true_classes.count(c)
        prec =  class_count / docs_for_current_cluster.shape[0] #node.counts
        recall = class_count / N_c
        #print("\t cluster: {} :: class_count: {}, self.count: {}, p: {:.2f}, r: {:.2f}".format(current_cluster, class_count, docs_for_current_cluster.shape[0], prec, recall))
        if prec > 1e-10 or recall > 1e-10: # if prec or recall == 0 
            F1 = 2 * prec * recall / (prec+recall)
        else:
            F1 = 0

        if F1 > temp_max_F1:
            temp_max_F1 = F1        
            #print("\t\tJIP")
    #print("Best F1: {}".format(temp_max_F1))
    #print("Nc/N + F1 : {}".format((N_c / N ) * temp_max_F1))
    FScore_sum += (N_c / N ) * temp_max_F1
    
FScore_sum

0.26497261586518334

In [22]:
len(true_classes_unique)

21

In [None]:
Hi Francisco,
so to give you an update on the performances.
I changed the F1 score as we said taking the hierarchy of the classes into consideration. 
This said I had to remove abstracts which had as class the root class "Diabetes Mellitus", otherwise the F1 score
would go over 1, I had this case. Also it seems logical, as we only have diabetes abstracts and want to cluster
them in subcategories, we don't need the root class as all abstracts should be in the root. 
So the dataset was reduced from 55000 to 50000 abstracts by taking out the abstracts of the root class.
That is why I had to rerun quite some simulations.

So the performances on all tweets, only the F1_zhao score. In this case I checked the score for several
cluster sizes. The number of leafs is always the same than the number of clusters 

##########
ALL TWEETS, F1_Zhao scores

FBE 
K64 => leafs 64 : 0.6839356043600647
K32 => leafs 32 : 0.6837704144246328
K20 => leafs 20 : 0.6836459020792343
    
scikitlearn
K64 => leafs 64 : 0.6325404570785932
K32 => leafs 32 : 0.6319947764196064
K20 => leafs 20 : 0.6319261108094082

    
Here all tweets but the performance is only evaluated on the leafs, so we could compare with spark
##########
ALL TWEETS, F1_Zhao scores only leafs K=32

FBE 
K64 => leafs 64 : 0.47186540814593
K32 => leafs 32 : 0.4923950663139429
K20 => leafs 20 : 0.4976217764396761
    
spark
K64 : 0.28628270701459085
K32 : 0.4646152670187616
K20 : 0.49505558398827937

scikitlearn
K64 : 0.6307555786658395
K32 : 0.6315189572838107
K20 : 0.6315643667948513

#######
Now the optimisations for FBE. Again the number of leafs should be the same than the MAX_LEAFS,
which is not always the case, sometimes FBE stops creating children

# BASELINE
Topwords 6; N = 5000 , N leafs: 32, MAX_LEAFS = 32
'F1_zhao': 0.6591743123046493

    
# TOPWORDS
Topwords 4; N = 5000, N_leafs 10, MAX_LEAFS = 32
'F1_zhao': 0.6548679412028475

Topwords 8; N = 5000, N leafs: 32, MAX_LEAFS = 32
'F1_zhao': 0.7375354163826922

Topwords 10; N = 5000, N leafs: 32, MAX_LEAFS = 32
'F1_zhao': 0.6163431147102084


# Parallelism 1
Topwords 6; N = 5000 ; parallelism 1, N leafs: 32, MAX_LEAFS = 32
'F1_zhao': 0.6591743123046493

    
# Shuffle + repartition (data...orderBy(rand()).repartition(parallelism))
Topwords 6; N = 5000 ; N leafs: 32, MAX_LEAFS: 32, Shuffle+repartition  
'F1_zhao': 0.6500299953630889


# affect only class if higher score (HERE FBE only created 19 leafs, that is why the score is so much higher)
Topwords 6; N = 5000 ; N leafs: 19, MAX_LEAFS: 32, affect class only if higher score
'F1_zhao': 0.7382670537548556
    
    # that is why i took the baseline and created only 16 leafs in both cases and it seems that the optimisation improves performance
    Topwords 6; N = 5000 ; N leafs: 16, MAX_LEAFS: 16
    F1 zhao: 0.6121491555899314
    Topwords 6; N = 5000 ; N leafs: 16, MAX_LEAFS: 16, affect class only if higher score
    F1 zhao: 0.6840435816852406
    

To sum up:
- On all tweets FBE had a gain of 5% performance over sklearn 
- On all tweets when evaluating only on the leafs FBE looses significantly performance whereas sklearn almost does not
- Changing the topwords to 8 may improve performance
- parallelism 1 is the same performance than the baseline (parallelism 3)
- shuffle+repartition did not improve 
- it seems that affecting the class to only the tokens with highest score improves the performance. 
  But I need to figure out why FBE stops creating children at one point. This is not normal.
    
Tomorrow I took the day for holidays. Monday I would go continue to check out why FBE stops creating children.