In [16]:
import pandas as pd
import numpy as np 
import time
from collections import Counter


MAX_LEAFS = 256

class Tree(object):
    
    def __init__(self, tree_hierarchy, clusters_predict=[], mode="sklearn", sentences_all_classes=None, true_classes_all=None):
        """
        @param mode : Two possible values
            - "FBE" : Tree object for Feedback Explorer output
            - "sklearn" : Tree object for scikit learn output
        
        @param sentences_all_classes : List of all possible classes occuring in the sentences file (only for mode FBE)
        @param true_labels_all : All occuring true labels (mesh codes) of the documents/abstracts
        """
        self.tree = None
        if mode in ["sklearn", "FBE"]:
            self.mode = mode
        else:
            raise ValueError("Provided mode '{}' is not supported".format(mode))
        self.tree_hierarchy = tree_hierarchy # pandas dataframe with tree structure coming from hierarchical clustering
        self.n_nodes = 0 # updated by calling self.count_nodes()
        self.n_leafs = 0 # updated by calling self.count_leafs()
        self.temp_n_leafs = 1 # In mode 'FBE' helps to construct the tree with the right number of nodes
        self.clusters_predict = clusters_predict # predicted cluster for each document
        self.unique_cluster_predict = list(set(clusters_predict)) # list of all classes to calculate performance metrices
        self.leaf_nodes = [] # list of all leaf nodes
        self.sentences_all_classes = sentences_all_classes # List of all classes occuring in sentences (phrases.parquet)
        self.true_classes_documents = true_classes_all.values.tolist() # list of true labels (mesh codes) in the abstracts
        self.true_classes_documents_unique = list(set(true_classes_all)) # all possible occuring true labels (mesh codes) in the abstracts
        self.precision_all_nodes = [] # macro
        self.precision_all_nodes_weighted = []
        self.precision_macro = None 
        self.precision_micro = None
        self.recall_all_classes = []
        self.recall_all_classes_weighted = []
        self.recall_macro = None
        self.recall_micro = None
        self.F1_macro = None
        self.F1_micro = None
        
        
    def _build_tree(self, node, current_depth=None):
        if self.mode == "sklearn":
            if node.node_id in self.tree_hierarchy["node_id"].values: # if node not leaf
                treeChildren = self.tree_hierarchy[self.tree_hierarchy["node_id"] == node.node_id]
                node.add_child(Node(Id=treeChildren["left"].values[0], depth=node.depth + 1, parent=node))
                node.add_child(Node(Id=treeChildren["right"].values[0], depth=node.depth + 1, parent=node))
                self._build_tree(node.children[0])
                self._build_tree(node.children[1])
            else:
                return node
            return node
            #print(node)
#            #print("\tcurrent_depth: {}; temp_nleafs: {}".format(current_depth, self.temp_n_leafs))
#            if node.depth == current_depth and self.temp_n_leafs < MAX_LEAFS:
#                if node.node_id in self.tree_hierarchy["node_id"].values: # if node not leaf
#                    treeChildren = self.tree_hierarchy[self.tree_hierarchy["node_id"] == node.node_id]
#                    self.temp_n_leafs -= 1 # lose one leaf because it is split into two new leafs
#                    node.add_child(Node(Id=treeChildren["left"].values[0], depth=node.depth + 1, parent=node))
#                    self.temp_n_leafs += 1 # creates new child
#                    node.add_child(Node(Id=treeChildren["right"].values[0], depth=node.depth + 1, parent=node))
#                    self.temp_n_leafs += 1 # creates new child
#            else:
#                if len(node.children) == 2 and self.temp_n_leafs < MAX_LEAFS:
#                    self._build_tree(node.children[0], current_depth)
#                    self._build_tree(node.children[1], current_depth)
#            return node
        elif self.mode == "FBE": 
            # Only create node if node is in current depth level
            if node.depth == current_depth and self.temp_n_leafs < MAX_LEAFS:
                treeChildren = self.tree_hierarchy.iloc[node.node_id].children
                # FBE tree is not a perfect binary tree, some nodes don't create children any more
                if len(treeChildren) > 0:
                    cluster_child_one = self.tree_hierarchy.iloc[treeChildren[0]].filterValue[0]
                    cluster_child_two = self.tree_hierarchy.iloc[treeChildren[1]].filterValue[0]
                    # Some nodes from nodes.json are empty: no sentences is going through them
                    # Only create node in tree when there is a sentence running through it
                    if cluster_child_one in self.sentences_all_classes:
                        self.temp_n_leafs -= 1 # lose one leaf because it is split into two new leafs
                        node.add_child(Node(Id=treeChildren[0], depth=node.depth + 1, parent=node, cluster_label=cluster_child_one))
                        self.temp_n_leafs += 1
                        if cluster_child_two in sentences_all_classes:
                            node.add_child(Node(Id=treeChildren[1], depth=node.depth + 1, parent=node, cluster_label=cluster_child_two))
                            self.temp_n_leafs += 1
                    elif cluster_child_two in sentences_all_classes:
                        self.temp_n_leafs -= 1 # lose one leaf because it is split into two new leafs
                        node.add_child(Node(Id=treeChildren[1], depth=node.depth + 1, parent=node, cluster_label=cluster_child_two))
                        self.temp_n_leafs += 1                    
            else:
                if len(node.children) == 1 and self.temp_n_leafs < MAX_LEAFS:
                    self._build_tree(node.children[0], current_depth)
                elif len(node.children) == 2 and self.temp_n_leafs < MAX_LEAFS:
                    self._build_tree(node.children[0], current_depth)
                    self._build_tree(node.children[1], current_depth)
            return node

    def _update_leaf_to_root(self, node, abstract_id, class_predict):
        """ Updates node and all its ancestors up to the root with the abstract's id and the predicted class"""
        node.update_node(abstract_id, class_predict)
        if node.parent != None: # Root has no parent
            self._update_leaf_to_root(node.parent, abstract_id, class_predict)
    
#    def set_build_tree(self,node, maxDepth=None):
#        """ Builds the tree and sets the variable tree."""
#        
#        if self.mode == "sklearn":
#            tree = self._build_tree(node, maxDepth)       
#        if self.mode == "FBE":
#            self.temp_n_nodes = 1
#            depth = 0
#            # build tree by level: create first all children for level 1, then level 2... 
#            # Prevents that a tree creates children just in one branch and always goes deeper in case of a max number of leavese
#            while depth < maxDepth and self.temp_n_leafs < MAX_LEAFS:
#                tree = self._build_tree(node, depth)
#                depth += 1
#        
#        assert isinstance(tree, Node)
#        self.tree = tree

    def set_build_tree(self,node):
        """ Builds the tree and sets the variable tree."""  

        # tree with MAX_LEAFS leafs is constructed. 
        # For sklearn add to each leaf its cluster label based on the children in the tree object from sklearn AgglomerativeClustering
        self.leaf_nodes = []
        if self.mode == "sklearn":
            tree = self._build_tree(node) # construct whole tree
            tree = self._get_cluster_labels_for_leafs(tree) # get labels for leafs
            tree = self._cut_nodes_from_leafs(tree) # cut nodes from bottom of the tree until only leafs with a unique cluster_label exist (Number leaves = MAX_LEAFS)
        elif self.mode == "FBE":
            self.temp_n_nodes = 1
            depth = 0
            # build tree by level: create first all children for level 1, then level 2... 
            # Prevents that a tree creates children just in one branch and always goes deeper in case of a max number of leavese
            while self.temp_n_leafs < MAX_LEAFS:
                #print("\ndepth: {}, n_nodes: {}".format(depth, self.temp_n_nodes))
                tree = self._build_tree(node, depth)
                depth += 1

        assert isinstance(tree, Node)
        self.tree = tree
        print("Count nodes: {}; leafs: {}".format(self.count_nodes(), self.count_leafs()))


    def _get_cluster_labels_for_leafs(self, node):
        """ 
            Get's the cluster labels for each leafs using the cluster labels assigned by
            the output of the sklearn agglomerative clustering algorithm.
        """        
        if len(node.children) == 0: #leaf
            cluster_label = self.clusters_predict[node.node_id]
            node.set_clusterLabel(cluster_label)
        else: # no leaf
            self._get_cluster_labels_for_leafs(node.children[0])
            self._get_cluster_labels_for_leafs(node.children[1])
        return node
    
    def _cut_nodes_from_leafs(self, node):
        """ 
            Children of nodes, who are leafs and have the same cluster_label, are cut off
            and the parent node takes the cluster label of its children.
            This is done recursively until there are only leafs with unique cluster_labels 
            Number of leaves = MAX_LEAFS
        
        """
        if len(node.children) > 0: 
            left_child = node.children[0]
            right_child = node.children[1]
            if left_child.cluster_label is None: # left child is not leaf 
                self._cut_nodes_from_leafs(left_child)
            if right_child.cluster_label is None: # right child is not leaf 
                self._cut_nodes_from_leafs(right_child)

            # should be updated now
            left_child = node.children[0]
            right_child = node.children[1]
            if left_child.cluster_label == right_child.cluster_label and left_child.cluster_label is not None:
                node.children = []
                node.cluster_label = left_child.cluster_label
                return node

        return node    

        
    def fitTree(self, node, data):
        """ Updates all the nodes of the tree according to the clustering from bottom to top """

        assert isinstance(node, Node)
        if len(node.children) > 0: # no leaf
            for child in node.children:
                self.fitTree(child, data)
        else: # leaf
            if self.mode == "sklearn": 
                leaf_cluster_label = node.cluster_label
                abstract_hits = data[data["class_predict"] == leaf_cluster_label]
                for i, row in abstract_hits.iterrows():
                    leaf_abstract_id = row.name
                    leaf_abstract_class_true = row.mesh_ui_diab # true class 
                    self._update_leaf_to_root(node, leaf_abstract_id, leaf_abstract_class_true)
            elif self.mode == "FBE": # several documents per leaf
                leaf_cluster_label = node.cluster_label
                abstract_hits = data[data["uniqueCluster"] == leaf_cluster_label]
                for i, row in abstract_hits.iterrows():
                    leaf_abstract_id = row["id"]
                    leaf_abstract_class_true = row["mesh_ui_diab"]
                    self._update_leaf_to_root(node, leaf_abstract_id, leaf_abstract_class_true)
            else: 
                print("ERROR: mode should be one of ['sklearn', 'FBE']")
        return node
        
    def _walk_count_nodes(self, node):
        self.n_nodes += 1
        
        for child in node.children:
            self._walk_count_nodes(child)            
            
    def count_nodes(self, tree=None):
        self.n_nodes = 0
        if tree == None:
            self._walk_count_nodes(self.tree)
        else:
            self._walk_count_nodes(tree)
        return self.n_nodes
    
    def _walk_count_leafs(self, node):
        if node.children == []:
            self.n_leafs += 1
            self.leaf_nodes.append(node)
        else:
            for child in node.children:
                self._walk_count_leafs(child)
                
    def count_leafs(self, tree=None):
        self.n_leafs = 0
        self.leaf_nodes = []
        if tree == None:
            self._walk_count_leafs(self.tree)
        else:
            self._walk_count_leafs(tree)
        return self.n_leafs
    
    def _walk_leaf_nodes(self, node):
        if node.children == []:
            self.leaf_nodes.append(node)
        else:
            for child in node.children:
                self._walk_leaf_nodes(child)
    
    def get_leaf_nodes(self):
        self.leaf_nodes = []
        self._walk_leaf_nodes(self.tree)
        return self.leaf_nodes
    
    def _walk_precision(self, node):
        node_precision = node.get_precision()
        self.precision_all_nodes.append(node_precision)
        self.precision_all_nodes_weighted.append(node_precision / node.counts)
        for child in node.children:
            self._walk_precision(child)
            
    def get_precision(self):
        self.precision_all_nodes = []
        self._walk_precision(self.tree)
        self.precision_macro = np.mean(self.precision_all_nodes)
        self.precision_micro = np.mean(self.precision_all_nodes_weighted)
        return {"prec_macro" : self.precision_macro
                , "prec_micro" : self.precision_micro}

        
    def get_recall(self):
        
        def _walk_recall(node, c):
            """ Get cluster with max documents of class c """
            occ = node.count_class_occurrence(c)
            if occ > self.temp_recall :
                self.temp_recall = occ
                #print("\ttemp_recall: {}".format(self.temp_recall))
            for child in node.children:
                _walk_recall(child, c)
        
        for c in self.true_classes_documents_unique:
            N_c = self.true_classes_documents.count(c)
            #print("c: {}, N_c : {}".format(c, N_c))
            self.temp_recall = 0.0 
#            _walk_recall(self.tree, c)
            # TODO: check if it is right!
            # # start with children; otherwise recalls for all classes will be highest in root
            _walk_recall(self.tree.children[0], c) 
            _walk_recall(self.tree.children[1], c)
            #print("Max count: {}; recall: {}".format(self.temp_recall, self.temp_recall / self.true_classes_documents.count(c)))
            recall = self.temp_recall / N_c
            self.recall_all_classes.append(recall) #len(self.unique_cluster_predict))
            self.recall_all_classes_weighted.append(recall / N_c)
        self.recall_macro = np.mean(self.recall_all_classes)
        self.recall_micro = np.mean(self.recall_all_classes_weighted)
        return {"recall_macro" : self.recall_macro
                ,"recall_micro" : self.recall_micro}
    
    def get_F1(self):
        precision = self.get_precision()
        recall = self.get_recall()        
        
        self.F1_macro = 2*precision["prec_macro"]*recall["recall_macro"] / (precision["prec_macro"] + recall["recall_macro"])
        self.F1_micro = 2*precision["prec_micro"]*recall["recall_micro"] / (precision["prec_micro"] + recall["recall_micro"])
        return {"F1_macro":self.F1_macro
               ,"F1_micro":self.F1_micro}
    
    def get_performances(self):
        precision = self.get_precision()
        recall = self.get_recall()
        F1 = self.get_F1()
        return({
            "prec_micro" : precision["prec_micro"]
            ,"prec_macro" : precision["prec_macro"]            
            ,"recall_micro" : recall["recall_micro"]
            ,"recall_macro" : recall["recall_macro"]
            ,"F1_micro" : F1["F1_micro"]
            ,"F1_macro" : F1["F1_macro"]        })
 


class Node(object):
    "Generic tree node."
    def __init__(self, Id, depth, parent=None, cluster_label=None, children=[]):
        self.node_id = Id
        self.parent = parent
        self.children = []
        self.depth = depth
        self.cluster_label = cluster_label # In case FBE: this is the filterValue in the leafs
        self.abstracts = [] # PMID's of abstracts 
        self.true_classes = [] # True classes for each abstract
        self.counts = 0
        self.recall = None
        self.precision = None 
        self.F1 = None
        if children is not None:
            for child in children:
                self.add_child(child)
                
    def __repr__(self):
        return "Node id: {} (depth: {}, cluster_label: {})".format(self.node_id, self.depth, self.cluster_label)
    
    def add_child(self, node):
        assert isinstance(node, Node)
        self.children.append(node)
        
    def set_clusterLabel(self, clusterLabel):
        self.cluster_label = clusterLabel
        
    def pretty_print(self, depth=0):
        
        if self.depth == depth: 
            print("Node: {}, Parent: {} (Depth: {}, counts: {}, cluster_label: {}) | Children: {}".format(self.node_id, self.parent, self.depth, self.counts, self.cluster_label, self.children))
            print("\tAbstracts: {}".format(Counter(self.abstracts)))
            print("\ttrue_classes: {}".format(Counter(self.true_classes)))
        else:
            for child in self.children:
                child.pretty_print(depth)
            
            
    def update_node(self, abstract_id, true_class):
        """ Updates the abstracts and its true class label running through this node """
        self.abstracts.append(abstract_id)
        self.true_classes.append(true_class)
        self.counts += 1
        
        
    def get_precision(self):
        count = Counter(self.true_classes)
        mostFrequent = max(self.true_classes, key=count.get)
        prec = self.true_classes.count(mostFrequent) / self.counts
        return prec

    def count_class_occurrence(self, c):
        return self.true_classes.count(c)
    


In [2]:
        
data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs/diabetes_abstracts_HC_output.parquet")
data.index = data.index.get_level_values(None)
data.index.name = "PMID"
data = data.reset_index()
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_HC_output_10Examples.parquet")
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_HC_output_30Examples.parquet")

HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs/diabetes_abstracts_tree_output.parquet')
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_tree_output_10Examples.parquet')
#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_tree_output_30Examples.parquet')

## TEST TREE
#data = pd.DataFrame({"PMID": [0, 1, 2, 3, 4, 5]
#                    , "class_predict": [3, 0, 0, 0, 1, 2]}
#                   , columns=["PMID", "class_predict"]).set_index("PMID")

#HC_tree = pd.DataFrame({"node_id":[6, 7, 8, 9, 10]
#                    , "left" : [1, 2, 0, 5, 8]
#                    , "right" :[3, 6, 4, 7, 9]}
#                   , columns=["node_id", "left", "right"])

print("Tree nodes: {}".format(HC_tree.shape))
print(HC_tree.head())
print("data size: {}".format(data.shape))
print(list(set(data["class_predict"])))
print(list(set(data["mesh_ui_diab"])))
data.head(2)



Tree nodes: (55910, 3)
   node_id   left  right
0    55911  24005  54126
1    55912   8019  35641
2    55913  22173  29395
3    55914  17366  43982
4    55915  16539  23169
data size: (55911, 9)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,

Unnamed: 0,PMID,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,class_predict
0,22783714,Evaluation of oxidative stress among coronary ...,OBJECTIVES\nDetermination of the superoxide di...,2011-12,"D000368,D002097,D002318,D003124,D003925,D00491...","Aged,C-Reactive Protein,Cardiovascular Disease...",D003925,Diabetic Angiopathies,33
1,6933815,Ocular complications to diabetes and their tre...,Ocular complications to diabetes may be descri...,1980,"D002386,D002387,D003930,D006801,D053685,D008028","Cataract,Cataract Extraction,Diabetic Retinopa...",D003930,Diabetic Retinopathy,3


In [25]:
# initialise
#MAX_LEAFS=6
treeClass = Tree(HC_tree, data["class_predict"], mode="sklearn", true_classes_all=data["mesh_ui_diab"])

# define root node
root = Node(Id=HC_tree["node_id"].max() # In scikit learn, the root node is the one with maximum node id
          , depth=0
          , parent=None
          , children=[])

# build tree
treeClass.set_build_tree(root)

print("N nodes: {}".format(treeClass.count_nodes()))
print("N leafs: {}".format(treeClass.count_leafs()))

treeClass.leaf_nodes

Count nodes: 511; leafs: 256
N nodes: 511
N leafs: 256


[Node id: 110259 (depth: 1, cluster_label: 79),
 Node id: 16630 (depth: 2, cluster_label: 191),
 Node id: 3609 (depth: 3, cluster_label: 255),
 Node id: 16139 (depth: 6, cluster_label: 193),
 Node id: 55746 (depth: 6, cluster_label: 95),
 Node id: 27624 (depth: 6, cluster_label: 173),
 Node id: 42333 (depth: 6, cluster_label: 195),
 Node id: 24043 (depth: 5, cluster_label: 225),
 Node id: 33961 (depth: 7, cluster_label: 131),
 Node id: 52127 (depth: 7, cluster_label: 109),
 Node id: 760 (depth: 8, cluster_label: 133),
 Node id: 41929 (depth: 8, cluster_label: 169),
 Node id: 15721 (depth: 9, cluster_label: 151),
 Node id: 111137 (depth: 9, cluster_label: 71),
 Node id: 110388 (depth: 11, cluster_label: 83),
 Node id: 17634 (depth: 13, cluster_label: 223),
 Node id: 110990 (depth: 13, cluster_label: 163),
 Node id: 111536 (depth: 13, cluster_label: 62),
 Node id: 22561 (depth: 14, cluster_label: 227),
 Node id: 53499 (depth: 15, cluster_label: 224),
 Node id: 109752 (depth: 15, cluster_

In [27]:
# fit tree with abstracts 
tree_fit = treeClass.fitTree(treeClass.tree, data)

In [30]:
#treeClass.tree.pretty_print(depth=8)

In [31]:
#print(treeClass.get_precision())
#print(treeClass.get_recall())
#print(treeClass.get_F1())
print(treeClass.get_performances())

{'prec_micro': 0.25291798273811567, 'prec_macro': 0.5603293541866284, 'recall_micro': 0.006298470385543221, 'recall_macro': 0.9999511409627141, 'F1_micro': 0.012290858894573452, 'F1_macro': 0.7182067311303414}


# load FeedbackExplorer output

In [3]:
import pyspark
fbe_path = "/home/adrian/tmp/Test_FBE"

spark = pyspark.sql.SparkSession.builder.getOrCreate()
sentences = spark.read.load(fbe_path+"/phrases/")
print(len(sentences.columns))
print(sentences.count())

df_short = sentences.select("id", "tokens", "index")
#df_short.printSchema()
df_short.show(2)

2051
55911
+--------+--------------------+--------------------+
|      id|              tokens|               index|
+--------+--------------------+--------------------+
|28800712|[outcomes,  , ach...|[828 -> [218 -> 2...|
| 6989594|[investigation,  ...|[104 -> [146 -> 1...|
+--------+--------------------+--------------------+
only showing top 2 rows



In [4]:
nodes = pd.read_json(fbe_path+"/nodes.json", orient="records")
print(nodes.shape)
nodes.head(3)

(1024, 21)


Unnamed: 0,name,tagId,color,annotations,algo,strLinks,strClassPath,names,filterMode,filterValue,...,windowSize,classCenters,cError,childSplitSize,children,hits,metrics,rocCurve,externalClassesFreq,purity
0,In Scope,0.0,,"[{'tokens': ['aggregate'], 'tag': 1, 'from': N...",{'value': 'supervised'},{'0': [1]},{'1': [0]},{},{'value': 'allIn'},[0],...,0.0,,,,[1],55911,{},{},{},{}
1,Explorer,1.0,,"[{'tokens': ['diabetes'], 'tag': 2, 'from': No...",{'value': 'clustering'},"{'1': [2, 3]}","{'2': [0, 1], '3': [0, 1]}",{},{'value': 'anyIn'},[1],...,,"{'2': 0, '3': 1}","[0.0, 0.0]",50.0,"[2, 841]",55911,{},{},{},{}
2,Explorer,,,"[{'tokens': ['diabetes'], 'tag': 4, 'from': No...",{'value': 'clustering'},"{'1': [4, 5]}","{'4': [0, 1, 2], '5': [0, 1, 2]}",{},{'value': 'anyIn'},[2],...,,"{'4': 0, '5': 1}","[0.0, 0.0]",50.0,"[3, 656]",43230,{},{},{},{}


In [6]:
# Get list with all possible classes in the sentences file

from pyspark.sql.functions import udf, col
from pyspark.sql.types import *

join_udf = udf(lambda x: ";".join(x))
sentences_classes_udf = udf(lambda x: ";".join([str(v) for v in x.keys()]))

sentences_transformed = sentences.select("id"
                                        , "tokens"
                                        , sentences_classes_udf('index').alias("all_classes")) \
                                .withColumn("tokens", join_udf(col("tokens"))) 
                    
sentences_pdf = sentences_transformed.toPandas()
sentences_pdf["id"] = pd.to_numeric(sentences_pdf["id"])

# list of all classes in the sentences file
sentences_all_classes = set(pd.to_numeric(sentences_pdf["all_classes"].map(lambda sentence: sentence.split(";")).explode()).values)
print("Number of classes in sentences file: {}".format(len(sentences_all_classes)))

Number of classes in sentences file: 1712


In [7]:
# add true class labels to sentences from data by merge/join 
sentences_pdf["PMID"] = sentences_pdf["id"]
sentences_pdf["PMID"] = pd.to_numeric(sentences_pdf["PMID"])
meshDiab = data[["PMID", "mesh_ui_diab"]]
meshDiab["PMID"] = pd.to_numeric(meshDiab["PMID"])
sentences_pd_with_classes = pd.merge(sentences_pdf, meshDiab, on='PMID', how="left")

print("sentences_pdf: {}".format(sentences_pdf.shape))
print("meshDiab: {}".format(meshDiab.shape))
print("merged: {}".format(sentences_pd_with_classes.shape))

sentences_pd_with_classes.head()

sentences_pdf: (55911, 4)
meshDiab: (55911, 2)
merged: (55911, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,id,tokens,all_classes,PMID,mesh_ui_diab
0,28800712,outcomes; ;achieved; ;with; ;use; ;of; ;a; ;pr...,0;1;3;740;276;7;72;58;828;31,28800712,D017719
1,6989594,investigation; ;of; ;insulin; ;sensitivity; ;i...,64;0;1;2;1602;228;5;104;1641;10;240;50;20;244;...,6989594,D011236
2,524360,ultrastructural; ;pathology; ;of; ;peripheral;...,64;0;1;673;2;228;5;358;104;10;592;241;50;20;24...,524360,D003929
3,21199315,evidence;-;based; ;interventional; ;pain; ;med...,0;1;3;740;276;7;72;58;828;31,21199315,D003929
4,24607755,delivery; ;timing; ;and; ;cesarean; ;delivery;...,1232;0;1;194;2;5;453;21;200;10;538;27,24607755,D016640


In [17]:
# initialise
treeFBE = Tree(nodes
            #, list(set(data["class_predict"]))
            , mode="FBE"
            , sentences_all_classes=sentences_all_classes
            , true_classes_all=sentences_pd_with_classes["mesh_ui_diab"])

# define root node
root = Node(Id=1, depth=0, parent=None, children=[]) # Id = 1 because start at Explorer 

# build tree
#maxDepth = 10
treeFBE.set_build_tree(root)

print("Number leafs: {}".format(treeFBE.count_leafs()))

Count nodes: 528; leafs: 256
Number leafs: 256


In [9]:
leafs = treeFBE.get_leaf_nodes()
print("N leafs: {}".format(len(leafs)))
cluster = [leaf.cluster_label for leaf in leafs]
print("N clusters: {}".format(len(set(cluster))))
for c in cluster:
    if c not in sentences_all_classes:
        print(c)

N leafs: 256
N clusters: 256


In [19]:
# Associate cluster to each sentence

from pyspark.sql.functions import udf, col
from pyspark.sql.types import *

def matchCluster(index_map): 
    """ gets for each abstract its unique cluster (filterValue) from the index"""
    return list(set(list(index_map.keys())).intersection(set(cluster)))[0]


leafs = treeFBE.get_leaf_nodes()
print("N leafs: {}".format(len(leafs)))
cluster = [leaf.cluster_label for leaf in leafs]
print("N clusters: {}".format(len(set(cluster))))

matchCluster_udf = udf(lambda y: matchCluster(y))
join_udf = udf(lambda x: ";".join(x))

sentences_transformed = sentences.select("id", "tokens", matchCluster_udf('index').alias("uniqueCluster")) \
                    .withColumn("tokens", join_udf(col("tokens"))) 
                    
#sentences.select('index', matchClass_udf('index').atlias("uniqueCluster")).groupby("uniqueCluster").count().show()
sentences_pd = sentences_transformed.toPandas()
sentences_pd["id"] = pd.to_numeric(sentences_pd["id"])
sentences_pd["uniqueCluster"] = pd.to_numeric(sentences_pd["uniqueCluster"])
print("Unique clusters in sentences: {}".format(sentences_pd["uniqueCluster"].nunique())) #####


N leafs: 256
N clusters: 256
Unique clusters in sentences: 256


In [20]:
# add true class labels to data by merge/join 
sentences_pd["PMID"] = sentences_pd["id"]
sentences_pd["PMID"] = pd.to_numeric(sentences_pd["PMID"])
meshDiab = data[["PMID", "mesh_ui_diab"]]
meshDiab["PMID"] = pd.to_numeric(meshDiab["PMID"])
sentences_pd_with_classes_uniqueCluster = pd.merge(sentences_pd, meshDiab, on='PMID', how="left")
print("sentences_pd: {}".format(sentences_pd.shape))
print("meshDiab: {}".format(meshDiab.shape))
print("sentences_pd_with_classes_uniqueCluster: {}".format(sentences_pd_with_classes.shape))

sentences_pd_with_classes_uniqueCluster.head()

sentences_pd: (55911, 4)
meshDiab: (55911, 2)
sentences_pd_with_classes_uniqueCluster: (55911, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,id,tokens,uniqueCluster,PMID,mesh_ui_diab
0,28800712,outcomes; ;achieved; ;with; ;use; ;of; ;a; ;pr...,740,28800712,D017719
1,6989594,investigation; ;of; ;insulin; ;sensitivity; ;i...,228,6989594,D011236
2,524360,ultrastructural; ;pathology; ;of; ;peripheral;...,228,524360,D003929
3,21199315,evidence;-;based; ;interventional; ;pain; ;med...,740,21199315,D003929
4,24607755,delivery; ;timing; ;and; ;cesarean; ;delivery;...,538,24607755,D016640


In [23]:
treeFBE.fitTree(treeFBE.tree, sentences_pd_with_classes_uniqueCluster)

Node id: 1 (depth: 0, cluster_label: None)

In [24]:
#print(treeFBE.get_precision_macro())
#print(treeFBE.get_recall_macro())
#print(treeFBE.get_F1())
print(treeFBE.get_performances())

{'prec_micro': 0.07665872863135663, 'prec_macro': 0.494841955165603, 'recall_micro': 0.005639166876476844, 'recall_macro': 0.7947587020818961, 'F1_micro': 0.010505526553826057, 'F1_macro': 0.6099251699553135}


In [32]:
print("Performance scikit learn: ")
print(treeClass.get_performances())

Performance scikit learn: 
{'prec_micro': 0.2529179827381156, 'prec_macro': 0.5603293541866284, 'recall_micro': 0.006298470385543218, 'recall_macro': 0.9999511409627143, 'F1_micro': 0.012290858894573452, 'F1_macro': 0.7182067311303414}


In [114]:
# TESST

def wwwalk(node):
    print(node.class_label)
    if node.class_label == 496:
        node.pretty_print()
    else:
        for child in node.children:
            wwwalk(child)

print("Leaf nodes with no sentences:")
leafs = treeFBE.get_leaf_nodes()
for leaf in leafs:
    if len(leaf.abstracts) < 1:
        print("id: {}; children: {}; class: {}; depth: {}; counts: {}; parent: {} ".format(leaf.node_id, leaf.children, leaf.class_label, leaf.depth, leaf.counts, leaf.parent))
        print('abstracts: {}; true_classes: {}'.format(leaf.abstracts, leaf.true_classes))
        print()

print(sentences_pd[sentences_pd["uniqueCluster"] == 910]) #497
print(nodes.iloc[91])


Leaf nodes with no sentences:
id: 91; children: []; class: 910; depth: 10; counts: 0; parent: 90 
abstracts: []; predicted_classes: []

id: 96; children: []; class: 911; depth: 10; counts: 0; parent: 90 
abstracts: []; predicted_classes: []

id: 104; children: []; class: 626; depth: 10; counts: 0; parent: 103 
abstracts: []; predicted_classes: []

id: 513; children: []; class: 471; depth: 8; counts: 0; parent: 507 
abstracts: []; predicted_classes: []

id: 517; children: []; class: 877; depth: 8; counts: 0; parent: 515 
abstracts: []; predicted_classes: []

id: 520; children: []; class: 1372; depth: 9; counts: 0; parent: 519 
abstracts: []; predicted_classes: []

id: 551; children: []; class: 1734; depth: 9; counts: 0; parent: 550 
abstracts: []; predicted_classes: []

id: 552; children: []; class: 1735; depth: 9; counts: 0; parent: 550 
abstracts: []; predicted_classes: []

id: 554; children: []; class: 794; depth: 8; counts: 0; parent: 553 
abstracts: []; predicted_classes: []

id: 5

In [100]:
#treeFBE.tree.pretty_print(1)
#root.pretty_print(9)

In [106]:
sentences.select('index', matchClass_udf('index').alias("uniqueCluster")).groupby("uniqueCluster").count().show()


+-----------+-----+
|uniqueClass|count|
+-----------+-----+
|       1159|   21|
|       1090|  234|
|        296|   51|
|        691|   33|
|        125|    3|
|        666|  256|
|       1280|  334|
|        124| 1199|
|        718|  312|
|        740| 1173|
|        169|   41|
|        747|   46|
|       1425|   19|
|        577|    5|
|        272|   25|
|         54|  968|
|        282|    7|
|        232|    1|
|        483|   27|
|       1158|    5|
+-----------+-----+
only showing top 20 rows



In [None]:
def _update_leaf_to_root(node, abstract_id, class_predict):
    node.update_node(abstract_id, class_predict)
    if node.parent != None:
        _update_leaf_to_root(node.parent, abstract_id, class_predict)

def fitTree(node, data):
    """ Updates all the nodes of the tree according to the clustering """

    assert isinstance(node, Node)
    if len(node.children) > 0: # no leaf
        for child in node.children:
            fitTree(child, data)
    else:
        leaf_class = node.class_label
        abstract_hits = data[data["uniqueClass"] == leaf_class]
#        print("leaf_class: {}".format(leaf_class))
#        print(abstract_hits.shape)
        for i, row in abstract_hits.iterrows():
            leaf_abstract_id = row["id"]
            leaf_abstract_class_predict = row["uniqueClass"]
            _update_leaf_to_root(node, leaf_abstract_id, leaf_abstract_class_predict)

    return node

#fitted_tree = fitTree(treeFBE.tree, sentences_pd)

In [54]:
df_short.select(udf((index:Map[Int, Map[Int, Int]]) => {index.keys.toSet.intersect(classes.toSet).toSeq} ).apply(col("index"))#.as("uniqueClass")).groupBy("uniqueClass").count.show()

SyntaxError: invalid syntax (<ipython-input-54-100f47e3da18>, line 1)

Unnamed: 0,id,tokens,uniqueClass
