In [103]:
import re
from collections import Counter

import numpy as np
import pandas as pd


class Tree(object):
    """Binary cluster tree rebuilt from a hierarchical-clustering merge table.

    The tree is made of ``Node`` objects. ``set_build_tree`` creates the
    structure, ``fitTree`` routes every leaf's abstract and predicted class up
    to the root, and the ``get_*`` methods compute macro metrics over the
    fitted nodes.

    All traversals are iterative, so the depth of the dendrogram is not bounded
    by Python's recursion limit.

    Parameters
    ----------
    tree_hierarchy_HC : pandas.DataFrame
        One row per internal node, with the columns ``node_id``, ``left`` and
        ``right`` (ids of the two children).
    n_classes : sequence
        The class labels themselves (not their number): the sequence is
        iterated and ``len()`` is taken in ``get_recall_macro``.
    """

    def __init__(self, tree_hierarchy_HC, n_classes):
        self.tree = None  # root Node, set by set_build_tree
        self.tree_hierarchy_HC = tree_hierarchy_HC  # pandas dataframe with tree structure coming from hierarchical clustering
        self.n_nodes = 0
        self.n_classes = n_classes
        self.precision_all_nodes = []  # per-node precision, filled by get_precision_macro
        # `precision` is the attribute read by get_F1/get_performances; it has to
        # exist before the first metric call or those methods raise AttributeError.
        self.precision = None
        self.precision_macro = None  # kept in sync with `precision`
        self.recall_all_classes = []
        self.recall_macro = None
        self.F1 = None
        self.temp_recall = 0.0  # scratch maximum used by _walk_recall

    @staticmethod
    def _iter_nodes(node):
        """Yield ``node`` and all its descendants in pre-order (children left to right)."""
        pending = [node]
        while pending:
            current = pending.pop()
            yield current
            # reversed so that the first child is popped (visited) first
            pending.extend(reversed(current.children))

    def _build_tree(self, node):
        """Attach to ``node`` the whole subtree described by the merge table.

        A node whose id does not appear in the ``node_id`` column is a leaf.
        Returns ``node`` itself.
        """
        hierarchy = self.tree_hierarchy_HC
        # One pass over the table instead of one DataFrame filter per node.
        # setdefault keeps the first row for a node_id, like `.values[0]` did.
        children_of = {}
        for node_id, left, right in zip(hierarchy["node_id"].values,
                                        hierarchy["left"].values,
                                        hierarchy["right"].values):
            children_of.setdefault(node_id, (left, right))

        pending = [node]
        while pending:
            current = pending.pop()
            pair = children_of.get(current.node_id)
            if pair is None:  # leaf
                continue
            for child_id in pair:  # left first, then right
                child = Node(Id=child_id, depth=current.depth + 1, parent=current)
                current.add_child(child)
                pending.append(child)
        return node

    def _update_leaf_to_root(self, node, name, class_predict):
        """Record (name, class_predict) on ``node`` and on each of its ancestors."""
        while node is not None:
            node.update_node(name, class_predict)
            node = node.parent

    def set_build_tree(self, node):
        """ Builds the tree and sets the variable tree."""
        tree = self._build_tree(node)
        assert isinstance(tree, Node)
        self.tree = tree

    def fitTree(self, node, data):
        """ Updates all the nodes of the tree according to the clustering

        For every leaf below ``node``, the row ``data.iloc[leaf.node_id]`` is
        looked up and its index label and ``class_predict`` value are appended
        to the leaf and to all its ancestors.

        Not idempotent: nodes only ever append, so fitting the same tree twice
        records every abstract twice.

        Returns ``node``.
        """
        assert isinstance(node, Node)
        for current in self._iter_nodes(node):
            assert isinstance(current, Node)
            if current.children:  # internal node, filled through its leaves
                continue
            row = data.iloc[current.node_id]
            self._update_leaf_to_root(current, row.name, row.class_predict)
        return node

    def _walk_count_nodes(self, node):
        """Add the size of the subtree rooted at ``node`` to ``self.n_nodes``."""
        self.n_nodes += sum(1 for _ in self._iter_nodes(node))

    def count_nodes(self):
        """Return the number of nodes in the tree (also stored in ``n_nodes``)."""
        self.n_nodes = 0
        self._walk_count_nodes(self.tree)
        return self.n_nodes

    def _walk_precision(self, node):
        """Append the precision of every node under ``node`` to ``precision_all_nodes``."""
        self.precision_all_nodes.extend(
            current.get_precision() for current in self._iter_nodes(node))

    def get_precision_macro(self):
        """Return the mean of ``Node.get_precision()`` over all nodes of the tree."""
        self.precision_all_nodes = []
        self._walk_precision(self.tree)
        self.precision = np.mean(self.precision_all_nodes)
        self.precision_macro = self.precision
        return self.precision

    def _walk_recall(self, node, c):
        """Raise ``self.temp_recall`` to the largest count of class ``c`` in any node under ``node``."""
        for current in self._iter_nodes(node):
            occ = current.count_class_occurrence(c)
            if occ > self.temp_recall:
                self.temp_recall = occ

    def get_recall_macro(self):
        """Return the mean over classes of (max per-node count of the class / number of classes).

        NOTE(review): after ``fitTree`` the root holds every prediction, so the
        maximum is the class's total count, and it is divided by the number of
        classes rather than by a per-class total - confirm this is the intended
        recall definition.
        """
        # Start from a clean list so repeated calls do not accumulate entries.
        self.recall_all_classes = []
        for c in self.n_classes:
            self.temp_recall = 0.0
            self._walk_recall(self.tree, c)
            self.recall_all_classes.append(self.temp_recall / len(self.n_classes))
        self.recall_macro = np.mean(self.recall_all_classes)
        return self.recall_macro

    def get_F1(self):
        """Return the harmonic mean of macro precision and macro recall.

        Missing inputs are computed on demand; a zero denominator yields 0.0.
        """
        if self.precision is None:
            self.get_precision_macro()
        if self.recall_macro is None:
            self.get_recall_macro()
        denominator = self.precision + self.recall_macro
        if denominator == 0:
            self.F1 = 0.0
        else:
            self.F1 = 2*self.precision*self.recall_macro / denominator
        return self.F1

    def get_performances(self):
        """Return ``{"prec", "recall", "F1"}``, computing any metric not cached yet."""
        return({
            "prec" : self.precision if self.precision is not None else self.get_precision_macro()
            ,"recall" : self.recall_macro if self.recall_macro is not None else self.get_recall_macro()
            ,"F1" : self.F1 if self.F1 is not None else self.get_F1()
        })
    
class Node(object):
    """Generic tree node.

    Holds the node id, its parent/children links and depth, plus the abstracts
    and predicted classes that were routed through it via ``update_node``.
    """
    def __init__(self, Id, depth, parent=None, children=None):
        self.node_id = Id
        self.parent = parent
        self.children = []
        self.depth = depth
        self.abstracts = [] # PMID's of abstracts 
        self.predicted_classes = []
        self.counts = 0  # number of update_node calls == len(self.predicted_classes)
        self.recall = None
        self.precision = None 
        self.F1 = None
        if children is not None:
            for child in children:
                self.add_child(child)

    def __repr__(self):
        return str(self.node_id)#{"node_id": self.node_id}
    
    def add_child(self, node):
        """Append ``node`` (must be a ``Node``) to this node's children."""
        assert isinstance(node, Node)
        self.children.append(node)
        
    def pretty_print(self, depth=0):
        """Print every node of this subtree whose ``depth`` attribute equals ``depth``.

        Descent stops at a matching node; its own subtree is not searched.
        """
        if self.depth == depth: 
            print("Node: {}, Parent: {} (Depth: {}, counts: {}) | Children: {}".format(self.node_id, self.parent, self.depth, self.counts, self.children))
            print("\tAbstracts: {}".format(self.abstracts))
            print("\tpredicted_classes: {}".format(self.predicted_classes))
        else:
            for child in self.children:
                child.pretty_print(depth)
            
    def update_node(self, abstract_PMID, predicted_class):
        """Record one abstract and its predicted class on this node."""
        self.abstracts.append(abstract_PMID)
        self.predicted_classes.append(predicted_class)
        self.counts += 1
        
    def get_precision(self):
        """Return the share of this node's predictions that belong to its most frequent class.

        Raises
        ------
        ValueError
            If no prediction has been recorded on the node yet.
        """
        if not self.predicted_classes:
            # Same exception type max() raised on an empty list, with a usable message.
            raise ValueError(
                "node {} has no predicted classes; fit the tree before computing precision".format(self.node_id))
        # Single counting pass instead of calling list.count once per element.
        majority_count = max(Counter(self.predicted_classes).values())
        return majority_count / self.counts

    def count_class_occurrence(self, c):
        """Return how many recorded predictions equal class ``c``."""
        return self.predicted_classes.count(c)
    
        
# Inputs produced by the hierarchical-clustering step (10-example sample).
#   data    : one row per abstract, with a `class_predict` column
#   HC_tree : one row per internal node, columns node_id / left / right
# Full-size run: swap in the commented-out reads.
#data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs/diabetes_abstracts_HC_output.parquet")
data = pd.read_parquet("/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_HC_output_10Examples.parquet")
#data.index = data.index.get_level_values(None)
#data.index.name = "PMID"

#HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/outputs/diabetes_abstracts_tree_output.parquet')
HC_tree = pd.read_parquet('/home/adrian/workspace/Hierarchical-Clustering-Active-Learning-Text/diabetes_abstracts_tree_output_10Examples.parquet')

# Hand-made toy tree for debugging (6 leaves, 5 merges); uncomment to replace the reads above.
#data = pd.DataFrame({"PMID": [0, 1, 2, 3, 4, 5]
#                    , "class_predict": [3, 0, 0, 0, 1, 2]}
#                   , columns=["PMID", "class_predict"]).set_index("PMID")

#HC_tree = pd.DataFrame({"node_id":[6, 7, 8, 9, 10]
#                    , "left" : [1, 2, 0, 5, 8]
#                    , "right" :[3, 6, 4, 7, 9]}
#                   , columns=["node_id", "left", "right"])

# Quick sanity summary: table shapes, first merges, and the set of predicted classes.
print(
    "Tree nodes: {}".format(HC_tree.shape),
    HC_tree.head(),
    "data size: {}".format(data.shape),
    list(set(data["class_predict"])),
    sep="\n",
)
data.head(2)



Tree nodes: (9, 3)
   node_id  left  right
0       10     1      3
1       11     2      9
2       12     0      6
3       13    10     11
4       14     7     12
data size: (10, 9)
[0, 1, 2, 3]


Unnamed: 0_level_0,title,abstract,date,mesh_ui,mesh_mh,mesh_ui_diab,mesh_mh_diab,title_abstract_prep,class_predict
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
29913182,The genetic architecture of type 1 diabetes me...,Type 1 diabetes mellitus (T1D) is a complex au...,2018-12-05,"D015551,D003922,D020022,D014644,D055106,D00668...","Autoimmunity,Diabetes Mellitus, Type 1,Genetic...",D003922,"Diabetes Mellitus, Type 1",the genetic architecture of type 1 diabetes me...,0
29913317,Evaluation of knowledge regarding gestational ...,OBJECTIVE\nThe aim of this study was to evalua...,2018-08,"D000293,D000328,D000368,D000369,D001459,D00343...","Adolescent,Adult,Aged,Aged, 80 and over,Bangla...",D016640,"Diabetes, Gestational",evaluation of knowledge regarding gestational ...,1


In [104]:
# The root is the merge with the largest node_id; it starts at depth 0 with no parent.
root = Node(HC_tree["node_id"].max(), 0)

# The tree receives the merge table and the list of distinct predicted class labels.
treeClass = Tree(tree_hierarchy_HC=HC_tree, n_classes=list(set(data["class_predict"])))

treeClass.set_build_tree(root)
treeClass.tree.pretty_print(depth=0)

Node: 18, Parent: None (Depth: 0, counts: 0) | Children: [16, 17]
	Abstracts: []
	predicted_classes: []


In [105]:
# Total number of nodes (internal nodes and leaves) reachable from the root.
treeClass.count_nodes()


19

In [106]:
# Fit the tree with the abstracts: each leaf's PMID and predicted class is
# appended to the leaf and to every ancestor up to the root.
# NOTE: not idempotent - nodes only append, so re-running this cell without
# rebuilding the tree records every abstract a second time.
treeClass.fitTree(treeClass.tree, data)

18

In [107]:
# Show the nodes at depth 1 with the abstracts and predicted classes routed through them.
treeClass.tree.pretty_print(depth=1)

Node: 16, Parent: 18 (Depth: 1, counts: 5) | Children: [5, 13]
	Abstracts: ['29914103', '29913317', '29914063', '29913486', '29914404']
	predicted_classes: [3, 1, 1, 1, 1]
Node: 17, Parent: 18 (Depth: 1, counts: 5) | Children: [4, 15]
	Abstracts: ['29914066', '29914375', '29914345', '29913182', '29914234']
	predicted_classes: [2, 0, 0, 0, 0]


In [109]:
# Macro precision, macro recall, F1, then the summary dict - one line each.
for metric in (
    treeClass.get_precision_macro,
    treeClass.get_recall_macro,
    treeClass.get_F1,
    treeClass.get_performances,
):
    print(metric())

0.9473684210526315
0.625
0.7531380753138075
{'prec': 0.9473684210526315, 'recall': 0.625, 'F1': 0.7531380753138075}


In [None]:
# Requires `re`, imported with the notebook's other imports.
# Punctuation deleted by the cleaners. The hyphen is escaped on purpose: written as
# ':-\[' it is parsed as the range ':'..'[', which also deletes '<', '=', '>', '@'
# and every upper-case ASCII letter.
_BIOCLEAN_PUNCT = re.compile(r'[.,?;*!%^&_+():\-\[\]{}]')


def bioclean(t):
    """Lower-case ``t``, drop quotes, slashes, backslashes and punctuation, and split on whitespace."""
    stripped = t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').strip().lower()
    return _BIOCLEAN_PUNCT.sub('', stripped).split()


def biocleanSoft(t):
    """Drop punctuation from ``t`` (case is preserved) and split on whitespace."""
    return _BIOCLEAN_PUNCT.sub('', t).split()


def fbe(t):
    """Split ``t`` on whitespace.

    NOTE(review): the substitution replaces zero-width word boundaries with
    '', which leaves the text unchanged; presumably a ' ' replacement was
    intended - confirm before relying on this tokenizer.
    """
    return re.sub('(?!^)\\b', '', t).split()

# For the first ten abstracts, print the raw text followed by the output of each
# tokenizer, every item separated by a blank line; "New:" closes each abstract.
for sent in data.abstract[0:10]:
    for tokenizer in (None, bioclean, biocleanSoft, fbe):
        # None stands for "show the untouched abstract"
        print(sent if tokenizer is None else tokenizer(sent))
        print()
    print("New:")

In [37]:
# Scratch check: most frequent element of a list via max(..., key=list.count).
qq = [3,2,2,1,0,8]
max(qq,key=qq.count)  # most frequent value; not displayed, only the cell's last expression is
qq.count(2)  # number of occurrences of the value 2

2

In [18]:
# Display the scratch list `qq`.
qq