In [2]:
from pprint import pprint
from operator import itemgetter

%run FeatureExtraction.ipynb   

In [3]:
class functionalPartAnalyzer:
    """Label paragraphs of one document with a cascade of binary CRF taggers.

    The cascade is meant to be run in the order ``introduction()`` ->
    ``background()`` -> ``footnotes()``.  Each stage tags the feature vectors
    that the previous stages left unlabeled; every vector tagged ``"1"`` is
    recorded in ``classification`` and the others are kept in ``ffv`` for the
    next stage.

    Attributes:
        doc: The document passed to the constructor (not read by this class).
        annot: The annotations passed to the constructor (not read by this class).
        fv: Sequence of feature vectors, one per paragraph.  Element ``[1]`` of
            each vector must be a ``"<name>=<int>"`` string; the integer is
            reported as the paragraph index.
        index: Identifier of the document, copied into every result as ``"doc"``.
        ffv: Feature vectors not yet labeled by any stage run so far.
        classification: List of ``{"doc", "index", "label"}`` dicts.
    """

    def __init__(self, doc, fv, idx, annot):
        self.doc = doc
        self.annot = annot
        self.fv = fv
        self.index = idx
        self.ffv = []
        self.classification = []

    @staticmethod
    def _paragraph_index(fv):
        """Return the integer encoded after ``=`` in the second feature of ``fv``."""
        return int(fv[1].split("=")[1])

    def _tag_and_filter(self, model_path, label, features):
        """Tag ``features`` with the model at ``model_path`` and split the result.

        Vectors tagged ``"1"`` are appended to ``self.classification`` under
        ``label``; all other vectors are returned, in their original order.
        """
        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)
        try:
            y_pred = tagger.tag(features)
        finally:
            # Release the loaded model even when tagging fails.
            tagger.close()

        remaining = []
        for y, fv in zip(y_pred, features):
            if y == "1":
                self.classification.append({
                    "doc": self.index,
                    "index": self._paragraph_index(fv),
                    "label": label
                })
            else:
                remaining.append(fv)
        return remaining

    def introduction(self):
        """First stage: label "Introduction" paragraphs out of the full ``fv``.

        This stage restarts the cascade, so it clears both ``ffv`` and
        ``classification``; running the cascade again on the same instance
        therefore does not duplicate earlier labels.
        """
        self.ffv = []
        self.classification = []
        self.ffv = self._tag_and_filter(
            "./Models/intro.model", "Introduction", self.fv
        )

    def background(self):
        """Second stage: label "Background" paragraphs among the vectors in ``ffv``."""
        self.ffv = self._tag_and_filter(
            "./Models/background.model", "Background", self.ffv
        )

    def footnotes(self):
        """Third stage: label "Footnotes" paragraphs among the vectors in ``ffv``."""
        self.ffv = self._tag_and_filter(
            "./Models/footnotes.model", "Footnotes", self.ffv
        )

    def getClassification(self):
        """Return the list of labels collected so far (the internal list itself)."""
        return self.classification

    def getFilteredFeatureVector(self):
        """Return the feature vectors that no stage run so far has labeled."""
        return self.ffv

In [4]:
class conclusionRecognizer:
    """Decide whether a single paragraph is "Analysis" or "Conclusions".

    Attributes:
        paragraph: The paragraph passed to the constructor (not read by this class).
        iParagraph: Paragraph index; converted with ``int()`` for the result.
        index: Identifier of the document, copied into the result as ``"doc"``.
        fv: Feature sequence handed to the CRF tagger.
        classification: Last result of ``recognizer()``, or ``None`` before it runs.
    """

    def __init__(self, p, ip, fv, idx):
        self.paragraph = p
        self.iParagraph = ip
        self.index = idx
        self.fv = fv
        self.classification = None

    def recognizer(self):
        """Tag ``fv`` with the conclusion model and label the paragraph by vote.

        Each predicted tag is converted to an integer and summed.  The
        paragraph is "Analysis" when that sum is strictly greater than half
        the number of tags; otherwise (including a tie or an empty
        prediction) it is "Conclusions".

        Returns:
            The ``{"doc", "index", "label"}`` dict, also stored in
            ``self.classification``.
        """
        crf = pycrfsuite.Tagger()
        crf.open("./Models/conclusion.model")
        tags = crf.tag(self.fv)

        votes = 0
        for tag in tags:
            votes += int(tag)
        majority_is_analysis = votes > (len(tags) / 2)

        verdict = "Analysis" if majority_is_analysis else "Conclusions"
        self.classification = {
            "doc": self.index,
            "index": int(self.iParagraph),
            "label": verdict
        }
        return self.classification

In [5]:
class Classification:
    """Classify every paragraph of a set of documents and score the result.

    Attributes:
        docs: Sequence of documents; each document is indexable by paragraph
            position.
        annot: Per-document annotations, parallel to ``docs``.  Each entry is
            an iterable of dicts with a ``"type"`` key, read by
            ``get_results()`` as the true label.
        idxs: Per-document identifiers, parallel to ``docs``.
        res: Per-document prediction lists, filled by ``set()``.
    """

    def __init__(self, docs, annots, indexes):
        self.docs = docs
        self.annot = annots
        self.idxs = indexes
        self.res = []

    def run(self, i, idx):
        """Classify the paragraphs of document ``i``.

        Args:
            i: Position of the document in ``docs`` / ``annot``.
            idx: Identifier reported as ``"doc"`` in every result.

        Returns:
            A new list of ``{"doc", "index", "label"}`` dicts sorted by
            ``"index"``.
        """
        # Extract feature vector
        fx = FeatureExtraction(self.docs[i], idx, self.annot[i])
        fv = fx.get_feature_vector()

        # Functional Part Analyzer
        fpa = functionalPartAnalyzer(self.docs[i], fv, idx, self.annot[i])
        fpa.introduction()
        fpa.background()
        fpa.footnotes()

        # Copy so that appending below does not mutate the analyzer's own list.
        res = list(fpa.getClassification())

        # Conclusion Recognizer
        # Paragraphs the analyzer left unlabeled; the paragraph index is the
        # value after "=" in the second feature of each remaining vector.
        pending = [
            features[1].split("=")[1]
            for features in fpa.getFilteredFeatureVector()
        ]

        # 1 Analysis / 0 Conclusions
        for x in pending:
            pfv = fx.get_feature_vector_for_sentence(x)
            cr = conclusionRecognizer(self.docs[i][int(x)], x, pfv, idx)
            res.append(cr.recognizer())

        return sorted(res, key=itemgetter("index"))

    def set(self):
        """Run the classification for every document and store it in ``res``.

        ``res`` is rebuilt from scratch on every call, so calling ``set()``
        again (for example by re-running a notebook cell) does not append
        duplicate predictions that would misalign ``get_results()``.
        """
        self.res = [self.run(i, self.idxs[i]) for i, _ in enumerate(self.docs)]

    def get_results(self):
        """Compare the stored predictions with the annotated labels.

        True labels are the ``"type"`` values of ``annot`` in iteration
        order; predicted labels are the ``"label"`` values of ``res``, with
        each document's predictions ordered by ``"index"``.

        Returns:
            Whatever ``classification_report`` returns when called with
            ``output_dict=True`` for the five known labels.
        """
        y_true = np.array([
            paragraph["type"]
            for doc_annots in self.annot
            for paragraph in doc_annots
        ])

        y_pred = np.array([
            prediction["label"]
            for doc_res in self.res
            for prediction in sorted(doc_res, key=itemgetter("index"))
        ])

        return classification_report(
            y_true, y_pred,
            labels=["Introduction", "Background", "Analysis", "Conclusions", "Footnotes"],
            output_dict=True
        )