# Analyzers

In [6]:
import pycrfsuite
from nltk.tokenize import word_tokenize
import string
import nltk
import re
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, unpacked into a set.
stop_words = {*stopwords.words('english')}
# Every ASCII punctuation character as its own one-character list entry.
punctuations = [*string.punctuation]

### Supervised Model

In [7]:
class functionalPartAnalyzer:
    """Label the paragraphs of one document with three CRF models in cascade.

    The stages are written to run in this order: ``introduction()``,
    ``background()``, ``footnotes()``. ``introduction()`` tags every paragraph
    in ``fv``; the later stages tag only what the previous stage left in
    ``ffv``. A paragraph tagged ``"1"`` is recorded in ``classification`` and
    removed from the candidates; every other paragraph is passed on.
    """

    def __init__(self, doc, fv, idx):
        # doc: the document itself; stored but not read by this class.
        # fv: one feature list per paragraph. Element 1 of each list must look
        #     like "<name>=<int>" because the paragraph index is parsed from it.
        # idx: document identifier; converted with int() when a label is stored.
        self.doc = doc
        self.fv = fv
        self.index = idx
        self.ffv = []             # paragraphs not labeled by the stages run so far
        self.classification = []  # accumulated {"doc", "index", "label"} records

    def _label_with_model(self, model_path, candidates, label):
        """Tag ``candidates`` with one model and split them by the result.

        Appends a record to ``self.classification`` for each candidate tagged
        ``"1"`` and returns the list of candidates that received any other tag.
        """
        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)
        try:
            y_pred = tagger.tag(candidates)
        finally:
            # Release the model as soon as the predictions are in hand; the
            # previous code left one open model behind per call.
            tagger.close()

        remaining = []
        for y, fv in zip(y_pred, candidates):
            if y == "1":
                self.classification.append({
                    "doc": int(self.index),
                    # fv[1] is "<name>=<int>"; keep only the integer part.
                    "index": int(fv[1].split("=")[1]),
                    "label": label
                })
            else:
                remaining.append(fv)
        return remaining

    def introduction(self):
        """First stage: label "Introduction" paragraphs among all of ``fv``."""
        self.ffv = []
        self.ffv = self._label_with_model(
            "../Supervised/Models/intro.model", self.fv, "Introduction"
        )

    def background(self):
        """Second stage: label "Background" paragraphs among ``ffv``."""
        self.ffv = self._label_with_model(
            "../Supervised/Models/background.model", self.ffv, "Background"
        )

    def footnotes(self):
        """Third stage: label "Footnotes" paragraphs among ``ffv``."""
        self.ffv = self._label_with_model(
            "../Supervised/Models/footnotes.model", self.ffv, "Footnotes"
        )

    def getClassification(self):
        """Return the list of label records collected so far."""
        return self.classification

    def getFilteredFeatureVector(self):
        """Return the feature lists of the paragraphs that are still unlabeled."""
        return self.ffv

In [8]:
class conclusionRecognizer:
    """Label one paragraph "Analysis" or "Conclusions" by majority vote.

    The CRF model tags every item of ``fv`` (presumably one feature list per
    sentence of the paragraph -- confirm against the caller); the paragraph
    label is decided by how many of those tags are ``1``.
    """

    def __init__(self, p, ip, fv, idx):
        # p: the paragraph itself; stored but not read by this class.
        # ip: paragraph index, converted with int() for the result record.
        # fv: the feature lists handed to the tagger.
        # idx: document identifier, converted with int() for the result record.
        self.paragraph = p
        self.iParagraph = ip
        self.index = idx
        self.fv = fv
        self.classification = None  # filled in by recognizer()

    def recognizer(self):
        """Tag ``fv``, store the resulting label record and return it.

        Returns a dict with the keys ``"doc"``, ``"index"`` and ``"label"``.
        The label is "Analysis" when strictly more than half of the predicted
        tags are 1, otherwise "Conclusions" (this includes ties and an empty
        prediction list).
        """
        tagger = pycrfsuite.Tagger()
        tagger.open("../Supervised/Models/conclusion.model")
        try:
            y_pred = tagger.tag(self.fv)
        finally:
            # Release the model once the predictions are available; it was
            # previously left open after every call.
            tagger.close()

        positives = sum(int(y) for y in y_pred)
        # NOTE(review): a majority of "1" tags from conclusion.model maps to
        # "Analysis" here. Confirm that this matches how the model's labels
        # were encoded during training; the mapping is kept as it was.
        label = "Analysis" if positives > (len(y_pred) / 2) else "Conclusions"

        self.classification = {
            "doc": int(self.index),
            "index": int(self.iParagraph),
            "label": label
        }
        return self.classification

### Feature Extraction

In [9]:
class FeatureExtraction:
    """Build CRF feature lists for the paragraphs and sentences of a document.

    ``doc`` is a list of paragraphs and each paragraph is a list of sentence
    strings. Sentences are split with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Features are ``"name=value"`` strings, plus the bare
    markers ``BOD``/``EOD`` (first/last paragraph) and ``BOS``/``EOS``
    (first/last sentence of a paragraph).

    NOTE(review): the "last token" features are now computed from the real
    tail of each sentence (see ``get_last_pos``). Models trained on feature
    lists produced before that fix saw different values for those features --
    confirm whether they need to be retrained.
    """

    def __init__(self, doc, i):
        # Shallow copy: the outer paragraph list is private to this instance,
        # the paragraphs themselves are still shared with the caller.
        self.doc = doc.copy()
        self.index = i

    def get_feature_vector(self):
        """Return one feature list per paragraph of the document.

        Each list holds, in this order: document id, paragraph position, token
        count, average sentence length, the first five and last five
        (token, tag) pairs of the paragraph, the previous paragraph's length
        (or ``BOD``), per-sentence features for the first two and last two
        tokens, and the next paragraph's length (or ``EOD``).

        A paragraph without sentences gets an average sentence length of 0.0
        and no token features instead of raising.
        """
        # Tokenize and tag every sentence exactly once; all features below are
        # slices of these (token, tag) lists.
        tagged_doc = [
            [nltk.pos_tag(word_tokenize(sentence)) for sentence in paragraph]
            for paragraph in self.doc
        ]
        lengths = [
            sum(len(sentence) for sentence in paragraph)
            for paragraph in tagged_doc
        ]
        last_paragraph = len(tagged_doc) - 1

        fvs = []
        for j, sentences in enumerate(tagged_doc):
            fv = [
                f"paragraph.doc={self.index}",
                f"paragraph.position={j}",
                f"paragraph.length={lengths[j]}",
                f"paragraph.average_sentence_length={self.avg_sentence_length(sentences)}"
            ]

            if sentences:
                # First (up to) five tokens of the paragraph, with their tags.
                for k, (word, tag) in enumerate(sentences[0][:5]):
                    fv.extend([
                        f"paragraph.first_word[+{k}]={word}",
                        f"paragraph.first_pos[+{k}]={tag}"
                    ])

                # Last (up to) five tokens of the paragraph, with their tags.
                for k, (word, tag) in enumerate(self._last_items(sentences[-1], 5)):
                    fv.extend([
                        f"paragraph.last_word[+{k}]={word}",
                        f"paragraph.last_pos[+{k}]={tag}"
                    ])

            if j > 0:
                fv.append(f"paragraph.prev_paragraph_length={lengths[j - 1]}")
            else:
                fv.append("BOD")

            # First two tokens of each sentence.
            for k, sentence in enumerate(sentences):
                for kk, (w, tag) in enumerate(sentence[:2]):
                    fv.extend([
                        f"paragraph.sentence[+{k}].word[+{kk}]={w}",
                        f"paragraph.sentence[+{k}].word_pos[+{kk}]={tag}",
                        f"paragraph.sentence[+{k}].word_lower[+{kk}]={w.lower()}",
                        f"paragraph.sentence[+{k}].isupper[+{kk}]={w.isupper()}",
                        f"paragraph.sentence[+{k}].istitle[+{kk}]={w.istitle()}",
                        f"paragraph.sentence[+{k}].isdigit[+{kk}]={w.isdigit()}"
                    ])

            # Last two tokens of each sentence. The loop variables must not
            # reuse the paragraph's name: the earlier version rebound it to a
            # (token, tag) pair and then iterated over that pair here.
            # The "word_post" key is kept exactly as it has always been emitted.
            for k, sentence in enumerate(sentences):
                for kk, (w, tag) in enumerate(self._last_items(sentence, 2)):
                    fv.extend([
                        f"paragraph.sentence[+{k}].word[-{kk}]={w}",
                        f"paragraph.sentence[+{k}].word_post[-{kk}]={tag}",
                        f"paragraph.sentence[+{k}].word_lower[-{kk}]={w.lower()}",
                        f"paragraph.sentence[+{k}].isupper[-{kk}]={w.isupper()}",
                        f"paragraph.sentence[+{k}].istitle[-{kk}]={w.istitle()}",
                        f"paragraph.sentence[+{k}].isdigit[-{kk}]={w.isdigit()}"
                    ])

            if j < last_paragraph:
                fv.append(f"paragraph.next_paragraph_length={lengths[j + 1]}")
            else:
                fv.append("EOD")

            fvs.append(fv)

        return fvs

    def get_feature_vector_for_sentences(self):
        """Return one feature list per sentence, for every paragraph in order.

        Used during the training of the Conclusion Recognizer.
        """
        fvp = []
        for j, paragraph in enumerate(self.doc):
            tokenized = [word_tokenize(sentence) for sentence in paragraph]
            for k, tokens in enumerate(tokenized):
                fvp.append(self._sentence_features(tokens, j, k, len(tokenized)))
        return fvp

    def get_feature_vector_for_sentence(self, idx):
        """Return one feature list per sentence of paragraph ``idx``.

        Used during the classification task. ``idx`` is converted with
        ``int()`` to select the paragraph, and written as given into the
        ``sentence.paragraph`` feature.
        """
        paragraph = self.doc[int(idx)]
        tokenized = [word_tokenize(sentence) for sentence in paragraph]
        return [
            self._sentence_features(tokens, idx, k, len(tokenized))
            for k, tokens in enumerate(tokenized)
        ]

    def _sentence_features(self, tokens, paragraph, position, n_sentences):
        """Build the feature list of one tokenized sentence.

        ``paragraph`` and ``position`` are written into the features as given;
        ``n_sentences`` is the sentence count of the enclosing paragraph and
        decides whether the ``EOS`` marker is added.
        """
        tagged = nltk.pos_tag(tokens)
        fv = [
            f"sentence.doc={self.index}",
            f"sentence.paragraph={paragraph}",
            f"sentence.position={position}",
            f"sentence.length={len(tokens)}"
        ]

        last_token = len(tagged) - 1
        for z, (word, tag) in enumerate(tagged):
            fv.extend([
                f"sentence.first_word[+{z}]={word}",
                f"sentence.first_pos[+{z}]={tag}",
                f"sentence.word[+{z}]={word.lower()}",
                f"sentence.word.isupper[+{z}]={word.isupper()}",
                f"sentence.word.istitle[+{z}]={word.istitle()}",
                f"sentence.word.isdigit[+{z}]={word.isdigit()}"
            ])

            # Context of the previous token, when there is one.
            if z > 0:
                prev_word, prev_tag = tagged[z - 1]
                fv.extend([
                    f"sentence.word.prev_pos[+{z}]={prev_tag}",
                    f"sentence.word.prev_isupper[+{z}]={prev_word.isupper()}",
                    f"sentence.word.prev_istitle[+{z}]={prev_word.istitle()}",
                    f"sentence.word.prev_isdigit[+{z}]={prev_word.isdigit()}"
                ])

            # Context of the next token, when there is one.
            if z < last_token:
                next_word, next_tag = tagged[z + 1]
                fv.extend([
                    f"sentence.word.next_pos[+{z}]={next_tag}",
                    f"sentence.word.next_isupper[+{z}]={next_word.isupper()}",
                    f"sentence.word.next_istitle[+{z}]={next_word.istitle()}",
                    f"sentence.word.next_isdigit[+{z}]={next_word.isdigit()}"
                ])

        if position <= 0:
            fv.append("BOS")
        if position >= n_sentences - 1:
            fv.append("EOS")

        return fv

    def paragraph_length(self, p):
        """Return the total number of tokens in paragraph ``p``.

        Sentences that are already lists are counted as they are; anything
        else is tokenized with ``word_tokenize`` first.
        """
        total = 0
        for s in p:
            if not isinstance(s, list):
                s = word_tokenize(s)
            total += len(s)
        return total

    def prev_paragraph_length(self, i):
        """Return the token count of paragraph ``i - 1``, or 0 for ``i <= 0``."""
        if i > 0:
            return self.paragraph_length(self.doc[i - 1])
        return 0

    def next_paragraph_length(self, i):
        """Return the token count of paragraph ``i + 1``, or 0 past the end."""
        if i < len(self.doc) - 1:
            return self.paragraph_length(self.doc[i + 1])
        return 0

    def get_first_pos(self, t, n):
        """POS-tag the token list ``t`` and return its first ``n`` pairs."""
        return nltk.pos_tag(t)[:n]

    def get_last_pos(self, t, n):
        """POS-tag the token list ``t`` and return its last ``n`` pairs.

        The pairs keep their sentence order. Fewer than ``n`` pairs are
        returned when ``t`` is shorter than ``n``.
        """
        return self._last_items(nltk.pos_tag(t), n)

    @staticmethod
    def _last_items(items, n):
        """Return the last ``n`` elements of ``items`` (all of them if fewer)."""
        # items[-0:] would be the whole list, so a non-positive n is handled
        # explicitly.
        if n <= 0:
            return []
        return items[-n:]

    def avg_sentence_length(self, p):
        """Return the mean tokens per sentence of ``p``, rounded to 2 decimals.

        An empty paragraph yields 0.0 instead of a division by zero.
        """
        if not p:
            return 0.0
        return round(self.paragraph_length(p) / len(p), 2)