# Background

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pycrfsuite
import numpy as np

%run FeatureExtraction.ipynb

In [2]:
class BackgroundClassifier:
    """Linear-chain CRF tagger that labels each paragraph as Background or not.

    Each document is a sequence of paragraphs with one annotation dict per
    paragraph; the annotation's ``'type'`` entry drives both the filtering
    and the labels. Paragraphs annotated as "Introduction" are dropped on
    construction, the remaining ones are labelled "1" (Background) or
    "0" (anything else).

    Typical usage: construct, call ``train_test_split``, then ``train``,
    then ``test``.

    NOTE: ``docs`` and ``annots`` are stored by reference and filtered in
    place, so the lists passed to the constructor are modified.
    """

    # Single location of the trained model: train() writes it, test() reads it.
    MODEL_PATH = "./Models/background.model"

    def __init__(self, docs, annots):
        """Store the corpus and drop every "Introduction" paragraph.

        Args:
            docs: mutable sequence of documents, each a sequence of paragraphs.
            annots: mutable sequence parallel to ``docs``; each item is a
                sequence of dicts with a ``'type'`` key, one per paragraph.
        """
        self.docs = docs
        self.annot = annots
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.FV = None

        self.filtering()

        print("Start BACKGROUND")

    def filtering(self):
        """Remove paragraphs annotated as "Introduction" from every document.

        ``self.docs[i]`` and ``self.annot[i]`` are replaced by new lists that
        keep only the surviving paragraph/annotation pairs, in order.
        """
        for i, (doc, annot) in enumerate(zip(self.docs, self.annot)):
            kept = [(p, a) for p, a in zip(doc, annot)
                    if a['type'] != "Introduction"]

            self.docs[i] = [p for p, _ in kept]
            self.annot[i] = [a for _, a in kept]

    def train_test_split(self, rs, p):
        """Build features and labels per document and split them.

        Args:
            rs: value forwarded as ``random_state`` to scikit-learn's
                ``train_test_split``.
            p: value forwarded as ``train_size``.

        The split is done at document level (one feature sequence and one
        label sequence per document) and stored in ``X_train``, ``X_test``,
        ``y_train`` and ``y_test``.
        """
        X = [self.compute_feature_vector(i, doc)
             for i, doc in enumerate(self.docs)]
        y = [self.get_label(i, doc) for i, doc in enumerate(self.docs)]
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, train_size=p, random_state=rs)

    def compute_feature_vector(self, i, doc):
        """Return the feature sequence of document ``i``.

        Delegates to ``FeatureExtraction`` (not defined in this class;
        presumably provided by the FeatureExtraction notebook run earlier).
        """
        fv = FeatureExtraction(doc, i, self.annot[i])
        return fv.get_feature_vector()

    def get_label(self, i, doc):
        """Return the label sequence of document ``i``.

        One label per paragraph/annotation pair: "1" when the annotation's
        type is "Background", "0" otherwise.
        """
        # zip() keeps the output aligned with the paragraphs even if the
        # annotation list is longer than the document (or vice versa).
        return ["1" if a['type'] == "Background" else "0"
                for _, a in zip(doc, self.annot[i])]

    def train(self):
        """Train the CRF on the training split and save it to MODEL_PATH.

        Raises:
            RuntimeError: if ``train_test_split`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError(
                "train_test_split() must be called before train()")

        import os

        # The trainer writes the model file itself; make sure the target
        # directory exists before it tries to.
        model_dir = os.path.dirname(self.MODEL_PATH)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

        trainer = pycrfsuite.Trainer(verbose=False)

        # Submit training data to the trainer
        for xseq, yseq in zip(self.X_train, self.y_train):
            trainer.append(xseq, yseq)

        # Set the parameters of the model
        trainer.set_params({
            # coefficient for L1 penalty
            "c1": 0.1,

            # coefficient for L2 penalty
            "c2": 0.01,

            # maximum number of iterations
            "max_iterations": 200,

            # whether to include transitions that
            # are possible, but not observed
            "feature.possible_transitions": True
        })

        # The model is saved to this file when training is finished
        trainer.train(self.MODEL_PATH)

    def test(self):
        """Tag the test split with the saved model and score the result.

        Returns:
            The dict produced by ``classification_report(...,
            output_dict=True)``, with the classes named "Others" (label 0)
            and "Background" (label 1).

        Raises:
            RuntimeError: if ``train_test_split`` has not been called yet.
        """
        if self.X_test is None or self.y_test is None:
            raise RuntimeError(
                "train_test_split() must be called before test()")

        tagger = pycrfsuite.Tagger()
        tagger.open(self.MODEL_PATH)
        y_pred = [tagger.tag(xseq) for xseq in self.X_test]

        # Mapping of string labels to integer indices
        labels = {"1": 1, "0": 0}

        # Flatten the per-document tag sequences into 1-dimensional arrays
        predictions = np.array([labels[tag] for row in y_pred for tag in row])
        truths = np.array([labels[tag] for row in self.y_test for tag in row])

        # labels=[0, 1] pins both classes explicitly: without it the report
        # raises ValueError whenever only one class occurs in the test split,
        # because target_names would no longer match the classes found.
        return classification_report(
            truths, predictions,
            labels=[0, 1],
            target_names=["Others", "Background"],
            output_dict=True
        )