In [34]:
from db.dbclient import MongoClient
import numpy as np
import spacy
from spacy.util import minibatch, compounding
from functools import partial
from sklearn.model_selection import train_test_split

class TextModel:
    """Per-field spaCy text classifiers for the ``is_review`` label.

    One pipeline is trained for each field in ``FIELDS``; ``predict`` averages
    the ``is_review`` score the pipelines assign to a document.
    """

    # Article fields that each get their own classifier in create_model().
    FIELDS = ('title', 'content', 'meta_description')

    def __init__(self, model=None):
        """
        Args:
            model: optional spaCy model name/path used by ``train`` whenever it
                is called without an explicit ``model``.
        """
        self.model = model
        self.nlps = None      # dict field -> trained pipeline, set by build()
        self.textcat = None   # textcat pipe of the most recently trained pipeline

    @staticmethod
    def _fix(text):
        """Return a single space for ``None`` or ``""``, otherwise ``text`` unchanged."""
        if text is None or text == "":
            return " "
        return text

    def load_docs(self, field, test_size=.25, random_state=None):
        """Fetch labelled articles and split them into train and test sets.

        Args:
            field: document key whose value is used as the text.
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``; pass an int for a
                reproducible split.

        Returns:
            ``(train_texts, test_texts, train_labels, test_labels)`` where each
            label is a dict ``{"is_review": 0 or 1}``.
        """
        query = {"is_review": {"$exists": True}}
        # Presumably a pymongo-style collection (project-local wrapper) - confirm.
        collection = MongoClient('articles').collection
        results = list(collection.find(query))

        # Missing, None or empty fields are normalised exactly as predict() does,
        # so training and inference see the same kind of input.
        texts = [self._fix(doc.get(field)) for doc in results]
        labels = [{"is_review": int(doc['is_review'])} for doc in results]
        return train_test_split(texts, labels, test_size=test_size,
                                random_state=random_state)

    def train(self, model=None, field='content', test_size=.25, n_iter=12, n_texts=2000,
              random_state=None):
        """Train a text classifier on one article field.

        Args:
            model: spaCy model name/path to start from; falls back to the
                ``model`` given to the constructor, then to ``'en'``.
            field: document key used as the training text.
            test_size: share of labelled documents held out for evaluation.
            n_iter: number of passes over the training data.
            n_texts: accepted for backward compatibility; currently unused.
            random_state: seed forwarded to the train/test split.

        Returns:
            The trained spaCy pipeline.
        """
        if model is None:
            model = self.model

        if model is not None:
            nlp = spacy.load(model)  # load existing spaCy model
            print("Loaded model '%s'" % model)
        else:
            # NOTE(review): spacy.load('en') loads the installed 'en' pipeline rather
            # than creating a blank one; the message is kept as-is for compatibility.
            nlp = spacy.load('en')
            print("Created blank 'en' model")

        if 'textcat' not in nlp.pipe_names:
            self.textcat = nlp.create_pipe('textcat')
            nlp.add_pipe(self.textcat, last=True)
        # otherwise, get it, so we can add labels to it
        else:
            self.textcat = nlp.get_pipe('textcat')

        # add label to text classifier
        self.textcat.add_label('is_review')

        train_texts, test_texts, train_labels, test_labels = self.load_docs(
            field, test_size=test_size, random_state=random_state)

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
        train_data = list(zip(train_texts,
                              [{'cats': label} for label in train_labels]))

        with nlp.disable_pipes(*other_pipes):  # only train textcat
            optimizer = nlp.begin_training()
            print("Training the model...")
            print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
            for _ in range(n_iter):
                try:
                    losses = {}
                    # batch up the examples using spaCy's minibatch
                    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
                    for batch in batches:
                        texts, annotations = zip(*batch)
                        nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                                   losses=losses)
                    with self.textcat.model.use_params(optimizer.averages):
                        # evaluate on the held-out split returned by load_docs()
                        scores = self.evaluate(nlp.tokenizer, self.textcat, test_texts, test_labels)
                    print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                          .format(losses['textcat'], scores['textcat_p'],
                                  scores['textcat_r'], scores['textcat_f']))
                except KeyboardInterrupt:
                    # Interrupting stops training early but still returns the pipeline.
                    break

        return nlp

    def evaluate(self, tokenizer, textcat, texts, cats):
        """Compute precision, recall and F-score at a 0.5 threshold.

        Args:
            tokenizer: callable turning a text into a doc.
            textcat: object whose ``pipe(docs)`` yields docs carrying ``.cats``.
            texts: texts to score.
            cats: gold label dicts, aligned with ``texts`` by position.

        Returns:
            dict with keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.
        """
        docs = (tokenizer(text) for text in texts)
        # Counts start at a tiny epsilon so the ratios below never divide by zero.
        tp = 1e-8  # True positives
        fp = 1e-8  # False positives
        fn = 1e-8  # False negatives
        tn = 1e-8  # True negatives
        for i, doc in enumerate(textcat.pipe(docs)):
            gold = cats[i]
            for label, score in doc.cats.items():
                if label not in gold:
                    continue
                if score >= 0.5 and gold[label] >= 0.5:
                    tp += 1.
                elif score >= 0.5 and gold[label] < 0.5:
                    fp += 1.
                elif score < 0.5 and gold[label] < 0.5:
                    tn += 1
                elif score < 0.5 and gold[label] >= 0.5:
                    fn += 1
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f_score = 2 * (precision * recall) / (precision + recall)
        return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

    def create_model(self):
        """Train one pipeline per field in ``FIELDS`` and return them keyed by field."""
        return {field: self.train(field=field) for field in self.FIELDS}

    def build(self):
        """Train all per-field pipelines and store them on ``self.nlps``."""
        self.nlps = self.create_model()

    def predict(self, document):
        """Return the mean ``is_review`` score over all per-field pipelines.

        Args:
            document: mapping with (some of) the trained field names as keys;
                missing, ``None`` or empty values are scored as ``" "``.

        Raises:
            ValueError: if ``build`` has not been called yet.
        """
        if self.nlps is None:
            raise ValueError("call .build first")
        scores = [nlp(self._fix(document.get(key, " "))).cats['is_review']
                  for key, nlp in self.nlps.items()]
        return np.mean(scores)
# Instantiate the model and train it; build() trains one pipeline per field
# ('title', 'content', 'meta_description') and stores them on m.nlps.
m = TextModel()
m.build()

Created blank 'en' model
Training the model...
LOSS 	  P  	  R  	  F  
16.030	0.667	0.143	0.235
13.725	0.727	0.286	0.410
8.718	0.800	0.429	0.558
7.127	0.556	0.357	0.435
6.015	0.579	0.393	0.468
5.484	0.476	0.357	0.408
3.525	0.500	0.321	0.391
2.962	0.533	0.286	0.372
3.474	0.647	0.393	0.489
2.919	0.696	0.571	0.627
1.850	0.722	0.464	0.565
1.899	0.588	0.357	0.444
Created blank 'en' model
Training the model...
LOSS 	  P  	  R  	  F  
10.639	0.933	0.378	0.538
3.927	0.955	0.568	0.712
1.882	0.962	0.676	0.794
1.317	0.960	0.649	0.774
1.095	0.958	0.622	0.754
0.952	0.958	0.622	0.754
0.823	0.960	0.649	0.774
0.731	0.960	0.649	0.774
0.660	0.960	0.649	0.774
0.574	0.960	0.649	0.774
0.417	0.960	0.649	0.774
0.414	0.960	0.649	0.774
Created blank 'en' model
Training the model...
LOSS 	  P  	  R  	  F  
17.731	0.500	0.000	0.000
13.936	1.000	0.194	0.324
10.558	1.000	0.226	0.368
7.502	0.917	0.355	0.512
4.496	0.565	0.419	0.481
2.459	0.550	0.355	0.431
2.045	0.687	0.355	0.468
1.819	0.786	0.355	0.489
2.140	0.722	0

In [35]:
def predict(model, document):
    """Average the 'is_review' score of every per-field pipeline in ``model.nlps``.

    Args:
        model: object exposing ``nlps``, a dict mapping field name to a callable
            pipeline whose result carries a ``cats`` dict.
        document: mapping of field name to text; a missing, ``None`` or empty
            value is scored as a single space.

    Returns:
        The mean of the per-field scores.

    Raises:
        ValueError: if ``model.nlps`` is ``None``.
    """
    if model.nlps is None:
        raise ValueError("call .build first")

    def clean(value):
        # None and "" are both replaced by a single space before scoring.
        return " " if value is None or value == "" else value

    scores = []
    for field_name, pipeline in model.nlps.items():
        scored_doc = pipeline(clean(document.get(field_name, " ")))
        scores.append(scored_doc.cats['is_review'])
    return np.mean(scores)
# Bind the trained model so the resulting callable takes only a document,
# which is how Labeler invokes its predict_function.
my_predict = partial(predict, m)

In [37]:
from ipywidgets.widgets import Textarea, HBox, VBox, Button, Layout
from tqdm import tqdm_notebook
from IPython.display import display
from db.dbclient import MongoClient
class Labeler:
    """Notebook widget for hand-labelling articles that have no ``is_review`` field.

    On construction every unlabelled article is fetched and scored with
    ``predict_function``. Articles are then shown one at a time, the most
    uncertain prediction (closest to 0.5) first, with buttons to store a label
    or skip the article.
    """

    def __init__(self, predict_function):
        """
        Args:
            predict_function: callable taking an article dict and returning the
                predicted probability that it is a review.
        """
        # Presumably a pymongo-style collection (project-local wrapper) - confirm.
        self.client = MongoClient('articles').collection
        query = {"is_review": {"$exists": False}}
        self.article_iter = self.client.find(query)
        self.predict_function = predict_function
        self.docs = self.order_annotations()
        self.review = None
        self.__init_display__()
        self._next()

    @staticmethod
    def _as_text(value):
        """Coerce a field value to a string a Textarea accepts (None/"" become " ")."""
        if value is None or value == "":
            return " "
        return value if isinstance(value, str) else str(value)

    def label_docs(self):
        """Score every fetched article.

        Returns:
            ``(articles, predictions)`` as two lists aligned by position.
        """
        articles = list(self.article_iter)
        predictions = []
        for article in tqdm_notebook(articles, total=len(articles)):
            try:
                predictions.append(self.predict_function(article))
            except KeyError:
                # Show which keys the offending article has, then re-raise unchanged.
                print(article.keys())
                raise
        return articles, predictions

    def order_annotations(self):
        """Return an iterator of ``(article, prediction)``, most uncertain first.

        Uncertainty is the distance of the prediction from 0.5; ties keep the
        order in which the articles were fetched.
        """
        docs, preds = self.label_docs()
        distance_from_half = np.abs(np.asarray(preds, dtype=float) - .5)
        order = np.argsort(distance_from_half, kind='stable')
        return ((docs[i], preds[i]) for i in order)

    def _next(self):
        """Load the next article into the widgets, or finish when none remain."""
        try:
            self.review, prob = next(self.docs)
        except StopIteration:
            self._finish()
            return
        self.content.value = self._as_text(self.review.get('content'))
        self.title.value = self._as_text(self.review.get('title'))
        self.meta_description.value = self._as_text(self.review.get('meta_description'))
        self.prob.value = str(prob)

    def _finish(self):
        """Clear the form and disable the buttons once every article has been shown."""
        self.review = None
        for text_field in (self.title, self.meta_description, self.content):
            text_field.value = " "
        self.prob.value = "No more articles to label"
        for button in (self.yes, self.no, self.skip):
            button.disabled = True

    def __init_display__(self):
        """Create the text areas and buttons, wire the callbacks and display the widget."""
        self.prob = Textarea(description='Probability(review)', layout=Layout(width='20%', height='30px'))
        self.content = Textarea(description='content', layout=Layout(width='80%', height='200px'))
        self.title = Textarea(description='title', layout=Layout(width='80%', height='30px'))
        self.meta_description = Textarea(description='meta_description', layout=Layout(width='80%', height='80px'))
        self.yes = Button(description='Review')
        self.no = Button(description='Not Review')
        self.skip = Button(description='Skip')
        self.yes.on_click(self.review_true)
        self.no.on_click(self.review_false)
        self.skip.on_click(self.review_skip)
        self.button_box = HBox([self.yes, self.no, self.skip])
        self.text_box = VBox([self.prob, self.title, self.meta_description, self.content])
        self.widget = VBox([self.button_box, self.text_box])
        display(self.widget)

    def _submit_current(self):
        """Write the current article, including its new label, back to the collection."""
        self.client.find_one_and_replace({'_id': self.review['_id']}, self.review)

    def _label_current(self, is_review):
        """Store ``is_review`` on the current article and advance; no-op without one."""
        if self.review is None:
            return
        self.review['is_review'] = is_review
        self._submit_current()
        self._next()

    def review_true(self, b):
        """Button callback: label the current article as a review."""
        self._label_current(True)

    def review_false(self, b):
        """Button callback: label the current article as not a review."""
        self._label_current(False)

    def review_skip(self, b):
        """Button callback: move on without storing a label."""
        if self.review is None:
            return
        self._next()
        
# Constructing the Labeler scores every unlabelled article with my_predict,
# displays the widget and loads the first article.
l = Labeler(my_predict)
