In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
import sys

# Resolve the library directory to an absolute path exactly once per kernel.
# A relative sys.path entry is interpreted against the *current* working
# directory at import time, so after the chdir below it would no longer point
# at the library; and re-running this cell would otherwise attempt a second
# relative chdir starting from the already-changed directory.
if "ATP_SRC_DIR" not in globals():
    ATP_SRC_DIR = os.path.abspath("../libs/ATP-morphology/src/")
if ATP_SRC_DIR not in sys.path:
    sys.path.append(ATP_SRC_DIR)
os.chdir(ATP_SRC_DIR)

In [3]:
import math
from collections import defaultdict, namedtuple
from dataclasses import dataclass, field
from enum import Enum
from operator import itemgetter

import nltk

In [4]:
def trigrams(text):
    """Yield lowercase word trigrams from ``text``, one sentence at a time.

    The text is split into sentences first, so no trigram spans a sentence
    boundary. Within each sentence, tokens that are not purely alphabetic
    (per ``str.isalpha``) are dropped and the rest are lowercased before the
    trigrams are formed.
    """
    for sentence in nltk.tokenize.sent_tokenize(text):
        tokens = nltk.tokenize.word_tokenize(sentence)
        alphabetic = [tok.lower() for tok in tokens if tok.isalpha()]
        yield from nltk.ngrams(alphabetic, 3)

In [17]:
# Categories used to label words and frames. Every member's value is the same
# string as its name, and members keep the order listed here.
Label = Enum(
    "Label",
    [
        (tag, tag)
        for tag in (
            "ADJ",
            "ADV",
            "NOUN",
            "PRO",
            "DET",
            "CONJ",
            "PREP",
            "VERB",
            "VERB_PAST",
            "VERB_PROG",
            "NOUN_PLRL",
        )
    ],
)

In [32]:
# Seed vocabulary grouped by label. Dict insertion order and the word order
# inside each tuple determine the order of the flattened SEEDS list below.
_SEED_WORDS = {
    Label.PRO: ("you", "we", "me"),
    Label.VERB: ("come", "play", "put"),
    Label.PREP: ("on", "out", "in"),
    Label.DET: ("this", "these"),
    Label.NOUN: ("baby", "car", "train", "box", "house", "boy", "man", "book"),
    Label.ADJ: ("big", "silly", "green"),
    Label.ADV: ("well", "very", "now"),
    Label.CONJ: ("and", "or", "but"),
    Label.VERB_PAST: ("used", "looked", "called", "made"),
    Label.VERB_PROG: ("being", "going", "playing"),
    Label.NOUN_PLRL: ("days", "boys", "words"),
}

# Flat list of (word, label) pairs.
SEEDS = [
    (word, label)
    for label, words in _SEED_WORDS.items()
    for word in words
]

In [33]:
# A frame is the pair of words immediately to the left and right of a target.
Frame = namedtuple("Frame", "left right")


@dataclass
class Model:
    """Labeller that learns word labels and frame labels from each other.

    A frame is the ``(left, right)`` pair of words surrounding the middle
    (target) word of a trigram. During training a target word with a trusted
    label votes for that label on its frame, and a frame with a trusted label
    votes for that label on its target word. A label is trusted once its score
    is strictly greater than the relevant threshold.

    Attributes:
        frames: frame -> label -> score.
        lexicon: word -> label -> score. Every ``SEEDS`` entry is preloaded
            with an infinite score, so seed words are trusted from the start.
        fthresh: score a frame's best label must exceed to be trusted.
        wthresh: score a word's best label must exceed to be trusted.
    """

    frames: dict = field(default_factory=lambda: defaultdict(lambda: defaultdict(int)))   # frame -> label -> score
    lexicon: dict = field(default_factory=lambda: defaultdict(lambda: defaultdict(int)))  # word  -> label -> score
    # Annotated as float: scores change in fractional steps and may be infinite.
    fthresh: float = field(default=15)
    wthresh: float = field(default=15)

    def __post_init__(self):
        """Preload the lexicon with the seed words at an unbeatable score."""
        for word, lbl in SEEDS:
            self.lexicon[word][lbl] = math.inf

    def train(self, text):
        """Update frame and word scores from every trigram in ``text``.

        Both labels are looked up before either table is modified, so the two
        updates made for a single trigram do not influence each other.
        """
        for left, target, right in trigrams(text):
            frame = Frame(left, right)
            wlabel, flabel = self.wlabel(target), self.flabel(frame)
            # Compare against None instead of relying on truthiness so that a
            # falsy label value can never be mistaken for "no label".
            if wlabel is not None:
                # The target word is part of the trusted lexicon:
                # update the frame's labels.
                self._vote(self.frames[frame], wlabel, penalty=1)
            if flabel is not None:
                # The frame is a trusted context: update the word's labels.
                # Competing word labels are penalised by 0.75 per vote,
                # competing frame labels by 1.
                self._vote(self.lexicon[target], flabel, penalty=0.75)

    @staticmethod
    def _vote(scores, label, penalty):
        """Add 1 to ``label`` in ``scores`` and subtract ``penalty`` from every other label."""
        scores[label] += 1
        for other in scores:
            if other != label:
                scores[other] -= penalty

    @staticmethod
    def _best_label(scores, threshold):
        """Return the highest-scoring label if its score exceeds ``threshold``, else ``None``."""
        # ``scores`` is None for an unknown key. It can also be an empty
        # mapping: merely reading ``model.lexicon[word]`` on the nested
        # defaultdict creates the entry, and max() raises ValueError on an
        # empty sequence.
        if not scores:
            return None
        label, score = max(scores.items(), key=itemgetter(1))
        if score <= threshold:
            return None
        return label

    # TODO: This should not be taking the "max"
    def wlabel(self, word):
        """Return the trusted label for ``word``, or ``None`` if it has none."""
        # .get() does not trigger the defaultdict factory, so looking up an
        # unknown word does not add it to the lexicon.
        return self._best_label(self.lexicon.get(word), self.wthresh)

    def flabel(self, frame):
        """Return the trusted label for ``frame``, or ``None`` if it has none."""
        return self._best_label(self.frames.get(frame), self.fthresh)

    def words(self):
        """Yield ``(word, label)`` for every word that has a trusted label."""
        for word in self.lexicon:
            lbl = self.wlabel(word)
            if lbl is not None:
                yield (word, lbl)

    def ctxs(self):
        """Yield ``(frame, label)`` for every frame that has a trusted label."""
        for frm in self.frames:
            lbl = self.flabel(frm)
            if lbl is not None:
                yield (frm, lbl)

In [34]:
# Flatten the Brown corpus into one whitespace-joined string; train() will
# tokenize it again into sentences and words.
brown_text = " ".join(" ".join(snt) for snt in nltk.corpus.brown.sents())

mdl = Model()
mdl.train(brown_text)

In [37]:
# Trusted (word, label) pairs that are not among the SEEDS pairs.
set(mdl.words()).difference(SEEDS)

{('a', <Label.DET: 'DET'>),
 ('able', <Label.VERB_PAST: 'VERB_PAST'>),
 ('about', <Label.PREP: 'PREP'>),
 ('across', <Label.PREP: 'PREP'>),
 ('after', <Label.PREP: 'PREP'>),
 ('against', <Label.PREP: 'PREP'>),
 ('all', <Label.PREP: 'PREP'>),
 ('also', <Label.PREP: 'PREP'>),
 ('among', <Label.PREP: 'PREP'>),
 ('any', <Label.DET: 'DET'>),
 ('are', <Label.PREP: 'PREP'>),
 ('around', <Label.PREP: 'PREP'>),
 ('as', <Label.PREP: 'PREP'>),
 ('at', <Label.PREP: 'PREP'>),
 ('be', <Label.PREP: 'PREP'>),
 ('before', <Label.PREP: 'PREP'>),
 ('bottom', <Label.NOUN: 'NOUN'>),
 ('by', <Label.PREP: 'PREP'>),
 ('center', <Label.NOUN: 'NOUN'>),
 ('corner', <Label.NOUN: 'NOUN'>),
 ('could', <Label.PREP: 'PREP'>),
 ('couple', <Label.NOUN: 'NOUN'>),
 ('did', <Label.PREP: 'PREP'>),
 ('do', <Label.PREP: 'PREP'>),
 ('does', <Label.PREP: 'PREP'>),
 ('door', <Label.NOUN: 'NOUN'>),
 ('down', <Label.PREP: 'PREP'>),
 ('during', <Label.PREP: 'PREP'>),
 ('edge', <Label.NOUN: 'NOUN'>),
 ('end', <Label.NOUN: 'NOUN'>),