In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [89]:
import math
from collections import defaultdict, namedtuple
from dataclasses import dataclass, field
from enum import Enum
from operator import attrgetter, itemgetter
from typing import Union

import nltk

In [3]:
def trigrams(text):
    """Yield lower-cased word trigrams from ``text``, one sentence at a time.

    The text is split into sentences and each sentence into tokens with nltk.
    Tokens that are not purely alphabetic are dropped before the 3-grams are
    formed, so a trigram never spans two sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    for sentence in sentences:
        tokens = nltk.tokenize.word_tokenize(sentence)
        alphabetic = [token.lower() for token in tokens if token.isalpha()]
        yield from nltk.ngrams(alphabetic, 3)

In [4]:
class Label(Enum):
    """Coarse word categories used to tag both lexicon entries and frames.

    Each member's value is its own name as a string.
    """
    ADJ = "ADJ"
    ADV = "ADV"
    NOUN = "NOUN"
    PRO = "PRO"
    DET = "DET"
    CONJ = "CONJ"
    PREP = "PREP"
    VERB = "VERB"
    # Finer-grained categories, currently disabled:
    # VERB_PAST = "VERB_PAST"
    # VERB_PROG = "VERB_PROG"
    # NOUN_PLRL = "NOUN_PLRL"

In [5]:
# Hand-labelled seed vocabulary as (word, Label) pairs.  Model's default word
# list turns every pair into a Word with an infinite score.
SEEDS = [
    # PRO
    ("you", Label.PRO),
    ("we", Label.PRO),
    ("me", Label.PRO),
    # VERB
    ("come", Label.VERB),
    ("play", Label.VERB),
    ("put", Label.VERB),
    # PREP
    ("on", Label.PREP),
    ("out", Label.PREP),
    ("in", Label.PREP),
    # DET
    ("this", Label.DET),
    ("these", Label.DET),
    # NOUN
    ("baby", Label.NOUN),
    ("car", Label.NOUN),
    ("train", Label.NOUN),
    ("box", Label.NOUN),
    ("house", Label.NOUN),
    ("boy", Label.NOUN),
    ("man", Label.NOUN),
    ("book", Label.NOUN),
    # ADJ
    ("big", Label.ADJ),
    ("silly", Label.ADJ),
    ("green", Label.ADJ),
    # ADV
    ("well", Label.ADV),
    ("very", Label.ADV),
    ("now", Label.ADV),
    # CONJ
    ("and", Label.CONJ),
    ("or", Label.CONJ),
    ("but", Label.CONJ)
]

In [94]:
@dataclass
class Frame:
    """A two-sided context (left neighbour, right neighbour) and the label it predicts.

    Each side is either a literal word (``str``) or a ``Label`` standing for
    any word of that category; the three properties below classify the frame
    by which kind each side holds.

    Attributes:
        left: Word or category expected immediately before the target.
        right: Word or category expected immediately after the target.
        label: Category this frame assigns to the target word.
        score: Running evidence count for the frame; starts at 0.
    """
    # Previously annotated as plain ``str`` even though the properties below
    # explicitly test for ``Label`` values on either side.
    left: Union[str, Label]
    right: Union[str, Label]
    label: Label
    score: int = 0

    @property
    def is_lexical(self):
        """True when both sides are literal words."""
        return isinstance(self.left, str) and isinstance(self.right, str)

    @property
    def is_partial(self):
        """True when exactly one side is a literal word and the other a Label."""
        return (
            (isinstance(self.left, Label) and isinstance(self.right, str)) or
            (isinstance(self.left, str) and isinstance(self.right, Label))
        )

    @property
    def is_categorical(self):
        """True when both sides are Labels."""
        return isinstance(self.left, Label) and isinstance(self.right, Label)
    

@dataclass
class Word:
    """One (text, label) hypothesis about a word, with its running score.

    Attributes:
        text: The word form.
        label: Category hypothesised for the word.
        score: Evidence for this hypothesis.  Annotated ``float`` because seed
            words are created with ``math.inf`` and competing hypotheses are
            penalised in steps of 0.75; new entries start at 0.
    """
    text: str
    label: Label
    score: float = 0

In [101]:
# Frame = namedtuple("Frame", ["left", "right"])


@dataclass
class Model:
    """Grows a labelled lexicon from seed words and the trigram frames they occur in.

    Words whose score is strictly above ``wthresh`` form the trusted lexicon;
    frames whose score is strictly above ``fthresh`` are trusted contexts.
    ``train`` lets trusted words vote for the frame around them and trusted
    frames vote for the word they surround.
    """

    # Frame records, one per (left, right, label) triple, created by get_frame.
    frames: list = field(default_factory=list)
    # Word records, one per (text, label) pair.  Seeds start at +inf, so the
    # score penalties applied during training can never untrust them.
    words: list = field(
        default_factory=lambda: [Word(txt, lbl, math.inf) for txt, lbl in SEEDS]
    )
    # A frame / word counts as trusted once its score is strictly above these.
    fthresh: int = 15
    wthresh: int = 15

    def lexicon(self):
        """Yield every Word whose score is strictly above ``wthresh``."""
        for wrd in self.words:
            if wrd.score > self.wthresh:
                yield wrd

    def trusted_frames(self):
        """Yield every Frame whose score is strictly above ``fthresh``."""
        for frm in self.frames:
            if frm.score > self.fthresh:
                yield frm

    def get_from_lexicon(self, text):
        """Return the first trusted Word spelled ``text``, or None."""
        for wrd in self.lexicon():
            if wrd.text == text:
                return wrd
        return None

    def get_from_trusted_frames(self, left, right):
        """Return the first trusted Frame with exactly these sides, or None."""
        for frm in self.trusted_frames():
            if frm.left == left and frm.right == right:
                return frm
        return None

    def wlabel(self, text):
        """Return the label of the highest-scoring trusted Word for ``text``.

        Returns None when no entry for ``text`` is in the trusted lexicon.
        On a score tie the entry that appears first in ``words`` wins.
        """
        candidates = [wrd for wrd in self.lexicon() if wrd.text == text]
        if not candidates:
            return None
        return max(candidates, key=attrgetter("score")).label

    def get_best_frame(self, target, ctx, label):
        """Placeholder; not implemented yet and always returns None."""
        pass

    @staticmethod
    def _slot_matches(slot, token, token_label):
        """Tell whether one side of a frame accepts the context word ``token``."""
        if isinstance(slot, str):
            # Lexical slot: the context word must be exactly this string.
            return slot == token
        # Categorical slot: compare the stored label with the word's trusted label.
        return slot == token_label

    def applicable_frames(self, left, right):
        """Yield every frame whose two sides both accept the context words.

        A string side must equal the context word; a non-string (label) side
        must equal the context word's trusted label as given by ``wlabel``.
        """
        # The context labels are the same for every frame, so look them up once.
        left_label = self.wlabel(left)
        right_label = self.wlabel(right)
        for frm in self.frames:
            if not self._slot_matches(frm.left, left, left_label):
                continue
            if not self._slot_matches(frm.right, right, right_label):
                continue
            yield frm

    def get_frame(self, left, right, label):
        """Return the Frame for (left, right, label), creating it with score 0 if absent."""
        for frm in self.frames:
            if frm.left == left and frm.right == right and frm.label == label:
                return frm
        self.frames.append(Frame(left, right, label))
        return self.frames[-1]

    def get_word(self, text, label):
        """Return the Word for (text, label), creating it with score 0 if absent."""
        for wrd in self.words:
            if wrd.text == text and wrd.label == label:
                return wrd
        self.words.append(Word(text, label))
        return self.words[-1]

    def train(self, text):
        """Update frame and word scores from every trigram in ``text``.

        For each (left, target, right) trigram:

        * if ``target`` is trusted, the frame (left, right, label-of-target)
          gains 1 and every other applicable frame loses 1;
        * then, if a trusted frame matches (left, right), the word
          (target, frame label) gains 1 and every other label hypothesis for
          the same text loses 0.75.

        The second step sees the scores already updated by the first.
        """
        for left, target, right in trigrams(text):
            # The target word is part of the trusted lexicon.
            wlabel = self.wlabel(target)
            if wlabel is not None:
                frame = self.get_frame(left, right, wlabel)
                frame.score += 1
                # Penalise the competing frames for this context.
                for frm in self.applicable_frames(left, right):
                    if frm is not frame:
                        frm.score -= 1
            # The frame is a trusted context.
            trusted = self.get_from_trusted_frames(left, right)
            if trusted is not None:
                word = self.get_word(target, trusted.label)
                word.score += 1
                # Penalise the competing labels for this word.
                for wrd in self.words:
                    if wrd.text == word.text and wrd is not word:
                        wrd.score -= 0.75

    @staticmethod
    def _tolerated(freqs):
        """Return the keys of ``freqs`` whose count exceeds ``n - n / ln(n)``.

        ``n`` is the total count.  ``ln(1)`` is zero, so for ``n <= 1`` the
        threshold is taken as 0 instead of dividing by zero.
        """
        n = sum(freqs.values())
        threshold = n - (n / math.log(n)) if n > 1 else 0
        return [ctx for ctx in freqs if freqs[ctx] > threshold]

    def generalize(self):
        """Propose generalised frames and store them on the instance.

        For every label, the trusted frames carrying that label are abstracted
        by replacing the left side, the right side, or both with the trusted
        label of the word found there.  Abstractions whose frequency passes
        the tolerance threshold are recorded in ``self.lframes``
        (label, right word), ``self.rframes`` (left word, label) and
        ``self.cframes`` (label, label), each mapping to the predicted label.
        Results accumulate across all labels; the three dicts are rebuilt from
        scratch on every call.  ``train`` does not call this method.

        NOTE(review): the previous version selected frames through a
        ``self.flabel`` method that does not exist; using each trusted frame's
        own ``label`` is assumed to be the intent - confirm.
        """
        lframes, rframes, cframes = {}, {}, {}
        for lbl in Label:
            frms = [frm for frm in self.trusted_frames() if frm.label == lbl]
            # (left word, left label, right word, right label) per frame.
            contexts = [
                (frm.left, self.wlabel(frm.left), frm.right, self.wlabel(frm.right))
                for frm in frms
            ]
            # Sides whose word has no trusted label cannot be generalised, so
            # those contexts are left out before counting.
            lfreqs = nltk.FreqDist(
                (llabel, right)
                for _, llabel, right, _ in contexts
                if llabel is not None
            )
            rfreqs = nltk.FreqDist(
                (left, rlabel)
                for left, _, _, rlabel in contexts
                if rlabel is not None
            )
            cfreqs = nltk.FreqDist(
                (llabel, rlabel)
                for _, llabel, _, rlabel in contexts
                if llabel is not None and rlabel is not None
            )
            # Run the tolerance principle.
            for accepted, freqs in ((lframes, lfreqs), (rframes, rfreqs), (cframes, cfreqs)):
                for ctx in self._tolerated(freqs):
                    accepted[ctx] = lbl
        self.lframes, self.rframes, self.cframes = lframes, rframes, cframes

In [120]:
# Model.train takes raw text, so the pre-tokenised Brown sentences are glued
# back into a single space-separated string before training.
mdl = Model()
mdl.train(
    " ".join(
        " ".join(tokens) for tokens in nltk.corpus.brown.sents()
    )
)

KeyboardInterrupt: 

In [12]:
# (text, label) pairs the model trusts beyond the hand-picked seeds.
# `mdl.words` is a list of Word records, not a method, so the pairs are built
# from the trusted lexicon before the set difference is taken.
# NOTE(review): assumes only trusted words are of interest here - confirm.
{(wrd.text, wrd.label) for wrd in mdl.lexicon()} - set(SEEDS)

{('a', <Label.DET: 'DET'>),
 ('about', <Label.PREP: 'PREP'>),
 ('absence', <Label.NOUN: 'NOUN'>),
 ('across', <Label.PREP: 'PREP'>),
 ('after', <Label.PREP: 'PREP'>),
 ('against', <Label.PREP: 'PREP'>),
 ('all', <Label.PREP: 'PREP'>),
 ('also', <Label.PREP: 'PREP'>),
 ('among', <Label.PREP: 'PREP'>),
 ('amount', <Label.NOUN: 'NOUN'>),
 ('any', <Label.DET: 'DET'>),
 ('are', <Label.PREP: 'PREP'>),
 ('area', <Label.NOUN: 'NOUN'>),
 ('around', <Label.PREP: 'PREP'>),
 ('as', <Label.PREP: 'PREP'>),
 ('at', <Label.PREP: 'PREP'>),
 ('basis', <Label.NOUN: 'NOUN'>),
 ('be', <Label.PREP: 'PREP'>),
 ('because', <Label.PREP: 'PREP'>),
 ('before', <Label.PREP: 'PREP'>),
 ('bottle', <Label.NOUN: 'NOUN'>),
 ('bottom', <Label.NOUN: 'NOUN'>),
 ('by', <Label.PREP: 'PREP'>),
 ('called', <Label.PREP: 'PREP'>),
 ('can', <Label.PREP: 'PREP'>),
 ('center', <Label.NOUN: 'NOUN'>),
 ('corner', <Label.NOUN: 'NOUN'>),
 ('could', <Label.PREP: 'PREP'>),
 ('couple', <Label.NOUN: 'NOUN'>),
 ('day', <Label.NOUN: 'NOUN'

In [69]:
# Fresh model: by default its word list holds only the seed words.
mdl = Model()

In [71]:
# Zero the score of the first trusted word.  The lexicon only yields words
# scoring above wthresh, so with the default threshold of 15 this entry
# no longer counts as trusted.
list(mdl.lexicon())[0].score = 0

In [119]:
# Show every Word record the model holds, trusted or not.
mdl.words

[Word(text='you', label=<Label.PRO: 'PRO'>, score=inf),
 Word(text='we', label=<Label.PRO: 'PRO'>, score=inf),
 Word(text='me', label=<Label.PRO: 'PRO'>, score=inf),
 Word(text='come', label=<Label.VERB: 'VERB'>, score=inf),
 Word(text='play', label=<Label.VERB: 'VERB'>, score=inf),
 Word(text='put', label=<Label.VERB: 'VERB'>, score=inf),
 Word(text='on', label=<Label.PREP: 'PREP'>, score=inf),
 Word(text='out', label=<Label.PREP: 'PREP'>, score=inf),
 Word(text='in', label=<Label.PREP: 'PREP'>, score=inf),
 Word(text='this', label=<Label.DET: 'DET'>, score=inf),
 Word(text='these', label=<Label.DET: 'DET'>, score=inf),
 Word(text='baby', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='car', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='train', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='box', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='house', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='boy', label=<Label.NOUN: 'NOUN'>, score=inf),
 Word(text='man', 

In [None]:
getattr?