Add MIT License information for _tag_perceptron.py

bact committed Aug 23, 2020
1 parent aad2507 commit 9cc4dad
Showing 2 changed files with 104 additions and 99 deletions.
82 changes: 0 additions & 82 deletions pythainlp/tag/_perceptron.py

This file was deleted.

121 changes: 104 additions & 17 deletions pythainlp/tag/_tag_perceptron.py
@@ -1,18 +1,96 @@
# -*- coding: utf-8 -*-
"""
Code from
https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py
Perceptron Tagger.
This tagger is a port of the Textblob Averaged Perceptron Tagger
Author: Matthew Honnibal <honnibal+gh@gmail.com>,
Long Duong <longdt219@gmail.com> (NLTK port)
Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com> (PyThaiNLP port)
URL: <https://github.com/sloria/textblob-aptagger>
<https://nltk.org/>
Copyright 2013 Matthew Honnibal
NLTK modifications Copyright 2015 The NLTK Project
PyThaiNLP modifications Copyright 2020 PyThaiNLP Project
This tagger is provided under the terms of the MIT License.
"""

from __future__ import absolute_import

# import logging
import os
import pickle
import random
from collections import defaultdict
from typing import Dict, List, Tuple, Union
from typing import Dict, Iterable, List, Tuple, Union


class AveragedPerceptron(object):
"""
An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
"""

def __init__(self) -> None:
# Each feature gets its own weight vector,
# so weights is a dict-of-dicts
self.weights = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/class tuples
self._totals = defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/class tuples
# (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0

def predict(self, features: Dict):
"""
Dot-product the features and current weights and return the best
label.
"""
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features: Dict) -> None:
"""Update the feature weights."""

def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v

self.i += 1
if truth == guess:
return
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
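        # In other words: on a wrong guess, every feature that fired is
        # nudged +1.0 toward the correct tag and -1.0 away from the
        # predicted one, with upd_feat() doing the averaging bookkeeping.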

def average_weights(self) -> None:
"""Average weights from all iterations."""
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
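
    # Illustrative trace of the lazy averaging above (not part of the
    # upstream code): suppose a (feature, class) weight is 0.0 until
    # update 3, when it becomes 1.0; becomes 2.0 at update 7; and training
    # stops at self.i == 10. upd_feat() banks (7 - 3) * 1.0 = 4.0 into
    # _totals at update 7, average_weights() adds the final stretch
    # (10 - 7) * 2.0 = 6.0, and the averaged weight is
    # round(10.0 / 10, 3) == 1.0 -- the time-weighted mean of 0.0
    # (3 steps), 1.0 (4 steps), and 2.0 (3 steps).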


class PerceptronTagger:
@@ -21,6 +99,20 @@ class PerceptronTagger:
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
>>> from pythainlp.tag import PerceptronTagger
>>> tagger = PerceptronTagger()
    >>> data = [
    ...     [("คน", "N"), ("เดิน", "V")],
    ...     [("แมว", "N"), ("เดิน", "V")],
    ...     [("คน", "N"), ("วิ่ง", "V")],
    ...     [("ปลา", "N"), ("ว่าย", "V")],
    ...     [("นก", "N"), ("บิน", "V")],
    ... ]
>>> tagger.train(data)
    >>> tagger.tag(["นก", "เดิน"])
[('นก', 'N'), ('เดิน', 'V')]
"""

START = ["-START-", "-START2-"]
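    # (START supplies sentinel prev/prev2 tags so the first two tokens of
    # a sentence still get previous-tag context features in tag().)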
@@ -38,7 +130,7 @@ def __init__(self, path: str = "") -> None:
self.AP_MODEL_LOC = path
self.load(self.AP_MODEL_LOC)

def tag(self, tokens: List[str]) -> List[Tuple[str, str]]:
def tag(self, tokens: Iterable[str]) -> List[Tuple[str, str]]:
"""Tags a string `tokens`."""
prev, prev2 = self.START
output = []
@@ -56,7 +148,7 @@ def tag(self, tokens: List[str]) -> List[Tuple[str, str]]:

def train(
self,
sentences: List[List[Tuple[str, str]]],
sentences: Iterable[Iterable[Tuple[str, str]]],
save_loc: Union[str, None] = None,
nr_iter: int = 5,
) -> None:
@@ -94,11 +186,9 @@ def train(
c += guess == tags[i]
n += 1
random.shuffle(sentences)
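            # Note: random.shuffle() mutates its argument in place, so
            # `sentences` must be a mutable sequence (e.g. a list) even
            # though the annotation accepts any Iterable.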
# logging.info(
# "Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n))
# )
self.model.average_weights()
# Pickle as a binary file

# save the model
if save_loc is not None:
data = {}
data["weights"] = self.model.weights
@@ -174,7 +264,9 @@ def add(name: str, *args):
add("i+2 word", context[i + 2])
return features

def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None:
def _make_tagdict(
self, sentences: Iterable[Iterable[Tuple[str, str]]]
) -> None:
"""Make a tag dictionary for single-tag words."""
counts = defaultdict(lambda: defaultdict(int))
for sentence in sentences:
@@ -190,8 +282,3 @@ def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None:
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag
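                # e.g. a word tagged "N" in nearly all of its many
                # occurrences gets the fixed tag "N" here, which lets the
                # tagger skip the perceptron model for that word.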


# for logging
# def _pc(n, d) -> float:
# return (float(n) / d) * 100
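
A minimal end-to-end sketch (not part of this commit; the model file name
below is an assumption) showing how the pieces fit together: train on a toy
corpus, persist the model via `save_loc`, then reload it by passing `path`
to the constructor:

from pythainlp.tag import PerceptronTagger

train_data = [
    [("คน", "N"), ("เดิน", "V")],
    [("แมว", "N"), ("วิ่ง", "V")],
    [("นก", "N"), ("บิน", "V")],
]

tagger = PerceptronTagger()
tagger.train(train_data, save_loc="pos_model.pkl", nr_iter=5)

# In a later session, load the pickled model and tag unseen token lists.
loaded = PerceptronTagger(path="pos_model.pkl")
print(loaded.tag(["นก", "เดิน"]))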
