Add MIT License information for _tag_perceptron.py

bact committed Aug 23, 2020
1 parent aad2507 commit 9cc4dad
Showing 2 changed files with 104 additions and 99 deletions.
82 changes: 0 additions & 82 deletions pythainlp/tag/_perceptron.py

This file was deleted.

121 changes: 104 additions & 17 deletions pythainlp/tag/_tag_perceptron.py
@@ -1,18 +1,96 @@
# -*- coding: utf-8 -*-
"""
Code from
https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py
Perceptron Tagger.
This tagger is a port of the Textblob Averaged Perceptron Tagger
Author: Matthew Honnibal <honnibal+gh@gmail.com>,
Long Duong <longdt219@gmail.com> (NLTK port)
Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com> (PyThaiNLP port)
URL: <https://github.com/sloria/textblob-aptagger>
<https://nltk.org/>
Copyright 2013 Matthew Honnibal
NLTK modifications Copyright 2015 The NLTK Project
PyThaiNLP modifications Copyright 2020 PyThaiNLP Project
This tagger is provided under the terms of the MIT License.
"""

from __future__ import absolute_import

# import logging
import os
import pickle
import random
from collections import defaultdict
from typing import Dict, List, Tuple, Union
from typing import Dict, Iterable, List, Tuple, Union


class AveragedPerceptron(object):
"""
An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
"""

def __init__(self) -> None:
# Each feature gets its own weight vector,
# so weights is a dict-of-dicts
self.weights = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/class tuples
self._totals = defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/class tuples
# (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0

def predict(self, features: Dict):
"""
Dot-product the features and current weights and return the best
label.
"""
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features: Dict) -> None:
"""Update the feature weights."""

def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v

self.i += 1
if truth == guess:
return
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
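        # In other words: on a wrong guess, every feature that fired is
        # nudged +1.0 toward the correct tag and -1.0 away from the
        # predicted one, with upd_feat() doing the averaging bookkeeping.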

def average_weights(self) -> None:
"""Average weights from all iterations."""
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
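
    # Illustrative trace of the lazy averaging above (not part of the
    # upstream code): suppose a (feature, class) weight is 0.0 until
    # update 3, when it becomes 1.0; becomes 2.0 at update 7; and training
    # stops at self.i == 10. upd_feat() banks (7 - 3) * 1.0 = 4.0 into
    # _totals at update 7, average_weights() adds the final stretch
    # (10 - 7) * 2.0 = 6.0, and the averaged weight is
    # round(10.0 / 10, 3) == 1.0 -- the time-weighted mean of 0.0
    # (3 steps), 1.0 (4 steps), and 2.0 (3 steps).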


class PerceptronTagger:
@@ -21,6 +99,20 @@ class PerceptronTagger:
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
>>> from pythainlp.tag import PerceptronTagger
>>> tagger = PerceptronTagger()
    >>> data = [
    ...     [("คน", "N"), ("เดิน", "V")],
    ...     [("แมว", "N"), ("เดิน", "V")],
    ...     [("คน", "N"), ("วิ่ง", "V")],
    ...     [("ปลา", "N"), ("ว่าย", "V")],
    ...     [("นก", "N"), ("บิน", "V")],
    ... ]
>>> tagger.train(data)
    >>> tagger.tag(["นก", "เดิน"])
[('นก', 'N'), ('เดิน', 'V')]
"""

START = ["-START-", "-START2-"]
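    # (START supplies sentinel prev/prev2 tags so the first two tokens of
    # a sentence still get previous-tag context features in tag().)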
@@ -38,7 +130,7 @@ def __init__(self, path: str = "") -> None:
self.AP_MODEL_LOC = path
self.load(self.AP_MODEL_LOC)

def tag(self, tokens: List[str]) -> List[Tuple[str, str]]:
def tag(self, tokens: Iterable[str]) -> List[Tuple[str, str]]:
"""Tags a string `tokens`."""
prev, prev2 = self.START
output = []
@@ -56,7 +148,7 @@ def tag(self, tokens: List[str]) -> List[Tuple[str, str]]:

def train(
self,
sentences: List[List[Tuple[str, str]]],
sentences: Iterable[Iterable[Tuple[str, str]]],
save_loc: Union[str, None] = None,
nr_iter: int = 5,
) -> None:
@@ -94,11 +186,9 @@ def train(
c += guess == tags[i]
n += 1
random.shuffle(sentences)
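            # Note: random.shuffle() mutates its argument in place, so
            # `sentences` must be a mutable sequence (e.g. a list) even
            # though the annotation accepts any Iterable.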
# logging.info(
# "Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n))
# )
self.model.average_weights()
# Pickle as a binary file

# save the model
if save_loc is not None:
data = {}
data["weights"] = self.model.weights
@@ -174,7 +264,9 @@ def add(name: str, *args):
add("i+2 word", context[i + 2])
return features

def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None:
def _make_tagdict(
self, sentences: Iterable[Iterable[Tuple[str, str]]]
) -> None:
"""Make a tag dictionary for single-tag words."""
counts = defaultdict(lambda: defaultdict(int))
for sentence in sentences:
@@ -190,8 +282,3 @@ def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None:
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag
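                # e.g. a word tagged "N" in nearly all of its many
                # occurrences gets the fixed tag "N" here, which lets the
                # tagger skip the perceptron model for that word.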


# for logging
# def _pc(n, d) -> float:
# return (float(n) / d) * 100
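
A minimal end-to-end sketch (not part of this commit; the model file name
below is an assumption) showing how the pieces fit together: train on a toy
corpus, persist the model via `save_loc`, then reload it by passing `path`
to the constructor:

from pythainlp.tag import PerceptronTagger

train_data = [
    [("คน", "N"), ("เดิน", "V")],
    [("แมว", "N"), ("วิ่ง", "V")],
    [("นก", "N"), ("บิน", "V")],
]

tagger = PerceptronTagger()
tagger.train(train_data, save_loc="pos_model.pkl", nr_iter=5)

# In a later session, load the pickled model and tag unseen token lists.
loaded = PerceptronTagger(path="pos_model.pkl")
print(loaded.tag(["นก", "เดิน"]))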
