diff --git a/pythainlp/tag/_perceptron.py b/pythainlp/tag/_perceptron.py deleted file mode 100644 index 12bc1f07d..000000000 --- a/pythainlp/tag/_perceptron.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Averaged perceptron classifier. -Implementation geared for simplicity rather than efficiency. - -Code from -https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py -""" -import pickle -import random -from collections import defaultdict -from typing import Dict - - -class AveragedPerceptron(object): - - """ - An averaged perceptron, as implemented by Matthew Honnibal. - - See more implementation details here: - http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ - """ - - def __init__(self) -> None: - # Each feature gets its own weight vector, - # so weights is a dict-of-dicts - self.weights = {} - self.classes = set() - # The accumulated values, for the averaging. These will be keyed by - # feature/class tuples - self._totals = defaultdict(int) - # The last time the feature was changed, for the averaging. Also - # keyed by feature/class tuples - # (tstamps is short for timestamps) - self._tstamps = defaultdict(int) - # Number of instances seen - self.i = 0 - - def predict(self, features: Dict): - """ - Dot-product the features and current weights and return the best - label. 
- """ - scores = defaultdict(float) - for feat, value in features.items(): - if feat not in self.weights or value == 0: - continue - weights = self.weights[feat] - for label, weight in weights.items(): - scores[label] += value * weight - # Do a secondary alphabetic sort, for stability - return max(self.classes, key=lambda label: (scores[label], label)) - - def update(self, truth, guess, features: Dict) -> None: - """Update the feature weights.""" - - def upd_feat(c, f, w, v): - param = (f, c) - self._totals[param] += (self.i - self._tstamps[param]) * w - self._tstamps[param] = self.i - self.weights[f][c] = w + v - - self.i += 1 - if truth == guess: - return - for f in features: - weights = self.weights.setdefault(f, {}) - upd_feat(truth, f, weights.get(truth, 0.0), 1.0) - upd_feat(guess, f, weights.get(guess, 0.0), -1.0) - - def average_weights(self) -> None: - """Average weights from all iterations.""" - for feat, weights in self.weights.items(): - new_feat_weights = {} - for clas, weight in weights.items(): - param = (feat, clas) - total = self._totals[param] - total += (self.i - self._tstamps[param]) * weight - averaged = round(total / float(self.i), 3) - if averaged: - new_feat_weights[clas] = averaged - self.weights[feat] = new_feat_weights diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index 9ee3f3e55..ff3fe00a5 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -1,18 +1,96 @@ # -*- coding: utf-8 -*- """ -Code from -https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py +Perceptron Tagger. 
+ + This tagger is a port of the Textblob Averaged Perceptron Tagger +Author: Matthew Honnibal <honnibal+gh@gmail.com>, + Long Duong <longdt219@gmail.com> (NLTK port) + Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com> (PyThaiNLP port) +URL: <https://github.com/sloria/textblob-aptagger> + +Copyright 2013 Matthew Honnibal +NLTK modifications Copyright 2015 The NLTK Project +PyThaiNLP modifications Copyright 2020 PyThaiNLP Project + +This tagger is provided under the terms of the MIT License. """ + from __future__ import absolute_import -# import logging import os import pickle import random from collections import defaultdict -from typing import Dict, List, Tuple, Union +from typing import Dict, Iterable, List, Tuple, Union + + +class AveragedPerceptron(object): + """ + An averaged perceptron, as implemented by Matthew Honnibal. + + See more implementation details here: + http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ + """ + + def __init__(self) -> None: + # Each feature gets its own weight vector, + # so weights is a dict-of-dicts + self.weights = {} + self.classes = set() + # The accumulated values, for the averaging. These will be keyed by + # feature/class tuples + self._totals = defaultdict(int) + # The last time the feature was changed, for the averaging. Also + # keyed by feature/class tuples + # (tstamps is short for timestamps) + self._tstamps = defaultdict(int) + # Number of instances seen + self.i = 0 -from pythainlp.tag._perceptron import AveragedPerceptron + def predict(self, features: Dict): + """ + Dot-product the features and current weights and return the best + label. 
+ """ + scores = defaultdict(float) + for feat, value in features.items(): + if feat not in self.weights or value == 0: + continue + weights = self.weights[feat] + for label, weight in weights.items(): + scores[label] += value * weight + # Do a secondary alphabetic sort, for stability + return max(self.classes, key=lambda label: (scores[label], label)) + + def update(self, truth, guess, features: Dict) -> None: + """Update the feature weights.""" + + def upd_feat(c, f, w, v): + param = (f, c) + self._totals[param] += (self.i - self._tstamps[param]) * w + self._tstamps[param] = self.i + self.weights[f][c] = w + v + + self.i += 1 + if truth == guess: + return + for f in features: + weights = self.weights.setdefault(f, {}) + upd_feat(truth, f, weights.get(truth, 0.0), 1.0) + upd_feat(guess, f, weights.get(guess, 0.0), -1.0) + + def average_weights(self) -> None: + """Average weights from all iterations.""" + for feat, weights in self.weights.items(): + new_feat_weights = {} + for clas, weight in weights.items(): + param = (feat, clas) + total = self._totals[param] + total += (self.i - self._tstamps[param]) * weight + averaged = round(total / float(self.i), 3) + if averaged: + new_feat_weights[clas] = averaged + self.weights[feat] = new_feat_weights class PerceptronTagger: @@ -21,6 +99,20 @@ class PerceptronTagger: See more implementation details here: http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ + + >>> from pythainlp.tag import PerceptronTagger + >>> tagger = PerceptronTagger() + >>> data = [ + [("คน", "N"), ("เดิน", "V")], + [("แมว", "N"), ("เดิน", "V")], + [("คน", "N"), ("วิ่ง", "V")], + [("ปลา", "N"), ("ว่าย", "V")], + [("นก", "N"), ("บิน", "V")], + ] + >>> tagger.train(data) + >>> tagger.tag(["นก", "เดิน"]) + [('นก', 'N'), ('เดิน', 'V')] + + """ START = ["-START-", "-START2-"] @@ -38,7 +130,7 @@ def __init__(self, path: str = "") -> None: self.AP_MODEL_LOC = path self.load(self.AP_MODEL_LOC) - def tag(self, 
tokens: List[str]) -> List[Tuple[str, str]]: + def tag(self, tokens: Iterable[str]) -> List[Tuple[str, str]]: """Tags a string `tokens`.""" prev, prev2 = self.START output = [] @@ -56,7 +148,7 @@ def tag(self, tokens: List[str]) -> List[Tuple[str, str]]: def train( self, - sentences: List[List[Tuple[str, str]]], + sentences: Iterable[Iterable[Tuple[str, str]]], save_loc: Union[str, None] = None, nr_iter: int = 5, ) -> None: @@ -94,11 +186,9 @@ def train( c += guess == tags[i] n += 1 random.shuffle(sentences) - # logging.info( - # "Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)) - # ) self.model.average_weights() - # Pickle as a binary file + + # save the model if save_loc is not None: data = {} data["weights"] = self.model.weights @@ -174,7 +264,9 @@ def add(name: str, *args): add("i+2 word", context[i + 2]) return features - def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None: + def _make_tagdict( + self, sentences: Iterable[Iterable[Tuple[str, str]]] + ) -> None: """Make a tag dictionary for single-tag words.""" counts = defaultdict(lambda: defaultdict(int)) for sentence in sentences: @@ -190,8 +282,3 @@ def _make_tagdict(self, sentences: List[List[Tuple[str, str]]]) -> None: # Only add quite unambiguous words if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: self.tagdict[word] = tag - - -# for logging -# def _pc(n, d) -> float: -# return (float(n) / d) * 100