From 7d194e54ba190ce4bec235c975af6978456a50bb Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:27:13 +0700 Subject: [PATCH 01/73] move text processing functions to ulmfit/rules.py 1. Move all functions related to text processing to the new file `ulmfit/rules.py` 2. For the rules imported from `fastai` library, we copied the code to the pythainlp library 3. Use the code of BaseTokenizer class from `fastai` librarry --- pythainlp/ulmfit/__init__.py | 102 +++++++---------------------- pythainlp/ulmfit/rules.py | 122 +++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 79 deletions(-) create mode 100644 pythainlp/ulmfit/rules.py diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index 313f896b1..6a64549cb 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -5,21 +5,27 @@ """ import collections import re -from typing import List,Collection +from typing import List, Collection -import emoji import numpy as np import torch -from fastai.text import BaseTokenizer, TK_REP, TK_WREP -from fastai.text.transform import ( + +from pythainlp.corpus import download, get_corpus, get_corpus_path +from pythainlp.tokenize import Tokenizer +from pythainlp.util import normalize as normalize_char_order +from .rules import ( fix_html, replace_all_caps, rm_useless_spaces, spec_add_spaces, + replace_rep_after, + rm_useless_newlines, + rm_brackets, + ungroup_emoji, + lowercase_all, + replace_wrep_post, + BaseTokenizer ) -from pythainlp.corpus import download, get_corpus, get_corpus_path -from pythainlp.tokenize import Tokenizer -from pythainlp.util import normalize as normalize_char_order __all__ = [ "ThaiTokenizer", @@ -38,6 +44,7 @@ _THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") _pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") + # Download pretrained models def _get_path(fname: str) -> str: """ @@ -86,7 +93,7 @@ def tokenizer(text: str) -> List[str]: >>> >>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา" >>> ThaiTokenizer.tokenizer(text) - ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', + ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา', ' ', 'ภาวนามยปัญญา'] >>> >>> word_tokenize(text, engine='ulmfit') @@ -99,78 +106,11 @@ def tokenizer(text: str) -> List[str]: def add_special_cases(self, toks): pass - -def replace_rep_after(text: str) -> str: - """ - Replace repetitions at the character level in `text` after the repetition. - This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xrep 8 ย'; - instead it will retain the word as 'น้อย xrep 8' - """ - - def _replace_rep(m): - c, cc = m.groups() - return f"{c} {TK_REP} {len(cc)+1} " - - re_rep = re.compile(r"(\S)(\1{3,})") - - return re_rep.sub(_replace_rep, text) - -def replace_wrep_post(toks:Collection): - """Replace reptitive words post tokenization; - fastai `replace_wrep` does not work well with Thai.""" - previous_word = None - rep_count = 0 - res = [] - for current_word in toks+['xxend']: - if current_word==previous_word: - rep_count+=1 - elif (current_word!=previous_word) & (rep_count>0): - res += [TK_WREP,str(rep_count),previous_word] - rep_count=0 - else: - res.append(previous_word) - previous_word=current_word - return res[1:] - - -def rm_useless_newlines(text: str) -> str: - "Remove multiple newlines in `text`." - - return re.sub(r"[\n]{2,}", " ", text) - - -def rm_brackets(text: str) -> str: - "Remove all empty brackets from `t`." 
- new_line = re.sub(r"\(\)", "", text) - new_line = re.sub(r"\{\}", "", new_line) - new_line = re.sub(r"\[\]", "", new_line) - - return new_line - - -def ungroup_emoji(toks:Collection): - "Ungroup emojis" - - res = [] - for tok in toks: - if emoji.emoji_count(tok) == len(tok): - for char in tok: - res.append(char) - else: - res.append(tok) - - return res - - -def lowercase_all(toks:Collection): - "lowercase all English words" - return [tok.lower() for tok in toks] - - # Pretrained paths # TODO: Let the user decide if they like to download (at setup?) _THWIKI_LSTM = dict( - wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM) + wgts_fname=_get_path(_MODEL_NAME_LSTM), + itos_fname=_get_path(_ITOS_NAME_LSTM) ) # Preprocessing rules for Thai text @@ -183,7 +123,10 @@ def lowercase_all(toks:Collection): rm_useless_newlines, rm_brackets, ] -post_rules_th = [replace_all_caps, ungroup_emoji, lowercase_all, replace_wrep_post] +post_rules_th = [replace_all_caps, + ungroup_emoji, + lowercase_all, + replace_wrep_post] _tokenizer = ThaiTokenizer() @@ -231,7 +174,8 @@ def document_vector(text: str, learn, data, agg: str = "mean"): """ s = _tokenizer.tokenizer(text) - t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(device) + t = torch.tensor(data.vocab.numericalize(s), + requires_grad=False).to(device) m = learn.model[0].encoder.to(device) res = m(t).cpu().detach().numpy() if agg == "mean": diff --git a/pythainlp/ulmfit/rules.py b/pythainlp/ulmfit/rules.py new file mode 100644 index 000000000..468519c56 --- /dev/null +++ b/pythainlp/ulmfit/rules.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +import html +import emoji +import re +from typing import List, Collection + +TK_MAJ, TK_UP, TK_REP, TK_WREP = 'xxmaj', 'xxup', 'xxrep', 'xxwrep' +BOS, EOS, FLD, UNK, PAD = 'xxbos', 'xxeos', 'xxfld', 'xxunk', 'xxpad' + + +def fix_html(x: str) -> str: + """List of replacements from html strings in `x`. (code from `fastai`)""" + re1 = re.compile(r' +') + x = x.replace('#39;', "'").replace('amp;', '&').replace( + '#146;', "'").replace('nbsp;', ' ').replace( + '#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( + '
<br />', "\n").replace('\\"', '"').replace('<unk>', UNK).replace(
+        ' @.@ ', '.').replace(' @-@ ', '-').replace(' @,@ ', ',').replace(
+        '\\', ' \\ ')
+    return re1.sub(' ', html.unescape(x))
+
+
+def replace_all_caps(x: Collection[str]) -> Collection[str]:
+    """
+    Replace tokens in ALL CAPS in `x` by their lower version \
+    and add `TK_UP` before." (code from `fastai`)
+    """
+    res = []
+    for t in x:
+        if t.isupper() and len(t) > 1:
+            res.append(TK_UP)
+            res.append(t.lower())
+        else:
+            res.append(t)
+    return res
+
+
+def rm_useless_spaces(t: str) -> str:
+    """Remove multiple spaces in `t`. (code from `fastai`)"""
+    return re.sub(' {2,}', ' ', t)
+
+
+def spec_add_spaces(t: str) -> str:
+    """Add spaces around / and # in `t`. \n (code from `fastai`)"""
+    return re.sub(r'([/#\n])', r' \1 ', t)
+
+
+def replace_rep_after(text: str) -> str:
+    """
+    Replace repetitions at the character level in `text` after the repetition.
+    This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xrep 8 ย';
+    instead it will retain the word as 'น้อย xrep 8'
+    """
+
+    def _replace_rep(m):
+        c, cc = m.groups()
+        return f"{c} {TK_REP} {len(cc)+1} "
+
+    re_rep = re.compile(r"(\S)(\1{3,})")
+
+    return re_rep.sub(_replace_rep, text)
+
+
+def replace_wrep_post(toks: Collection):
+    """Replace repetitive words post tokenization;
+    fastai `replace_wrep` does not work well with Thai."""
+    previous_word = None
+    rep_count = 0
+    res = []
+    for current_word in toks+['xxend']:
+        if current_word == previous_word:
+            rep_count += 1
+        elif (current_word != previous_word) & (rep_count > 0):
+            res += [TK_WREP, str(rep_count), previous_word]
+            rep_count = 0
+        else:
+            res.append(previous_word)
+        previous_word = current_word
+    return res[1:]
+
+
+def rm_useless_newlines(text: str) -> str:
+    """Remove multiple newlines in `text`."""
+
+    return re.sub(r"[\n]{2,}", " ", text)
+
+
+def rm_brackets(text: str) -> str:
+    """Remove all empty brackets from `text`."""
+    new_line = re.sub(r"\(\)", "", text)
+    new_line = re.sub(r"\{\}", "", new_line)
+    new_line = re.sub(r"\[\]", "", new_line)
+
+    return new_line
+
+
+def ungroup_emoji(toks: Collection):
+    """Ungroup emojis"""
+
+    res = []
+    for tok in toks:
+        if emoji.emoji_count(tok) == len(tok):
+            for char in tok:
+                res.append(char)
+        else:
+            res.append(tok)
+
+    return res
+
+
+def lowercase_all(toks: Collection):
+    "lowercase all English words"
+    return [tok.lower() for tok in toks]
+
+
+class BaseTokenizer():
+    """Basic class for a tokenizer function.
(code from `fastai`)""" + def __init__(self, lang: str): self.lang = lang + + def tokenizer(self, t: str) -> List[str]: return t.split(' ') + + def add_special_cases(self, toks: Collection[str]): pass From 9784fe01a57f7fe1ade80e1aa159c3abf7eaa944 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:33:33 +0700 Subject: [PATCH 02/73] add test cases for text processing rules and BaseTokenizer --- tests/test_ulmfit.py | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 1401190d5..925349801 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -6,6 +6,8 @@ import unittest from pythainlp.ulmfit import * +from pythainlp.ulmfit.rules import * + class TestUlmfitPackage(unittest.TestCase): @@ -13,9 +15,68 @@ def test_ThaiTokenizer(self): self.thai = ThaiTokenizer() self.assertIsNotNone(self.thai.tokenizer("ทดสอบการตัดคำ")) self.assertIsNone(self.thai.add_special_cases(["แมว"])) + + def test_BaseTokenizer(self): + self.base = BaseTokenizer(lang='th') + self.assertIsNotNone(self.base.tokenizer("ทดสอบ การ ตัด คำ")) + self.assertIsNone(self.base.add_special_cases(["แมว"])) + def test_load_pretrained(self): self.assertIsNotNone(_THWIKI_LSTM) + def test_pre_rules_th(self): self.assertIsNotNone(pre_rules_th) + def test_post_rules_th(self): self.assertIsNotNone(post_rules_th) + + def test_fix_html(self): + self.assertEqual( + fix_html("Some HTML text
"), + "Some HTML& text\n") + + def test_rm_useless_spaces(self): + self.assertEqual( + rm_useless_spaces("Inconsistent use of spaces."), + "Inconsistent use of spaces.") + + def test_spec_add_spaces(self): + self.assertEqual( + spec_add_spaces("I #like to #put #hashtags #everywhere!"), + "I # like to # put # hashtags # everywhere!") + + def test_replace_all_caps(self): + self.assertEqual( + replace_all_caps(["Mark", "CAPITALIZED", "Only"]), + ["Mark", "xxup", "capitalized", "Only"]) + + def test_replace_rep_after(self): + self.assertEqual( + replace_rep_after("น้อยยยยยยยย"), + "น้อย xxrep 8 ") + + def test_rm_useless_newlines(self): + self.assertEqual( + rm_useless_newlines("text\n\n"), + "text ") + + def test_rm_brackets(self): + self.assertEqual( + rm_brackets("()()(ข้อความ)"), + "(ข้อความ)") + self.assertEqual( + rm_brackets("[][][ข้อความ]"), + "[ข้อความ]") + self.assertEqual( + rm_brackets("{}{}{ข้อความ}"), + "{ข้อความ}") + + def test_ungroup_emoji(self): + self.assertEqual( + ungroup_emoji("👍👍👍"), + ["👍", "👍", "👍"]) + + def test_lowercase_all(self): + self.assertEqual( + lowercase_all("HeLlO ."), + ['h', 'e', 'l', 'l', 'o', ' ', '.']) From a180a886187801a3f52e99994461946fc17d5190 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:35:34 +0700 Subject: [PATCH 03/73] add a blank line --- pythainlp/ulmfit/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index 6a64549cb..0b078b60a 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -123,6 +123,7 @@ def add_special_cases(self, toks): rm_useless_newlines, rm_brackets, ] + post_rules_th = [replace_all_caps, ungroup_emoji, lowercase_all, From e53c0186ff068c8cce376db7b48d188e8a32d4c5 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:35:42 +0700 Subject: [PATCH 04/73] format docstring --- pythainlp/ulmfit/rules.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pythainlp/ulmfit/rules.py b/pythainlp/ulmfit/rules.py index 468519c56..a81d63a7f 100644 --- a/pythainlp/ulmfit/rules.py +++ b/pythainlp/ulmfit/rules.py @@ -62,8 +62,10 @@ def _replace_rep(m): def replace_wrep_post(toks: Collection): - """Replace reptitive words post tokenization; - fastai `replace_wrep` does not work well with Thai.""" + """ + Replace reptitive words post tokenization; + fastai `replace_wrep` does not work well with Thai. 
+ """ previous_word = None rep_count = 0 res = [] @@ -109,7 +111,7 @@ def ungroup_emoji(toks: Collection): def lowercase_all(toks: Collection): - "lowercase all English words" + """lowercase all English words""" return [tok.lower() for tok in toks] From 064f791c65705a063525ec27b4fa7a5fd89818a3 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:37:23 +0700 Subject: [PATCH 05/73] remove `fastai` from the library dependencies --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 24f74e207..450fcfd7a 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ "icu": ["pyicu"], "ipa": ["epitran"], "ssg": ["ssg"], - "ml": ["fastai>=1.0.38", "keras", "numpy", "torch"], + "ml": ["keras", "numpy", "torch"], "ner": ["sklearn-crfsuite"], "thai2fit": ["emoji", "gensim", "numpy"], "thai2rom": ["torch", "numpy"], @@ -22,7 +22,6 @@ "artagger", "deepcut", "epitran", - "fastai>=1.0.38", "gensim", "keras", "numpy", From eb29b901e9e0d6678b4354410e67c4058d8b6102 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 10:45:07 +0700 Subject: [PATCH 06/73] refactor code due to conitive complexity Function ungroup_emoji has a Cognitive Complexity of 7 (exceeds 5 allowed). Consider refactoring. --- pythainlp/ulmfit/rules.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pythainlp/ulmfit/rules.py b/pythainlp/ulmfit/rules.py index a81d63a7f..bf07d7575 100644 --- a/pythainlp/ulmfit/rules.py +++ b/pythainlp/ulmfit/rules.py @@ -102,8 +102,7 @@ def ungroup_emoji(toks: Collection): res = [] for tok in toks: if emoji.emoji_count(tok) == len(tok): - for char in tok: - res.append(char) + res.append([char for char in tok]) else: res.append(tok) From 91355594d449ff1c35d8e585cb1262d730b9b4c4 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 19 Aug 2019 11:02:25 +0700 Subject: [PATCH 07/73] fix bug, ungroup emoji --- pythainlp/ulmfit/rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/ulmfit/rules.py b/pythainlp/ulmfit/rules.py index bf07d7575..e5aea85a2 100644 --- a/pythainlp/ulmfit/rules.py +++ b/pythainlp/ulmfit/rules.py @@ -102,7 +102,7 @@ def ungroup_emoji(toks: Collection): res = [] for tok in toks: if emoji.emoji_count(tok) == len(tok): - res.append([char for char in tok]) + res.extend([char for char in tok]) else: res.append(tok) From d44b03f8ecd96aaa3e6b014a7090f8548debc8e1 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Fri, 30 Aug 2019 15:57:48 +0700 Subject: [PATCH 08/73] NER : Add output like html tag --- pythainlp/tag/named_entity.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index be921a246..43623ebc2 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -90,7 +90,7 @@ def __init__(self): ) def get_ner( - self, text: str, pos: bool = True + self, text: str, pos: bool = True, tag:bool = False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ This function tags named-entitiy from text in IOB format. @@ -98,12 +98,14 @@ def get_ner( :param string text: text in Thai to be tagged :param boolean pos: To include POS tags in the results (`True`) or exclude (`False`). The defualt value is `True` - - :return: a list of tuple associated with tokenized word, NER tag, and - POS tag (if the parameter `pos` is specified as `True`). + :param boolean tag: output like html tag. 
:return: a list of tuple associated with tokenized word, NER tag,
+                 POS tag (if the parameter `pos` is specified as `True`),
+                 and output like html tag (if the parameter `tag` is
+                 specified as `True`).
                  Otherwise, return a list of tuple associated with tokenized
                  word and NER tag
-        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]]
+        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

        :Note:
            * For the POS tags to be included in the results, this function
@@ -137,6 +139,8 @@ def get_ner(
             ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'),
             (' ', 'I-TIME'), ('น.', 'I-TIME')]

+            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",tag=True)
+            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
@@ -145,13 +149,29 @@ def get_ner(
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

-        if pos:
+        self.sent_ner = [(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)]
+        if tag:
+            self.temp=""
+            self.sent=""
+            for idx,(word,ner) in enumerate(self.sent_ner):
+                if "B-" in ner:
+                    self.temp = ner.replace("B-","")
+                    self.sent += "<"+self.temp+">"
+                elif "O"== ner and self.temp!="":
+                    self.sent+="</"+self.temp+">"
+                    self.temp=""
+                self.sent += word
+                if idx == len(self.sent_ner)-1 and self.temp!="":
+                    self.sent += "</"+self.temp+">"
+            return self.sent
+        elif pos:
             return [
                 (self.__pos_tags[i][0], self.__pos_tags[i][1], data)
                 for i, data in enumerate(self.__y)
             ]
+        else:
+            return self.sent_ner

-        return [(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)]

    @staticmethod
    def __extract_features(doc):

From 9d4fab3fa0ff49020c86176e0d28df3a120c93d4 Mon Sep 17 00:00:00 2001
From: seth
Date: Fri, 30 Aug 2019 17:32:20 +0700
Subject: [PATCH 09/73] add attacut to pythainlp/tokenize

---
 .gitignore                     |   4 +
 pythainlp/tokenize/__init__.py | 226 +++++++++++++++++----------------
 pythainlp/tokenize/attacut.py  |  20 +++
 pythainlp/tokenize/deepcut.py  |  20 +--
 4 files changed, 153 insertions(+), 117 deletions(-)
 create mode 100644 pythainlp/tokenize/attacut.py

diff --git a/.gitignore b/.gitignore
index 3f6d595ff..6ebcbd727 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,3 +54,7 @@ coverage.xml

 # PyBuilder
 target/
+
+# System Files
+.vscode
+.DS_Store
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 0db2bbe75..69d2025c6 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
+from __future__ import absolute_import, unicode_literals
 import nltk
 import re
 import codecs
@@ -12,7 +12,7 @@


 def dict_word_tokenize(text, custom_dict_trie, engine='newmm'):
-    '''
+    '''
 :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.

 :param str text: the text to be tokenized
 :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
 :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)

 :return: A list of words, tokenized from a text.
''' - if engine=="newmm" or engine=="onecut": - from .newmm import mmcut as segment - elif engine=="mm" or engine=="multi_cut": - from .multi_cut import segment - elif engine=='longest-matching': - from .longest import segment - elif engine=='wordcutpy': - from .wordcutpy import segment - return segment(text, custom_dict_trie.keys()) - return segment(text, custom_dict_trie) - -def word_tokenize(text, engine='newmm',whitespaces=True): - """ + if engine == "newmm" or engine == "onecut": + from .newmm import mmcut as segment + elif engine == "mm" or engine == "multi_cut": + from .multi_cut import segment + elif engine == 'longest-matching': + from .longest import segment + elif engine == 'wordcutpy': + from .wordcutpy import segment + return segment(text, custom_dict_trie.keys()) + return segment(text, custom_dict_trie) + + +def word_tokenize(text, engine='newmm', whitespaces=True): + """ :param str text: the text to be tokenized :param str engine: the engine to tokenize text :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai. @@ -46,6 +47,7 @@ def word_tokenize(text, engine='newmm',whitespaces=True): * deepcut - ใช้ Deep Neural Network ในการตัดคำภาษาไทย * wordcutpy - ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ * cutkum - ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum) + * attacut - ใช้ AttaCut (https://github.com/PyThaiNLP/attacut) ในการตัดคำภาษาไทย :return: A list of words, tokenized from a text **Example**:: @@ -59,40 +61,44 @@ def word_tokenize(text, engine='newmm',whitespaces=True): e=word_tokenize(text,engine='newmm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด'] g=word_tokenize(text,engine='wordcutpy') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้านเกิด'] """ - if engine=='icu': - from .pyicu import segment - elif engine=='multi_cut' or engine=='mm': - from .multi_cut import segment - elif engine=='newmm' or engine=='onecut': - from .newmm import mmcut as segment - elif engine=='longest-matching': - from .longest import segment - elif engine=='pylexto': - from .pylexto import segment - elif engine=='deepcut': - from .deepcut import segment - elif engine=='wordcutpy': - from .wordcutpy import segment - else: - raise Exception("error no have engine.") - if whitespaces==False: - return [i.strip(' ') for i in segment(text) if i.strip(' ')!=''] - return segment(text) - -def sent_tokenize(text,engine='whitespace+newline'): - ''' - This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. 
+ if engine == 'icu': + from .pyicu import segment + elif engine == 'multi_cut' or engine == 'mm': + from .multi_cut import segment + elif engine == 'newmm' or engine == 'onecut': + from .newmm import mmcut as segment + elif engine == 'longest-matching': + from .longest import segment + elif engine == 'pylexto': + from .pylexto import segment + elif engine == 'deepcut': + from .deepcut import segment + elif engine == 'wordcutpy': + from .wordcutpy import segment + elif engine == 'attacut': + from .attacut import segment + else: + raise Exception("error no have engine.") + if whitespaces == False: + return [i.strip(' ') for i in segment(text) if i.strip(' ') != ''] + return segment(text) + + +def sent_tokenize(text, engine='whitespace+newline'): + ''' +This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. :param str text: the text to be tokenized :param str engine: choose between 'whitespace' or 'whitespace+newline' :return: a list of text, split by whitespace or new line. ''' - if engine=='whitespace': - data=nltk.tokenize.WhitespaceTokenizer().tokenize(text) - elif engine=='whitespace+newline': - data=re.sub(r'\n+|\s+','|',text,re.U).split('|') - return data + if engine == 'whitespace': + data = nltk.tokenize.WhitespaceTokenizer().tokenize(text) + elif engine == 'whitespace+newline': + data = re.sub(r'\n+|\s+', '|', text, re.U).split('|') + return data + def subword_tokenize(text, engine='tcc'): """ @@ -104,95 +110,99 @@ def subword_tokenize(text, engine='tcc'): from .tcc import tcc return tcc(text) -def isthai(text,check_all=False): - """ + +def isthai(text, check_all=False): + """ :param str text: input string or list of strings :param bool check_all: checks all character or not :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. """ - listext=list(text) - i=0 - num_isthai=0 - if check_all==True: - listthai=[] - while i= 3584 and cVal <= 3711): - num_isthai+=1 - if check_all==True: - listthai.append(True) - else: - if check_all==True: - listthai.append(False) - i+=1 - thai=(num_isthai/len(listext))*100 - if check_all==True: - dictthai=tuple(zip(listext,listthai)) - data= {'thai':thai,'check_all':dictthai} - else: - data= {'thai':thai} - return data + listext = list(text) + i = 0 + num_isthai = 0 + if check_all == True: + listthai = [] + while i < len(listext): + cVal = ord(listext[i]) + if (cVal >= 3584 and cVal <= 3711): + num_isthai += 1 + if check_all == True: + listthai.append(True) + else: + if check_all == True: + listthai.append(False) + i += 1 + thai = (num_isthai / len(listext)) * 100 + if check_all == True: + dictthai = tuple(zip(listext, listthai)) + data = {'thai': thai, 'check_all': dictthai} + else: + data = {'thai': thai} + return data + def syllable_tokenize(text): - """ + """ :param str text: input string to be tokenized :return: returns list of strings of syllables """ - text1=word_tokenize(text) - data=[] - trie = create_custom_dict_trie(custom_dict_source=get_data()) - if len(text1)>1: - i=0 - while i 1: + i = 0 + while i < len(text1): + data.extend( + dict_word_tokenize(text=text1[i], custom_dict_trie=trie)) + i += 1 + else: + data = dict_word_tokenize(text=text, custom_dict_trie=trie) + return data + def create_custom_dict_trie(custom_dict_source): - """The function is used to create a custom dict trie which will be used for word_tokenize() function. 
For more information on the trie data structure, see:https://marisa-trie.readthedocs.io/en/latest/index.html + """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see:https://marisa-trie.readthedocs.io/en/latest/index.html :param string/list custom_dict_source: a list of vocaburaries or a path to source file :return: A trie created from custom dict input """ - if type(custom_dict_source) is str: - # Receive a file path of the custom dict to read - with codecs.open(custom_dict_source, 'r',encoding='utf8') as f: - _vocabs = f.read().splitlines() - return Trie(_vocabs) - elif isinstance(custom_dict_source, (list, tuple, set)): - # Received a sequence type object of vocabs - return Trie(custom_dict_source) - else: - raise TypeError( - 'Type of custom_dict_source must be either str (path to source file) or collections' - ) + if type(custom_dict_source) is str: + # Receive a file path of the custom dict to read + with codecs.open(custom_dict_source, 'r', encoding='utf8') as f: + _vocabs = f.read().splitlines() + return Trie(_vocabs) + elif isinstance(custom_dict_source, (list, tuple, set)): + # Received a sequence type object of vocabs + return Trie(custom_dict_source) + else: + raise TypeError( + 'Type of custom_dict_source must be either str (path to source file) or collections' + ) + class Tokenizer: - def __init__(self, custom_dict=None): - """ + def __init__(self, custom_dict=None): + """ Initialize tokenizer object :param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) :return: trie_dict - a dictionary in the form of trie data for tokenizing engines """ - if custom_dict: - if type(custom_dict) is list: - self.trie_dict = Trie(custom_dict) - elif type(custom_dict) is str: - with codecs.open(custom_dict, 'r',encoding='utf8') as f: - vocabs = f.read().splitlines() - self.trie_dict = Trie(vocabs) - else: - self.trie_dict = Trie(get_dict()) - - def word_tokenize(self, text, engine='newmm'): - from .newmm import mmcut as segment - return segment(text, self.trie_dict) - + if custom_dict: + if type(custom_dict) is list: + self.trie_dict = Trie(custom_dict) + elif type(custom_dict) is str: + with codecs.open(custom_dict, 'r', encoding='utf8') as f: + vocabs = f.read().splitlines() + self.trie_dict = Trie(vocabs) + else: + self.trie_dict = Trie(get_dict()) + + def word_tokenize(self, text, engine='newmm'): + from .newmm import mmcut as segment + return segment(text, self.trie_dict) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py new file mode 100644 index 000000000..a6224c4f3 --- /dev/null +++ b/pythainlp/tokenize/attacut.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, unicode_literals +import sys +try: + import attacut +except ImportError: + '''ในกรณีที่ยังไม่ติดตั้ง attacut ในระบบ''' + from pythainlp.tools import install_package + install_package('attacut') + try: + import attacut + except ImportError: + sys.exit('Error ! 
using pip install attacut') + + +def segment(text, model='attacut-sc'): + # TODO + # Implement model options: 'attacut-sc'/'attacut-c' + Tokenizer = attacut.Tokenizer(model=model) + return Tokenizer.tokenize(text) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index c1ee34f32..0b9820957 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import,unicode_literals +from __future__ import absolute_import, unicode_literals import sys try: import deepcut except ImportError: - '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ''' - from pythainlp.tools import install_package - install_package('deepcut') - try: - import deepcut - except ImportError: - sys.exit('Error ! using pip install deepcut') + '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ''' + from pythainlp.tools import install_package + install_package('deepcut') + try: + import deepcut + except ImportError: + sys.exit('Error ! using pip install deepcut') + + def segment(text): - return deepcut.tokenize(text) \ No newline at end of file + return deepcut.tokenize(text) From 9d4fab3fa0ff49020c86176e0d28df3a120c93d4 Mon Sep 17 00:00:00 2001 From: seth Date: Fri, 30 Aug 2019 17:50:10 +0700 Subject: [PATCH 10/73] fix format --- pythainlp/tokenize/__init__.py | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 69d2025c6..bf797efe1 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -13,14 +13,14 @@ def dict_word_tokenize(text, custom_dict_trie, engine='newmm'): ''' - :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. + :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. - :param str text: the text to be tokenized - :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie - :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching) + :param str text: the text to be tokenized + :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie + :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching) - :return: A list of words, tokenized from a text. - ''' + :return: A list of words, tokenized from a text. + ''' if engine == "newmm" or engine == "onecut": from .newmm import mmcut as segment elif engine == "mm" or engine == "multi_cut": @@ -79,7 +79,7 @@ def word_tokenize(text, engine='newmm', whitespaces=True): from .attacut import segment else: raise Exception("error no have engine.") - if whitespaces == False: + if not whitespaces: return [i.strip(' ') for i in segment(text) if i.strip(' ') != ''] return segment(text) @@ -88,11 +88,11 @@ def sent_tokenize(text, engine='whitespace+newline'): ''' This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. - :param str text: the text to be tokenized - :param str engine: choose between 'whitespace' or 'whitespace+newline' + :param str text: the text to be tokenized + :param str engine: choose between 'whitespace' or 'whitespace+newline' - :return: a list of text, split by whitespace or new line. - ''' + :return: a list of text, split by whitespace or new line. 
+ ''' if engine == 'whitespace': data = nltk.tokenize.WhitespaceTokenizer().tokenize(text) elif engine == 'whitespace+newline': @@ -113,28 +113,28 @@ def subword_tokenize(text, engine='tcc'): def isthai(text, check_all=False): """ - :param str text: input string or list of strings - :param bool check_all: checks all character or not + :param str text: input string or list of strings + :param bool check_all: checks all character or not - :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. - """ + :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. + """ listext = list(text) i = 0 num_isthai = 0 - if check_all == True: + if check_all: listthai = [] while i < len(listext): cVal = ord(listext[i]) if (cVal >= 3584 and cVal <= 3711): num_isthai += 1 - if check_all == True: + if check_all: listthai.append(True) else: - if check_all == True: + if check_all: listthai.append(False) i += 1 thai = (num_isthai / len(listext)) * 100 - if check_all == True: + if check_all: dictthai = tuple(zip(listext, listthai)) data = {'thai': thai, 'check_all': dictthai} else: @@ -144,10 +144,10 @@ def isthai(text, check_all=False): def syllable_tokenize(text): """ - :param str text: input string to be tokenized + :param str text: input string to be tokenized - :return: returns list of strings of syllables - """ + :return: returns list of strings of syllables + """ text1 = word_tokenize(text) data = [] trie = create_custom_dict_trie(custom_dict_source=get_data()) @@ -165,10 +165,10 @@ def syllable_tokenize(text): def create_custom_dict_trie(custom_dict_source): """The function is used to create a custom dict trie which will be used for word_tokenize() function. 
For more information on the trie data structure, see:https://marisa-trie.readthedocs.io/en/latest/index.html - :param string/list custom_dict_source: a list of vocaburaries or a path to source file + :param string/list custom_dict_source: a list of vocaburaries or a path to source file - :return: A trie created from custom dict input - """ + :return: A trie created from custom dict input + """ if type(custom_dict_source) is str: # Receive a file path of the custom dict to read @@ -187,12 +187,12 @@ def create_custom_dict_trie(custom_dict_source): class Tokenizer: def __init__(self, custom_dict=None): """ - Initialize tokenizer object + Initialize tokenizer object - :param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) + :param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) - :return: trie_dict - a dictionary in the form of trie data for tokenizing engines - """ + :return: trie_dict - a dictionary in the form of trie data for tokenizing engines + """ if custom_dict: if type(custom_dict) is list: self.trie_dict = Trie(custom_dict) From ea703f23e2953f85db65c65235bec8d82dd9f365 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 16:30:46 +0700 Subject: [PATCH 11/73] add test cases for new option `tag` changes in this commit: - assert the output of the method `get_ner` when argument `tag` is set to True for all 13 tags as described in (https://github.com/wannaphongcom/thai-ner/tree/master/model/1.2) - assert the output of the method `get_ner` when argument `pos` is set to True - asser the output of the method `get_ner` when argument `pos` is set to False --- tests/test_tag.py | 99 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/tests/test_tag.py b/tests/test_tag.py index de290d03c..aad03aad9 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -87,6 +87,105 @@ def test_ner(self): จังหวัดหนองคาย 43000""" ) ) + + # arguement `tag` is True + self.assertEqual( + ner.get_ner( + "วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", + tag=True + ), + "วันที่ 15 ก.ย. 61 " + "ทดสอบระบบเวลา ") + + self.assertEqual( + ner.get_ner( + "url = https://thainlp.org/pythainlp/docs/2.0/", + tag=True + ), + "url = https://thainlp.org/pythainlp/docs/2.0/") + + self.assertEqual( + ner.get_ner( + "example@gmail.com", + tag=True + ), + "example@gmail.com") + + self.assertEqual( + ner.get_ner( + "รหัสไปรษณีย์ 19130", + tag=True + ), + "รหัสไปรษณีย์ 19130") + + self.assertEqual( + ner.get_ner( + "เบอร์โทรศัพท์ 00-120-1100", + tag=True + ), + "เบอร์โทรศัพท์ 00-120-1100") + + self.assertEqual( + ner.get_ner( + "อาจารย์เอกพล ประจำคณะวิสกรรมศาสตร์ ", + tag=True + ), + "อาจารย์เอกพล ประจำ" + "คณะวิสกรรมศาสตร์ ") + + self.assertEqual( + ner.get_ner( + "มาตรา 80 ให้ใช้อัตราภาษีร้อยละ 10.0" + " ในการคำนวณภาษีมูลค่าเพิ่ม", + tag=True + ), + "มาตรา 80 ให้ใช้อัตราภาษีร้อยละ 10.0" + " ในการคำนวณภาษีมูลค่าเพิ่ม") + + self.assertEqual( + ner.get_ner( + "ยาว 20 เซนติเมตร", + tag=True + ), + "ยาว 20 เซนติเมตร") + + self.assertEqual( + ner.get_ner( + "1 บาท", + pos=True, + tag=True), + "1 บาท") + + self.assertEqual( + ner.get_ner( + "ไทย", + pos=False, + tag=True + ), + "ไทย") + + # arguement `tag` is False and `pos` is True + self.assertEqual( + ner.get_ner( + "ไทย", + pos=True, + tag=False + ), + [('ไทย', 'PROPN', 'B-LOCATION')]) + + # arguement `tag` is False and `pos` is False + self.assertEqual( + ner.get_ner( + "วันที่ 15 ก.ย. 
61 ทดสอบระบบเวลา 14:49 น.", + pos=False, + tag=False + ), + [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), + (' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), + ('61', 'I-DATE'), (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'), + ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'), + ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]) + # self.assertEqual( # ner.get_ner("แมวทำอะไรตอนห้าโมงเช้า"), # [ From def738817beed7132f24a90001019e0afdbe6423 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 16:37:48 +0700 Subject: [PATCH 12/73] fix PEP8 issues --- pythainlp/tag/named_entity.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 43623ebc2..d480768fc 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -90,7 +90,7 @@ def __init__(self): ) def get_ner( - self, text: str, pos: bool = True, tag:bool = False + self, text: str, pos: bool = True, tag: bool = False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ This function tags named-entitiy from text in IOB format. @@ -99,9 +99,9 @@ def get_ner( :param boolean pos: To include POS tags in the results (`True`) or exclude (`False`). The defualt value is `True` :param boolean tag: output like html tag. - :return: a list of tuple associated with tokenized word, NER tag, + :return: a list of tuple associated with tokenized word, NER tag, POS tag (if the parameter `pos` is specified as `True`), - and output like html tag (if the parameter `tag` is + and output like html tag (if the parameter `tag` is specified as `True`). Otherwise, return a list of tuple associated with tokenized word and NER tag @@ -128,8 +128,8 @@ def get_ner( ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')] >>> - >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", \\ - pos=False) + >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", + pos=False) [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), @@ -139,7 +139,8 @@ def get_ner( ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')] - >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",tag=True) + >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", + tag=True) 'วันที่ 15 ก.ย. 
61 ทดสอบระบบเวลา ' """ self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER) @@ -149,19 +150,20 @@ def get_ner( self.__x_test = self.__extract_features(self.__pos_tags) self.__y = self.crf.predict_single(self.__x_test) - self.sent_ner = [(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)] + self.sent_ner = [(self.__pos_tags[i][0], data) + for i, data in enumerate(self.__y)] if tag: - self.temp="" - self.sent="" - for idx,(word,ner) in enumerate(self.sent_ner): + self.temp = "" + self.sent = "" + for idx, (word, ner) in enumerate(self.sent_ner): if "B-" in ner: - self.temp = ner.replace("B-","") + self.temp = ner.replace("B-", "") self.sent += "<"+self.temp+">" - elif "O"== ner and self.temp!="": - self.sent+="" - self.temp="" + elif "O" == ner and self.temp != "": + self.sent += "" + self.temp = "" self.sent += word - if idx == len(self.sent_ner)-1 and self.temp!="": + if idx == len(self.sent_ner)-1 and self.temp != "": self.sent += "" return self.sent elif pos: @@ -172,7 +174,6 @@ def get_ner( else: return self.sent_ner - @staticmethod def __extract_features(doc): return [_doc2features(doc, i) for i in range(len(doc))] From 723a052616dc00e05a5c0a767c2d6efe45872fd2 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 16:48:34 +0700 Subject: [PATCH 13/73] fix typo --- tests/test_tag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index aad03aad9..4c2a5c4ad 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -127,11 +127,11 @@ def test_ner(self): self.assertEqual( ner.get_ner( - "อาจารย์เอกพล ประจำคณะวิสกรรมศาสตร์ ", + "อาจารย์เอกพล ประจำคณะวิศวกรรมศาสตร์ ", tag=True ), "อาจารย์เอกพล ประจำ" - "คณะวิสกรรมศาสตร์ ") + "คณะวิศวกรรมศาสตร์ ") self.assertEqual( ner.get_ner( From 937aecb5d369e0728dbae3c99324253542b7e5a0 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 17:21:31 +0700 Subject: [PATCH 14/73] change input texts in two cases As thainer version 1.2 provide different results from previous version --- tests/test_tag.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index 4c2a5c4ad..dfb191455 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -120,10 +120,10 @@ def test_ner(self): self.assertEqual( ner.get_ner( - "เบอร์โทรศัพท์ 00-120-1100", + "เบอร์โทรศัพท์ 091-123-4567", tag=True ), - "เบอร์โทรศัพท์ 00-120-1100") + "เบอร์โทรศัพท์ 091-123-4567") self.assertEqual( ner.get_ner( @@ -135,11 +135,12 @@ def test_ner(self): self.assertEqual( ner.get_ner( - "มาตรา 80 ให้ใช้อัตราภาษีร้อยละ 10.0" + "มาตรา 80 ปพพ ให้ใช้อัตราภาษีร้อยละ 10.0" " ในการคำนวณภาษีมูลค่าเพิ่ม", tag=True ), - "มาตรา 80 ให้ใช้อัตราภาษีร้อยละ 10.0" + "มาตรา 80 ปพพ " + "ให้ใช้อัตราภาษีร้อยละ 10.0" " ในการคำนวณภาษีมูลค่าเพิ่ม") self.assertEqual( From 888aa785552490894d24e739d4360468b9305844 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 20:43:24 +0700 Subject: [PATCH 15/73] refactor the unittest --- tests/test_ulmfit.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 925349801..f2a10553c 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -2,11 +2,9 @@ import datetime import os -import sys import unittest from pythainlp.ulmfit import * -from pythainlp.ulmfit.rules import * class TestUlmfitPackage(unittest.TestCase): From 074a391b564f9e1a9d77c2b212b57146e77b3628 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 
2019 20:44:24 +0700 Subject: [PATCH 16/73] Add change in appveyor.yml (to fix build error) according to @bact (https://github.com/PyThaiNLP/pythainlp/pull/252#issuecomment-526821200) --- appveyor.yml | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 681b122b4..901206115 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,8 +2,11 @@ # https://www.lfd.uci.edu/~gohlke/pythonlibs/ build: off +image: Visual Studio 2015 environment: + global: + CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd" matrix: # - PYTHON: "C:/Python36" # PYTHON_VERSION: "3.6" @@ -24,31 +27,41 @@ environment: # ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" # PYICU_PKG: "https://www.dropbox.com/s/3xwdnwhdcu619x4/PyICU-2.3.1-cp37-cp37m-win32.whl?dl=1" -# - PYTHON: "C:/Python37-x64" -# PYTHON_VERSION: "3.7" -# PYTHON_ARCH: "64" -# ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" -# PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1" -# DISTUTILS_USE_SDK: "1" + - PYTHON: "C:/Python37-x64" + PYTHON_VERSION: "3.7" + PYTHON_ARCH: "64" + ARTAGGER_PKG: "https://github.com/wannaphongcom/artagger/tarball/master#egg=artagger" + PYICU_PKG: "https://www.dropbox.com/s/le5dckc3231opqt/PyICU-2.3.1-cp37-cp37m-win_amd64.whl?dl=1" + DISTUTILS_USE_SDK: "1" init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" # - ps: "ls C:/Python*" +platform: + - x64 + install: - "chcp 65001" - "set PYTHONIOENCODING=utf-8" + - "%PYTHON%\\python.exe -m pip install wheel" # - ECHO "Installed SDKs:" # - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\"" - - "%PYTHON%/python.exe --version" + - IF "%ARCH%"=="32" (call "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86) ELSE (ECHO "probably a 64bit build") + - IF "%ARCH%"=="64" (call "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" amd64) ELSE (ECHO "probably a 32bit build") + - '"%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" %PLATFORM%' + - ps: if (-not(Test-Path($env:PYTHON))) { & appveyor\install.ps1 } + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "python --version" # - "set ICU_VERSION=62" - - "%PYTHON%/python.exe -m pip install --upgrade pip" - - "%PYTHON%/python.exe -m pip install coveralls[yaml]" - - "%PYTHON%/python.exe -m pip install coverage" - - "%PYTHON%/python.exe -m pip install %PYICU_PKG%" - - "%PYTHON%/python.exe -m pip install %ARTAGGER_PKG%" - - "%PYTHON%/python.exe -m pip install -e .[full]" + - "pip install --disable-pip-version-check --user --upgrade pip setuptools" + - "pip install coveralls[yaml]" + - "pip install coverage" + - "pip install torch==1.2.0+cpu torchvision==0.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html" + - "pip install %PYICU_PKG%" + - "pip install %ARTAGGER_PKG%" + - "pip install -e .[full]" test_script: - - "%PYTHON%/python.exe -m pip --version" - - "%PYTHON%/python.exe setup.py test" + - "pip --version" + - "python setup.py test" From 53c812aba29673169026d1674e6ac3a7d5df1ca9 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 21:22:17 +0700 Subject: [PATCH 17/73] add test cases for new function in `ulmfit` Change in this commit: - add new test case: test_remove_space - add new test case: test_replace_wrep_post_nonum - add new test case: test_replace_wrep_post - add new test case: test_replace_rep_nonum - add new test case: test_replace_rep_after - remove a 
test case: test_replace_all_caps (as the function `replace_all_caps` was removed from /pythianlp/ulmfit/__init__.py) --- tests/test_ulmfit.py | 56 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index f2a10553c..2143dab6f 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -4,7 +4,25 @@ import os import unittest -from pythainlp.ulmfit import * +from pythainlp.ulmfit import ( + ThaiTokenizer, + BaseTokenizer, + fix_html, + _THWIKI_LSTM, + pre_rules_th, + post_rules_th, + rm_useless_spaces, + spec_add_spaces, + rm_useless_newlines, + rm_brackets, + ungroup_emoji, + lowercase_all, + replace_rep_nonum, + replace_rep_after, + replace_wrep_post, + replace_wrep_post_nonum, + remove_space +) class TestUlmfitPackage(unittest.TestCase): @@ -43,15 +61,38 @@ def test_spec_add_spaces(self): spec_add_spaces("I #like to #put #hashtags #everywhere!"), "I # like to # put # hashtags # everywhere!") - def test_replace_all_caps(self): - self.assertEqual( - replace_all_caps(["Mark", "CAPITALIZED", "Only"]), - ["Mark", "xxup", "capitalized", "Only"]) - def test_replace_rep_after(self): self.assertEqual( replace_rep_after("น้อยยยยยยยย"), - "น้อย xxrep 8 ") + "น้อยxxrep8 ") + + def test_replace_rep_nonum(self): + self.assertEqual( + replace_rep_nonum("น้อยยยยยยยย"), + "น้อย xxrep ") + + def test_replace_wrep_post(self): + self.assertEqual( + replace_wrep_post(["น้อย", "น้อย"]), + ["xxwrep", "1", "น้อย"]) + + self.assertEqual( + replace_wrep_post(["นก", "กา", "กา", "กา"]), + ["นก", "xxwrep", "2", "กา"]) + + def test_replace_wrep_post_nonum(self): + self.assertEqual( + replace_wrep_post_nonum(["น้อย", "น้อย"]), + ["xxwrep", "น้อย"]) + + self.assertEqual( + replace_wrep_post_nonum(["นก", "กา", "กา", "กา"]), + ["นก", "xxwrep", "กา"]) + + def test_remove_space(self): + self.assertEqual( + remove_space([" ", "น้อย", " ", "."]), + ["น้อย", "."]) def test_rm_useless_newlines(self): self.assertEqual( @@ -78,3 +119,4 @@ def test_lowercase_all(self): self.assertEqual( lowercase_all("HeLlO ."), ['h', 'e', 'l', 'l', 'o', ' ', '.']) + From d6ccb2c4da3fbee70e8ac25f5a9466622b34805a Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 22:32:36 +0700 Subject: [PATCH 18/73] add a test case for `pythainlp.ulmfit.process_text` --- tests/test_ulmfit.py | 57 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 2143dab6f..e867e1a57 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -4,6 +4,8 @@ import os import unittest +from pythainlp.corpus import get_corpus +from pythainlp.tokenize import Tokenizer from pythainlp.ulmfit import ( ThaiTokenizer, BaseTokenizer, @@ -11,6 +13,8 @@ _THWIKI_LSTM, pre_rules_th, post_rules_th, + pre_rules_th_sparse, + post_rules_th_sparse, rm_useless_spaces, spec_add_spaces, rm_useless_newlines, @@ -21,9 +25,13 @@ replace_rep_after, replace_wrep_post, replace_wrep_post_nonum, - remove_space + remove_space, + process_thai ) +_THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") +_pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") + class TestUlmfitPackage(unittest.TestCase): @@ -46,6 +54,12 @@ def test_pre_rules_th(self): def test_post_rules_th(self): self.assertIsNotNone(post_rules_th) + def test_pre_rules_th(self): + self.assertIsNotNone(pre_rules_th_sparse) + + def test_post_rules_th(self): + 
self.assertIsNotNone(post_rules_th_sparse) + def test_fix_html(self): self.assertEqual( fix_html("Some HTML text
"), @@ -120,3 +134,44 @@ def test_lowercase_all(self): lowercase_all("HeLlO ."), ['h', 'e', 'l', 'l', 'o', ' ', '.']) + def test_process_thai_1(self): + """rules for sparse features""" + + text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146" + + actual = process_thai(text) + expect = ["xxwrep", "👍", "#", "ana", "มาก", "xxrep", + " ", "xxwrep", "น้อย", ".", "1146"] + + self.assertEqual(actual, expect) + + def test_process_thai_2(self): + """rules for dense features""" + + text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146" + + actual = process_thai(text, + pre_rules=pre_rules_th, + post_rules=post_rules_th, + tok_func=_pythainlp_tokenizer.word_tokenize) + + # after pre_rules_th + # >>> "👍👍👍 # ana มาก xxrep 4 น้้อย xxwrep 1 .1146" + # + # after tokenize with word_tokenize(engine="newmm") + # >>> ["👍👍👍", " ", "#", "ana", " ", "มาก", "xxrep", "4", + # " ", "น้อย", "น้อย", " ", ".", "1146"] + # after post_rules_th + # -- because it performs `replace_wrep_post` before `ungroup_emoji`, + # 3 repetitive emoji are not marked with special token "xxwrep " + # + # >>> ["👍", "👍","👍", " ", "#", "ana", " ", "มาก", + # "xxrep", "4", " ", "xxwrep", "1", "น้อย", " ", + # ".", "1146"] + + expect = ["👍", "👍", "👍", " ", "#", " ", + "ana", " ", "มาก", "xxrep", "4", + " ", "xxwrep", "1", "น้อย", " ", + ".", "1146"] + + self.assertEqual(actual, expect) From c20ec56dc63c6779d8c3f6e28575d71c05aa64a8 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 22:37:41 +0700 Subject: [PATCH 19/73] edit test case description --- tests/test_ulmfit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index e867e1a57..d9e1b9d73 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -156,16 +156,16 @@ def test_process_thai_2(self): tok_func=_pythainlp_tokenizer.word_tokenize) # after pre_rules_th - # >>> "👍👍👍 # ana มาก xxrep 4 น้้อย xxwrep 1 .1146" + # >>> "👍👍👍 # ana มากxxrep4 น้้อยน้อย .1146" # # after tokenize with word_tokenize(engine="newmm") # >>> ["👍👍👍", " ", "#", "ana", " ", "มาก", "xxrep", "4", # " ", "น้อย", "น้อย", " ", ".", "1146"] # after post_rules_th # -- because it performs `replace_wrep_post` before `ungroup_emoji`, - # 3 repetitive emoji are not marked with special token "xxwrep " + # 3 repetitive emoji are not marked with special token "xxwrep num" # - # >>> ["👍", "👍","👍", " ", "#", "ana", " ", "มาก", + # >>> ["👍", "👍","👍", " ", "#", "ana", " ", "มาก", # "xxrep", "4", " ", "xxwrep", "1", "น้อย", " ", # ".", "1146"] From 3e90cc58537f3c8f8fb3c755c448d3a36f9f8b96 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sat, 31 Aug 2019 22:47:31 +0700 Subject: [PATCH 20/73] update test cases description --- tests/test_ulmfit.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index d9e1b9d73..5084f2dfe 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -140,6 +140,19 @@ def test_process_thai_1(self): text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146" actual = process_thai(text) + + # after pre_rules_th_sparse + # >>> "👍👍👍 # Ana มาก xxrep น้้อยน้อย .1146" + # + # after tokenize with word_tokenize(engine="newmm") + # >>> ["👍👍👍", " ", "#", " ","Ana", " ", "มาก", "xxrep", + # " ", "น้อย", "น้อย", " ", ".", "1146"] + # + # after post_rules_th + # - remove whitespace token (" ") + # >>> ["xxwrep, "👍", "#", "ana", "มาก", + # "xxrep", " ", "xxwrep", "น้อย", ".", "1146"] + expect = ["xxwrep", "👍", "#", "ana", "มาก", "xxrep", " ", "xxwrep", "น้อย", ".", "1146"] @@ 
-156,10 +169,10 @@ def test_process_thai_2(self): tok_func=_pythainlp_tokenizer.word_tokenize) # after pre_rules_th - # >>> "👍👍👍 # ana มากxxrep4 น้้อยน้อย .1146" + # >>> "👍👍👍 # Ana มากxxrep4 น้้อยน้อย .1146" # # after tokenize with word_tokenize(engine="newmm") - # >>> ["👍👍👍", " ", "#", "ana", " ", "มาก", "xxrep", "4", + # >>> ["👍👍👍", " ", "#", "Ana", " ", "มาก", "xxrep", "4", # " ", "น้อย", "น้อย", " ", ".", "1146"] # after post_rules_th # -- because it performs `replace_wrep_post` before `ungroup_emoji`, From 3292ba37dcbe73bac85f6422c98ff63a237b1926 Mon Sep 17 00:00:00 2001 From: seth Date: Sun, 1 Sep 2019 13:24:14 +0700 Subject: [PATCH 21/73] use high level api in attacut --- pythainlp/tokenize/attacut.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index a6224c4f3..84f3111fb 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -13,8 +13,5 @@ sys.exit('Error ! using pip install attacut') -def segment(text, model='attacut-sc'): - # TODO - # Implement model options: 'attacut-sc'/'attacut-c' - Tokenizer = attacut.Tokenizer(model=model) - return Tokenizer.tokenize(text) +def segment(text): + return attacut.tokenize(text) From 3e0d4d6613508a335681cc5fbe38d88212fc76f7 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Sun, 1 Sep 2019 13:27:43 +0700 Subject: [PATCH 22/73] merge --- bin/word-tokenization-benchmark | 121 +++++++++ docs/api/benchmarks.rst | 24 ++ pythainlp/benchmarks/__init__.py | 3 + pythainlp/benchmarks/word_tokenisation.py | 310 ++++++++++++++++++++++ tests/data/sentences.yml | 42 +++ tests/test_benchmarks.py | 85 ++++++ 6 files changed, 585 insertions(+) create mode 100644 bin/word-tokenization-benchmark create mode 100644 docs/api/benchmarks.rst create mode 100644 pythainlp/benchmarks/__init__.py create mode 100644 pythainlp/benchmarks/word_tokenisation.py create mode 100644 tests/data/sentences.yml create mode 100644 tests/test_benchmarks.py diff --git a/bin/word-tokenization-benchmark b/bin/word-tokenization-benchmark new file mode 100644 index 000000000..0193926d7 --- /dev/null +++ b/bin/word-tokenization-benchmark @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import json +import os +import argparse +import yaml + +from pythainlp.benchmarks import word_tokenisation + +parser = argparse.ArgumentParser( + description="Script for benchmarking tokenizaiton results" +) + +parser.add_argument( + "--input", + action="store", + help="path to file that you want to compare against the test file" +) + +parser.add_argument( + "--test-file", + action="store", + help="path to test file" +) + +parser.add_argument( + "--save-details", + default=False, + action='store_true', + help="specify whether to save the details of comparisons" +) + +args = parser.parse_args() + +def _read_file(path): + with open(path, "r", encoding="utf-8") as f: + lines = map(lambda r: r.strip(), f.readlines()) + return list(lines) + + +print(args.input) +actual = _read_file(args.input) +expected = _read_file(args.test_file) + +assert len(actual) == len(expected), \ + 'Input and test files do not have the same number of samples' +print('Benchmarking %s against %s with %d samples in total' % ( + args.input, args.test_file, len(actual) +)) + +df_raw = word_tokenisation.benchmark(expected, actual) + +df_res = df_raw\ + .describe() +df_res = df_res[[ + 'char_level:tp', + 'char_level:tn', + 'char_level:fp', + 'char_level:fn', + 'char_level:precision', + 
'char_level:recall', + 'char_level:f1', + 'word_level:precision', + 'word_level:recall', + 'word_level:f1', +]] + +df_res = df_res.T.reset_index(0) + +df_res['mean±std'] = df_res.apply( + lambda r: '%2.2f±%2.2f' % (r['mean'], r['std']), + axis=1 +) + +df_res['metric'] = df_res['index'] + +print("============== Benchmark Result ==============") +print(df_res[['metric', 'mean±std', 'min', 'max']].to_string(index=False)) + + + +if args.save_details: + data = {} + for r in df_res.to_dict('records'): + metric = r['index'] + del r['index'] + data[metric] = r + + dir_name = os.path.dirname(args.input) + file_name = args.input.split("/")[-1].split(".")[0] + + res_path = "%s/eval-%s.yml" % (dir_name, file_name) + print("Evaluation result is saved to %s" % res_path) + + with open(res_path, 'w') as outfile: + yaml.dump(data, outfile, default_flow_style=False) + + res_path = "%s/eval-details-%s.json" % (dir_name, file_name) + print("Details of comparisons is saved to %s" % res_path) + + with open(res_path, "w") as f: + samples = [] + for i, r in enumerate(df_raw.to_dict("records")): + expected, actual = r["expected"], r["actual"] + del r["expected"] + del r["actual"] + + samples.append(dict( + metrics=r, + expected=expected, + actual=actual, + id=i + )) + + details = dict( + metrics=data, + samples=samples + ) + + json.dump(details, f, ensure_ascii=False) diff --git a/docs/api/benchmarks.rst b/docs/api/benchmarks.rst new file mode 100644 index 000000000..e5167ee96 --- /dev/null +++ b/docs/api/benchmarks.rst @@ -0,0 +1,24 @@ +.. currentmodule:: pythainlp.benchmarks + +pythainlp.benchmarks +==================================== +The :class:`pythainlp.benchmarks` contains utility functions for benchmarking +tasked related to Thai NLP. At the moment, we have only for word tokenization. +Other tasks will be added soon. + +Modules +------- + +Tokenization +********* + +Quality +^^^^ +.. figure:: ../images/evaluation.png + :scale: 50 % + + Qualitative evaluation of word tokenization. + +.. autofunction:: pythainlp.benchmarks.word_tokenisation.compute_stats +.. autofunction:: pythainlp.benchmarks.word_tokenisation.benchmark +.. autofunction:: pythainlp.benchmarks.word_tokenisation.preprocessing diff --git a/pythainlp/benchmarks/__init__.py b/pythainlp/benchmarks/__init__.py new file mode 100644 index 000000000..711404da2 --- /dev/null +++ b/pythainlp/benchmarks/__init__.py @@ -0,0 +1,3 @@ +from .word_tokenisation import benchmark + +__all__ = ["benchmark"] \ No newline at end of file diff --git a/pythainlp/benchmarks/word_tokenisation.py b/pythainlp/benchmarks/word_tokenisation.py new file mode 100644 index 000000000..75e074b84 --- /dev/null +++ b/pythainlp/benchmarks/word_tokenisation.py @@ -0,0 +1,310 @@ +# -*- coding: utf-8 -*- + +import sys +import re + +import numpy as np +import pandas as pd + +SEPARATOR = "|" + +# regex for removing to a space surrounded by separators, i.e. | | +SURROUNDING_SEPS_RX = re.compile( + "{sep}? ?{sep}$".format(sep=re.escape(SEPARATOR)) +) + +# regex for removing repeated separators, i.e. |||| +MULTIPLE_SEPS_RX = re.compile("{sep}+".format(sep=re.escape(SEPARATOR))) + +# regex for removing tags, i.e. , +TAG_RX = re.compile("<\/?[A-Z]+>") + +# regex for tailing separator, i.e. 
a|dog| -> a|dog +TAILING_SEP_RX = re.compile("{sep}$".format(sep=re.escape(SEPARATOR))) + + +def _f1(precision: float, recall: float) -> float: + """ + Compute f1 + + :param float precision + :param float recall + + :return: f1 + :rtype: float + """ + if precision == recall == 0: + return 0 + return 2*precision*recall / (precision + recall) + + +def _flatten_result(my_dict: dict, sep: str = ":") -> dict: + """ + Flatten two-level dictionary + + Use keys in the first level as a prefix for keys in the two levels. + For example, + my_dict = { "a": { "b": 7 } } + flatten(my_dict) + { "a:b": 7 } + + + :param dict my_dict: contains stats dictionary + :param str sep: separator between the two keys (default: ":") + + :return: a one-level dictionary with key combined + :rtype: dict[str, float | str] + """ + items = [] + for k1, kv2 in my_dict.items(): + for k2, v in kv2.items(): + new_key = f"{k1}{sep}{k2}" + items.append((new_key, v)) + + return dict(items) + + +def benchmark(ref_samples: list, samples: list): + """ + Performace benchmark of samples + + Please see :meth:`pythainlp.benchmarks.word_tokenisation.compute_stats` for + metrics being computed. + + :param list[str] ref_samples: ground truth samples + :param list[str] samples: samples that we want to evaluate + + :return: dataframe with row x col = len(samples) x len(metrics) + :rtype: pandas.DataFrame + """ + + results = [] + for i, (r, s) in enumerate(zip(ref_samples, samples)): + try: + r, s = preprocessing(r), preprocessing(s) + if r and s: + stats = compute_stats(r, s) + stats = _flatten_result(stats) + stats["expected"] = r + stats["actual"] = s + results.append(stats) + except: + reason = """ +[Error] +Reason: %s + +Pair (i=%d) +--- label +%s +--- sample +%s +""" % (sys.exc_info(), i, r, s) + + raise SystemExit(reason) + + return pd.DataFrame(results) + + +def preprocessing(txt: str, remove_space: bool = True) -> str: + """ + Clean up text before performing evaluation + + :param str text: text to be preprocessed + :param bool remove_space: whether remove white space + + :return: preprocessed text + :rtype: str + """ + txt = re.sub(SURROUNDING_SEPS_RX, "", txt) + + if remove_space: + txt = re.sub("\s+", "", txt) + + txt = re.sub( + MULTIPLE_SEPS_RX, + SEPARATOR, + txt + ) + + txt = re.sub(TAG_RX, "", txt) + + txt = re.sub(TAILING_SEP_RX, "", txt).strip() + + return txt + + +def compute_stats(ref_sample: str, raw_sample: str) -> dict: + """ + Compute statistics for tokenization quality + + These statistics includes: + + **Character-Level**: + True Positive, False Positive, True Negative, False Negative, Precision, Recall, and f1 + **Word-Level**: + Precision, Recall, and f1 + **Other**: + - Correct tokenization indicator: {0, 1} sequence indicating the correspoding + word is tokenized correctly. 
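+
+    As a rough sketch of the formulas used in the code below:
+    precision = TP / (TP + FP), recall = TP / (TP + FN), and
+    f1 = 2 * precision * recall / (precision + recall). At the word level,
+    precision and recall divide the number of correctly tokenized words by
+    the number of predicted words and reference words, respectively.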
+ + :param str ref_sample: ground truth samples + :param str samples samples that we want to evaluate + + :return: metrics in character and word-level and correctly tokenized word indicators + :rtype: dict[str, float | str] + """ + ref_sample = _binary_representation(ref_sample) + sample = _binary_representation(raw_sample) + + # Compute charater-level statistics + c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0) + + c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]] + c_neg_pred = c_neg_pred[c_neg_pred < ref_sample.shape[0]] + + c_tp = np.sum(ref_sample[c_pos_pred] == 1) + c_fp = np.sum(ref_sample[c_pos_pred] == 0) + + c_tn = np.sum(ref_sample[c_neg_pred] == 0) + c_fn = np.sum(ref_sample[c_neg_pred] == 1) + + c_precision = c_tp / (c_tp + c_fp) + c_recall = c_tp / (c_tp + c_fn) + c_f1 = _f1(c_precision, c_recall) + + # Compute word-level statistics + word_boundaries = _find_word_boudaries(ref_sample) + + correctly_tokenised_words = _count_correctly_tokenised_words( + sample, + word_boundaries + ) + + w_precision = correctly_tokenised_words / np.sum(sample) + w_recall = correctly_tokenised_words / np.sum(ref_sample) + w_f1 = _f1(w_precision, w_recall) + + # Find correctly tokenized words in the sample + ss_boundaries = _find_word_boudaries(sample) + tokenisation_indicators = _find_words_correctly_tokenised( + word_boundaries, + ss_boundaries + ) + + tokenisation_indicators = list( + map(lambda x: str(x), tokenisation_indicators) + ) + + return { + 'char_level': { + 'tp': c_tp, + 'fp': c_fp, + 'tn': c_tn, + 'fn': c_fn, + 'precision': c_precision, + 'recall': c_recall, + 'f1': c_f1 + }, + 'word_level': { + 'precision': w_precision, + 'recall': w_recall, + 'f1': w_f1 + }, + 'global': { + 'tokenisation_indicators': "".join(tokenisation_indicators) + } + } + + +def _binary_representation(txt: str, verbose: bool = False): + """ + Transform text to {0, 1} sequence + + where (1) indicates that the corresponding character is the beginning of + a word. For example, ผม|ไม่|ชอบ|กิน|ผัก -> 10100... + + :param str txt: input text that we want to transform + :param bool verbose: for debugging purposes + + :return: {0, 1} sequence + :rtype: str + """ + + chars = np.array(list(txt)) + + boundary = np.argwhere(chars == SEPARATOR).reshape(-1) + boundary = boundary - np.array(range(boundary.shape[0])) + + bin_rept = np.zeros(len(txt) - boundary.shape[0]) + bin_rept[list(boundary) + [0]] = 1 + + sample_wo_seps = list(txt.replace(SEPARATOR, "")) + + # sanity check + assert len(sample_wo_seps) == len(bin_rept) + + if verbose: + for c, m in zip(sample_wo_seps, bin_rept): + print('%s -- %d' % (c, m)) + + return bin_rept + + +def _find_word_boudaries(bin_reps) -> list: + """ + Find start and end location of each word + + :param str bin_reps: binary representation of a text + + :return: list of tuples (start, end) + :rtype: list[tuple(int, int)] + """ + + boundary = np.argwhere(bin_reps == 1).reshape(-1) + start_idx = boundary + end_idx = boundary[1:].tolist() + [bin_reps.shape[0]] + + return list(zip(start_idx, end_idx)) + + +def _count_correctly_tokenised_words(bin_reps, word_boundaries) -> list: + """ + Count how many words are tokenized correctly + + :param str bin_reps: binary representation of a text + :param list[tuple(int, int)] word_boundaries: list of when each word starts and ends + + :return: no. 
correctly tokenized words + :rtype: int + """ + count = 0 + for st, end in word_boundaries: + pend = min(end, bin_reps.shape[0]) + if (bin_reps[st] == 1 and np.sum(bin_reps[st+1:pend]) == 0) \ + and ( + (pend == bin_reps.shape[0]) or + (pend != bin_reps.shape[0] and bin_reps[pend] == 1) + ): + count = count + 1 + + return count + + +def _find_words_correctly_tokenised( + ref_boundaries: list, + predicted_boundaries: list + ) -> tuple: + """ + Find whether each word is correctly tokenized + + :param list[tuple(int, int)] ref_boundaries: word boundaries of reference tokenization + :param list[tuple(int, int)] predicted_boundaries: word boundareies of predicted tokenization + + :return: binary sequence where 1 indicates the corresponding word is tokenized correctly + :rtype: tuple[int] + """ + + ref_b = dict(zip(ref_boundaries, [1]*len(ref_boundaries))) + + labels = tuple(map(lambda x: ref_b.get(x, 0), predicted_boundaries)) + return labels diff --git a/tests/data/sentences.yml b/tests/data/sentences.yml new file mode 100644 index 000000000..6d913a38d --- /dev/null +++ b/tests/data/sentences.yml @@ -0,0 +1,42 @@ +sentences: + - + expected: >- + ผม|ไม่|ชอบ|กิน|ผัก + actual: >- + ผม|ไม่|ชอบ|กิน|ผัก + - + expected: >- + ผม|ไม่|ชอบ|กิน|ผัก + actual: >- + ผม|ไม่|ชอบ|กินผัก + - + expected: >- + ผม|ไม่|ชอบ|กิน|ผัก| + actual: >- + ผม|ไม่|ชอบ|กินผัก| + - + expected: >- + ผม|ไม่|ชอบ|กินผัก| + actual: >- + ผม|ไม่|ชอบ|กิน|ผัก| +binary_sentences: + - + expected: "10001010" + actual: "10001010" + expected_count: 3 + - + expected: "10001010" + actual: "10101010" + expected_count: 2 + - + expected: "10101010" + actual: "10001010" + expected_count: 2 + - + expected: "10001010" + actual: "10001000" + expected_count: 1 + - + expected: "10001010" + actual: "10101000" + expected_count: 0 \ No newline at end of file diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py new file mode 100644 index 000000000..aad63fd76 --- /dev/null +++ b/tests/test_benchmarks.py @@ -0,0 +1,85 @@ +import datetime +import os +import sys +import unittest +import yaml +import numpy as np + +from pythainlp.benchmarks import word_tokenisation + +with open("./tests/data/sentences.yml", 'r') as stream: + TEST_DATA = yaml.safe_load(stream) + +class TestBenchmarksPackage(unittest.TestCase): + + def test_preprocessing(self): + self.assertIsNotNone(word_tokenisation.preprocessing( + txt="ทดสอบ การ ทำ ความสะอาด ข้อมูลok" + )) + + def test_benchmark_not_none(self): + self.assertIsNotNone(word_tokenisation.benchmark( + ["วัน", "จัน", "ทร์", "สี", "เหลือง"], + ["วัน", "จันทร์", "สี", "เหลือง"] + )) + + def test_binary_representation(self): + sentence = "อากาศ|ร้อน|มาก|ครับ" + rept = word_tokenisation._binary_representation(sentence) + + self.assertEqual( + [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], + rept.tolist() + ) + + def test_compute_stats(self): + for pair in TEST_DATA['sentences']: + exp, act = pair['expected'], pair['actual'] + + result = word_tokenisation.compute_stats( + word_tokenisation.preprocessing(exp), + word_tokenisation.preprocessing(act) + ) + + self.assertIsNotNone(result) + + def test_benchmark(self): + expected = [] + actual = [] + for pair in TEST_DATA['sentences']: + expected.append(pair['expected']) + actual.append(pair['actual']) + + df = word_tokenisation.benchmark(expected, actual) + + self.assertIsNotNone(df) + + def test_count_correctly_tokenised_words(self): + for d in TEST_DATA['binary_sentences']: + sample = np.array(list(d['actual'])).astype(int) + ref_sample = 
np.array(list(d['expected'])).astype(int) + + wb = list(word_tokenisation._find_word_boudaries(ref_sample)) + + self.assertEqual( + word_tokenisation._count_correctly_tokenised_words(sample, wb), + d['expected_count'] + ) + + def test_words_correctly_tokenised(self): + r = [(0, 2), (2, 10), (10, 12) ] + s = [(0, 10), (10, 12)] + + expected = "01" + + labels = word_tokenisation._find_words_correctly_tokenised(r, s) + self.assertEqual(expected, "".join(np.array(labels).astype(str))) + + def test_flatten_result(self): + result = dict( + key1=dict(v1=6), + key2=dict(v2=7) + ) + + actual = word_tokenisation._flatten_result(result) + self.assertEqual(actual, {'key1:v1': 6, 'key2:v2': 7}) \ No newline at end of file From 7adc2ea7ec11cf4376551a9395bccf20d9013f20 Mon Sep 17 00:00:00 2001 From: seth Date: Sun, 1 Sep 2019 16:19:02 +0700 Subject: [PATCH 23/73] fixed merge conflict in and --- pythainlp/tokenize/__init__.py | 533 +++++++++++++++++++++++---------- pythainlp/tokenize/deepcut.py | 42 ++- 2 files changed, 404 insertions(+), 171 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index bf797efe1..950413e33 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -1,208 +1,431 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import, unicode_literals -import nltk +""" +Thai tokenizers +""" import re -import codecs -from six.moves import zip -from pythainlp.corpus.thaisyllable import get_data -from pythainlp.corpus.thaiword import get_data as get_dict -from marisa_trie import Trie +import warnings +from typing import Iterable, List, Union -DEFAULT_DICT_TRIE = Trie(get_dict()) +from marisa_trie import Trie +from pythainlp.corpus import thai_syllables, thai_words +DEFAULT_DICT_TRIE = Trie(thai_words()) -def dict_word_tokenize(text, custom_dict_trie, engine='newmm'): - ''' - :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. - :param str text: the text to be tokenized - :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie - :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching) +def word_tokenize( + text: str, + custom_dict: Trie = None, + engine: str = "newmm", + keep_whitespace: bool = True, +) -> List[str]: + """ + This function tokenizes running text into words. + :param str text: text to be tokenized + :param str engine: name of the tokenizer to be used + :param marisa_trie.Trie custom_dict: marisa dictionary trie + :param bool keep_whitespace: True to keep whitespaces, a common mark + for end of phrase in Thai. + Otherwise, whitespaces are omitted. + :return: list of words + :rtype: list[str] + **Options for engine** + * *newmm* (default) - dictionary-based, Maximum Matching + + Thai Character Cluster + * *longest* - dictionary-based, Longest Matching + * *deepcut* - wrapper for + `deepcut `_, + language-model-based + * *icu* - wrapper for ICU (International Components for Unicode, + using PyICU), dictionary-based + * attacut - Wrapper for `AttaCut (https://github.com/PyThaiNLP/attacut)` + .. warning:: + * the option for engine named *ulmfit* has been deprecated since \ + PyThaiNLP version 2.1 + :Note: + - The parameter **custom_dict** can be provided as an argument \ + only for *newmm*, *longest*, and *deepcut* engine. 
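+        - The *attacut* engine currently ignores **custom_dict**; as wired
+          up below, the text is passed straight to ``attacut.tokenize()``.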
+ :Example: + Tokenize text with different tokenizer: + >>> from pythainlp.tokenize import word_tokenize + >>> + >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด" + >>> word_tokenize(text, engine="newmm") + ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> + >>> word_tokenize(text, engine="longest") + ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> + >>> word_tokenize(text, engine="deepcut") + ['โอเค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> + >>> word_tokenize(text, engine="icu") + ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด'] + >>> + >>> word_tokenize(text, engine="ulmfit") + ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + Tokenize text by omitiing whitespaces: + >>> from pythainlp.tokenize import word_tokenize + >>> + >>> text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว " + >>> word_tokenize(text, engine="newmm") + ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' '] + >>> word_tokenize(text, engine="newmm", keep_whitespace=False) + ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] + Tokenize with default and custom dictionary: + >>> from pythainlp.corpus.common import thai_words + >>> from pythainlp.tokenize import dict_trie, word_tokenize + >>> + >>> text = 'ชินโซ อาเบะ เกิด 21 กันยายน' + >>> word_tokenize(text, engine="newmm") + ​['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ', 'เกิด', ' ', + '21', ' ', 'กันยายน'] + >>> custom_dict_japanese_name = set(thai_words() + >>> custom_dict_japanese_name.add('ชินโซ') + >>> custom_dict_japanese_name.add('อาเบะ') + >>> trie = dict_trie(dict_source=custom_dict_japanese_name) + >>> word_tokenize(text, engine="newmm", custom_dict=trie)) + ['ชินโซ', ' ', 'อาเบะ', ' ', 'เกิด', ' ', '21', ' ', 'กันยายน'] + """ + if not text or not isinstance(text, str): + return [] - :return: A list of words, tokenized from a text. - ''' + segments = [] if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment + + segments = segment(text, custom_dict) + elif engine == "longest": + from .longest import segment + + segments = segment(text, custom_dict) elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment - elif engine == 'longest-matching': - from .longest import segment - elif engine == 'wordcutpy': - from .wordcutpy import segment - return segment(text, custom_dict_trie.keys()) - return segment(text, custom_dict_trie) + segments = segment(text, custom_dict) + elif engine == "deepcut": # deepcut can optionally use dictionary + from .deepcut import segment -def word_tokenize(text, engine='newmm', whitespaces=True): - """ - :param str text: the text to be tokenized - :param str engine: the engine to tokenize text - :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai. 
- :Parameters for engine: - * newmm - ใช้ Maximum Matching algorithm ในการตัดคำภาษาไทย โค้ดชุดใหม่ (ค่าเริ่มต้น) - * icu - engine ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ) - * longest-matching ใช้ Longest matching ในการตัดคำ - * mm ใช้ Maximum Matching algorithm - โค้ดชุดเก่า - * pylexto - ใช้ LexTo ในการตัดคำ - * deepcut - ใช้ Deep Neural Network ในการตัดคำภาษาไทย - * wordcutpy - ใช้ wordcutpy (https://github.com/veer66/wordcutpy) ในการตัดคำ - * cutkum - ใช้ Deep Neural Network ในการตัดคำภาษาไทย (https://github.com/pucktada/cutkum) - * attacut - ใช้ AttaCut (https://github.com/PyThaiNLP/attacut) ในการตัดคำภาษาไทย - :return: A list of words, tokenized from a text - - **Example**:: - - from pythainlp.tokenize import word_tokenize - text='ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด' - a=word_tokenize(text,engine='icu') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอ', 'เค', 'บ่', 'พวก', 'เรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้าน', 'เกิด'] - b=word_tokenize(text,engine='dict') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด'] - c=word_tokenize(text,engine='mm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด'] - d=word_tokenize(text,engine='pylexto') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด'] - e=word_tokenize(text,engine='newmm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด'] - g=word_tokenize(text,engine='wordcutpy') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้านเกิด'] - """ - if engine == 'icu': + if custom_dict: + custom_dict = list(custom_dict) + segments = segment(text, custom_dict) + else: + segments = segment(text) + elif engine == "icu": from .pyicu import segment - elif engine == 'multi_cut' or engine == 'mm': - from .multi_cut import segment - elif engine == 'newmm' or engine == 'onecut': - from .newmm import mmcut as segment - elif engine == 'longest-matching': - from .longest import segment - elif engine == 'pylexto': - from .pylexto import segment - elif engine == 'deepcut': - from .deepcut import segment - elif engine == 'wordcutpy': - from .wordcutpy import segment + + segments = segment(text) elif engine == 'attacut': from .attacut import segment - else: - raise Exception("error no have engine.") - if not whitespaces: - return [i.strip(' ') for i in segment(text) if i.strip(' ') != ''] - return segment(text) + segments = segment(text) + else: # default, use "newmm" engine + from .newmm import segment -def sent_tokenize(text, engine='whitespace+newline'): - ''' -This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. + segments = segment(text, custom_dict) - :param str text: the text to be tokenized - :param str engine: choose between 'whitespace' or 'whitespace+newline' - - :return: a list of text, split by whitespace or new line. 
- ''' - if engine == 'whitespace': - data = nltk.tokenize.WhitespaceTokenizer().tokenize(text) - elif engine == 'whitespace+newline': - data = re.sub(r'\n+|\s+', '|', text, re.U).split('|') - return data + if not keep_whitespace: + segments = [token.strip(" ") for token in segments if token.strip(" ")] + return segments -def subword_tokenize(text, engine='tcc'): +def dict_word_tokenize( + text: str, + custom_dict: Trie = DEFAULT_DICT_TRIE, + engine: str = "newmm", + keep_whitespace: bool = True, +) -> List[str]: """ + :meth: DEPRECATED: Please use `word_tokenize()` with a `custom_dict` + argument instead :param str text: text to be tokenized - :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. - :return: a list of tokenized strings. + :param dict custom_dict: a dictionary trie, or an iterable of words, + or a string of dictionary path + :param str engine: choose between different options of engine to token + (newmm [default], mm, longest, and deepcut) + :param bool keep_whitespace: True to keep whitespaces, a common mark + for end of phrase in Thai + :return: list of words """ - if engine == 'tcc': - from .tcc import tcc - return tcc(text) + warnings.warn( + "dict_word_tokenize is deprecated. Use word_tokenize with a custom_dict argument instead.", + DeprecationWarning, + ) + return word_tokenize( + text=text, + custom_dict=custom_dict, + engine=engine, + keep_whitespace=keep_whitespace, + ) -def isthai(text, check_all=False): + +def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: """ - :param str text: input string or list of strings - :param bool check_all: checks all character or not + This function does not yet automatically recognize when a sentence + actually ends. Rather it helps split text where white space and + a new line is found. + :param str text: the text to be tokenized + :param str engine: choose between *'whitespace'* or *'whitespace+newline'* + :return: list of splited sentences + :rtype: list[str] + **Options for engine** + * *whitespace+newline* (default) - split by whitespace token \ + and newline. + * *whitespace* - split by whitespace token. Specifiaclly, with \ + :class:`regex` pattern ``r" +"`` + :Example: + Split the text based on *whitespace* + >>> from pythainlp.tokenize import sent_tokenize + >>> sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" + >>> sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ + และได้รับมอบหมายให้ประจำในระดับภูมิภาค" + >>> sent_tokenize(sentence_1, engine="whitespace") + ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม'] + >>> sent_tokenize(sentence_2, engine="whitespace") + ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ', + '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค'] + Split the text based on *whitespace* and *newline* + >>> from pythainlp.tokenize import sent_tokenize + >>> sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม" + >>> sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\ + และได้รับมอบหมายให้ประจำในระดับภูมิภาค" + >>> sent_tokenize(sentence_1, engine="whitespace") + ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม'] + >>> sent_tokenize(sentence_2, engine="whitespace") + ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ', + 'และได้รับมอบหมายให้ประจำในระดับภูมิภาค'] + """ + + if not text or not isinstance(text, str): + return [] + + sentences = [] - :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. 
+ if engine == "whitespace": + sentences = re.split(r" +", text, re.U) + else: # default, use whitespace + newline + sentences = text.split() + + return sentences + + +def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: """ - listext = list(text) - i = 0 - num_isthai = 0 - if check_all: - listthai = [] - while i < len(listext): - cVal = ord(listext[i]) - if (cVal >= 3584 and cVal <= 3711): - num_isthai += 1 - if check_all: - listthai.append(True) - else: - if check_all: - listthai.append(False) - i += 1 - thai = (num_isthai / len(listext)) * 100 - if check_all: - dictthai = tuple(zip(listext, listthai)) - data = {'thai': thai, 'check_all': dictthai} - else: - data = {'thai': thai} - return data + This function tokenizes text into inseparable units of + Thai contiguous characters namely + `Thai Character Clusters (TCCs) \ + `_ + TCCs are the units based on Thai spelling feature that could not be + separated any character further such as 'ก็', 'จะ', 'ไม่', and 'ฝา'. + If the following units are separated, they could not be spelled out. + This function apply the TCC rules to tokenizes the text into + the smallest units. For example, the word 'ขนมชั้น' would be tokenized + into 'ข', 'น', 'ม', and 'ชั้น' + :param str text: text to be tokenized + :param str engine: the name subword tokenizer + :return: list of subwords + :rtype: list[str] + **Options for engine** + * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) + * *ssg* - CRF syllable segmenter for Thai. + * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) + [In development] + :Example: + Tokenize text into subword based on *tcc* + >>> from pythainlp.tokenize import subword_tokenize + >>> text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + >>> text_2 = "ความแปลกแยกและพัฒนาการ" + >>> subword_tokenize(text_1, engine='tcc') + ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก', 'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง', + 'ศ', '์', 'ห', 'มิ', 'ง'] + >>> subword_tokenize(text_2, engine='tcc') + ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก', + 'และ', 'พัฒ','นา', 'กา', 'ร'] + Tokenize text into subword based on *etcc* **(Work In Progress)** + >>> from pythainlp.tokenize import subword_tokenize + >>> text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง" + >>> text_2 = "ความแปลกแยกและพัฒนาการ" + >>> subword_tokenize(text_1, engine='etcc') + ['ยุคเริ่มแรกของ ราชวงศ์หมิง'] + >>> subword_tokenize(text_2, engine='etcc') + ['ความแปลกแยกและ', 'พัฒ', 'นาการ'] + """ + if not text or not isinstance(text, str): + return [] + if engine == "etcc": + from .etcc import segment + elif engine == "ssg": + from .ssg import segment + else: # default + from .tcc import segment -def syllable_tokenize(text): + return segment(text) + + +def syllable_tokenize(text: str, engine: str = "default") -> List[str]: """ + This function is to tokenize text into syllable (Thai: พยางค์), a unit of + pronunciation having one vowel sound. For example, the word 'รถไฟ' + contains two syallbles including 'รถ', and 'ไฟ'. + Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize` + with *newmm* as a tokenizer. The function tokenize the text with + the dictionary of Thai words from + :func:`pythainlp.corpus.common.thai_words` + and then dictionary of Thai syllable from + :func:`pythainlp.corpus.common.thai_syllables`. + As a result, only syllables are obtained. 
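+
+    Roughly, the *default* engine is equivalent to this sketch
+    (using the same functions already defined in this module)::
+
+        tokens = []
+        words = word_tokenize(text)                     # word pass (newmm)
+        trie = dict_trie(dict_source=thai_syllables())  # Thai syllable dict
+        for word in words:
+            tokens.extend(word_tokenize(word, custom_dict=trie))
+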
:param str text: input string to be tokenized - - :return: returns list of strings of syllables + :return: list of syllables where whitespaces in the text **are included** + :rtype: list[str] + **Options for engine** + * *default* + * *ssg* - CRF syllable segmenter for Thai. + :Example: + >>> from pythainlp.tokenize import syllable_tokenize + >>> + >>> text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า' + >>> syllable_tokenize(text) + ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว', + 'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า'] """ - text1 = word_tokenize(text) - data = [] - trie = create_custom_dict_trie(custom_dict_source=get_data()) - if len(text1) > 1: - i = 0 - while i < len(text1): - data.extend( - dict_word_tokenize(text=text1[i], custom_dict_trie=trie)) - i += 1 - else: - data = dict_word_tokenize(text=text, custom_dict_trie=trie) - return data + if not text or not isinstance(text, str): + return [] + + tokens = [] + if engine == "default": + words = word_tokenize(text) + trie = dict_trie(dict_source=thai_syllables()) + for word in words: + tokens.extend(word_tokenize(text=word, custom_dict=trie)) + else: + from .ssg import segment + tokens = segment(text) -def create_custom_dict_trie(custom_dict_source): - """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see:https://marisa-trie.readthedocs.io/en/latest/index.html + return tokens - :param string/list custom_dict_source: a list of vocaburaries or a path to source file - :return: A trie created from custom dict input +def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie: """ + Create a dict trie which will be used for word_tokenize() function. + For more information on the trie data structure, + see: `marisa-trie's Official Documentation \ + `_ + :param string/list dict_source: a list of vocaburaries or a path + to source file + :return: a trie created from a dictionary input + """ + trie = None - if type(custom_dict_source) is str: - # Receive a file path of the custom dict to read - with codecs.open(custom_dict_source, 'r', encoding='utf8') as f: + if isinstance(dict_source, Trie): + trie = dict_source + elif isinstance(dict_source, str): + # Receive a file path of the dict to read + with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() - return Trie(_vocabs) - elif isinstance(custom_dict_source, (list, tuple, set)): + trie = Trie(_vocabs) + elif isinstance(dict_source, Iterable): + # Note: Trie and str are both Iterable, Iterable check should be here # Received a sequence type object of vocabs - return Trie(custom_dict_source) + trie = Trie(dict_source) else: raise TypeError( - 'Type of custom_dict_source must be either str (path to source file) or collections' + "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)" ) + return trie + + class Tokenizer: - def __init__(self, custom_dict=None): + """ + This class allows users to pre-define custom dictionary along with + tokenizer and encapsulate them into one single object. 
+ It is an wrapper for both two functions including + :func:`pythainlp.tokenize.word_tokenize`, + and :func:`pythainlp.tokenize.dict_trie` + :Example: + Tokenizer object instantiated with :class:`marisa_trie.Trie` + >>> from pythainlp.tokenize import Tokenizer + >>> from pythainlp.tokenize import Tokenizer, dict_trie + >>> from pythainlp.corpus.common import thai_words + >>> + >>> custom_words_list = set(thai_words()) + >>> custom_words_list.add('อะเฟเซีย') + >>> custom_words_list.add('Aphasia') + >>> trie = dict_trie(dict_source=custom_words_list) + >>> + >>> text = "อะเฟเซีย (Aphasia*) เป็นอาการผิดปกติของการพูด" + >>> _tokenizer = Tokenizer(custom_dict=trie, engine='newmm') + ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', + 'ผิดปกติ', 'ของ', 'การ', 'พูด'] + Tokenizer object instantiated with a list of words + >>> from pythainlp.tokenize import Tokenizer + >>> from pythainlp.corpus.common import thai_words + >>> + >>> text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด" + >>> _tokenizer = Tokenizer(custom_dict=list(thai_words()), engine='newmm') + >>> _tokenizer.word_tokenize(text) + ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', + 'ผิดปกติ', 'ของ', 'การ', 'พูด'] + Tokenizer object instantiated with a file path containing list of + word separated with *newline* and explicitly set a new tokeneizer + after initiation. + >>> from pythainlp.tokenize import Tokenizer + >>> + >>> PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt' + >>> + >>> # write a file + >>> with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f: + >>> f.write('อะเฟเซีย\\nAphasia\\nผิด\\nปกติ') + >>> + >>> text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด" + >>> + >>> # initate an object from file with `deepcut` as tokenizer + >>> _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\ + engine='deepcut') + >>> _tokenizer.word_tokenize(text) + ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิด', + 'ปกติ', 'ของ', 'การ', 'พูด'] + >>> + >>> # change tokenizer to `newmm + >>> _tokenizer.set_tokenizer_engine(engine='newmm') + >>> _tokenizer.word_tokenize(text) + ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็นอาการ', 'ผิด', + 'ปกติ', 'ของการพูด'] + """ + + def __init__( + self, custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm" + ): """ Initialize tokenizer object - - :param str custom_dict: a file path or a list of vocaburaies to be used to create a trie (default - original lexitron) - - :return: trie_dict - a dictionary in the form of trie data for tokenizing engines + :param str: a file path, a list of vocaburaies* to be + used to create a trie, or an instantiated + :class:`marisa_trie.Trie` object. + :param str engine: choose between different options of engine to token + (i.e. 
*newmm*, *longest*, *deepcut*) """ + self.__trie_dict = None + self.__engine = engine if custom_dict: - if type(custom_dict) is list: - self.trie_dict = Trie(custom_dict) - elif type(custom_dict) is str: - with codecs.open(custom_dict, 'r', encoding='utf8') as f: - vocabs = f.read().splitlines() - self.trie_dict = Trie(vocabs) + self.__trie_dict = dict_trie(custom_dict) else: - self.trie_dict = Trie(get_dict()) + self.__trie_dict = DEFAULT_DICT_TRIE + + def word_tokenize(self, text: str) -> List[str]: + """ + :param str text: text to be tokenized + :return: list of words, tokenized from the text + :rtype: list[str] + """ + return word_tokenize(text, custom_dict=self.__trie_dict, engine=self.__engine) - def word_tokenize(self, text, engine='newmm'): - from .newmm import mmcut as segment - return segment(text, self.trie_dict) + def set_tokenize_engine(self, engine: str) -> None: + """ + Set the tokenizer + :param str engine: choose between different options of engine to token + (i.e. *newmm*, *longest*, *deepcut*) + """ + self.__engine = engine \ No newline at end of file diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 0b9820957..1ae7aeab1 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -1,17 +1,27 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, unicode_literals -import sys -try: - import deepcut -except ImportError: - '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ''' - from pythainlp.tools import install_package - install_package('deepcut') - try: - import deepcut - except ImportError: - sys.exit('Error ! using pip install deepcut') - - -def segment(text): +""" +Wrapper for deepcut Thai word segmentation. deepcut is a +Thai word segmentation library using Deep Neural, specifically, +1D Convolution Neural Network. +:See Also: + * `GitHub repository `_ +""" + +from typing import List, Union + +import deepcut + +from marisa_trie import Trie + + +def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]: + if not text or not isinstance(text, str): + return [] + + if custom_dict: + if isinstance(custom_dict, Trie): + custom_dict = list(custom_dict) + + return deepcut.tokenize(text, custom_dict) + return deepcut.tokenize(text) + From 9822e10b81166349533fa43deced446e759d8848 Mon Sep 17 00:00:00 2001 From: seth Date: Sun, 1 Sep 2019 16:32:21 +0700 Subject: [PATCH 24/73] fixed file format --- pythainlp/tokenize/deepcut.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 078e2155b..1ae7aeab1 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD """ Wrapper for deepcut Thai word segmentation. deepcut is a Thai word segmentation library using Deep Neural, specifically, @@ -26,21 +25,3 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[ return deepcut.tokenize(text) -======= -# -*- coding: utf-8 -*- -from __future__ import absolute_import, unicode_literals -import sys -try: - import deepcut -except ImportError: - '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ''' - from pythainlp.tools import install_package - install_package('deepcut') - try: - import deepcut - except ImportError: - sys.exit('Error ! 
using pip install deepcut') - -def segment(text): - return deepcut.tokenize(text) ->>>>>>> 73ba1ed161887deedbc505b79714779cd3673388 From 44a9f6bade585b74f239f94051f3bf11e06462da Mon Sep 17 00:00:00 2001 From: bact Date: Sun, 1 Sep 2019 23:11:12 +0700 Subject: [PATCH 25/73] Update requirements.txt --- requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 508e5ba90..06525e404 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -dill -marisa-trie==0.7.4 -nltk>=3.2.2 -pytz -requests -tinydb -tqdm +dill==0.3.* +marisa-trie==0.7.* +nltk==3.4.* +pytz==2019.2 +requests==2.22.* +tinydb==3.13.* +tqdm==4.35.* From 134e79bc9b6d93b52ed534b78f02b90108dffc35 Mon Sep 17 00:00:00 2001 From: bact Date: Sun, 1 Sep 2019 23:12:00 +0700 Subject: [PATCH 26/73] Update requirements.txt --- requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7fd66ad78..06525e404 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -dill -marisa-trie -nltk>=3.2.2 -pytz -requests -tinydb -tqdm +dill==0.3.* +marisa-trie==0.7.* +nltk==3.4.* +pytz==2019.2 +requests==2.22.* +tinydb==3.13.* +tqdm==4.35.* From d6c53d5a33d01588ec70f266751149ace0cd1456 Mon Sep 17 00:00:00 2001 From: seth Date: Mon, 2 Sep 2019 09:27:34 +0700 Subject: [PATCH 27/73] remove try-except from tokenize/attacut.py --- pythainlp/tokenize/__init__.py | 3 +++ pythainlp/tokenize/attacut.py | 26 ++++++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 6428ffb3a..95170eb4d 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -63,6 +63,9 @@ def word_tokenize( >>> >>> word_tokenize(text, engine="ulmfit") ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> tokenize.word_tokenize('โอเคบ่พวกเรารักภาษาบ้านเกิด', engine='attacut') + ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] + >>> Tokenize text by omitiing whitespaces: >>> from pythainlp.tokenize import word_tokenize >>> diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 84f3111fb..033753ee5 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -1,17 +1,15 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, unicode_literals -import sys -try: - import attacut -except ImportError: - '''ในกรณีที่ยังไม่ติดตั้ง attacut ในระบบ''' - from pythainlp.tools import install_package - install_package('attacut') - try: - import attacut - except ImportError: - sys.exit('Error ! 
using pip install attacut') +""" +Wrapper for AttaCut - a resonable fast thai word segmentation +:See Also: + * `GitHub repository `_ +""" +from typing import List, Union +import attacut -def segment(text): + +def segment(text: str): + if not text or not isinstance(text, str): + return [] + return attacut.tokenize(text) From 229d9eb2d79ebc19939036ede8bbd0c7c5a9c255 Mon Sep 17 00:00:00 2001 From: seth Date: Mon, 2 Sep 2019 10:14:44 +0700 Subject: [PATCH 28/73] add test for attacut --- tests/test_tokenize.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 9ddc4ed83..dcdcc9ef2 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -174,6 +174,16 @@ def test_word_tokenize_newmm(self): ["จุ๋ม", "ง่วง"], ) + + def test_word_tokenize_attacut(self): + self.assertEqual(attacut.segment(None), []) + self.assertEqual(attacut.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), + ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คนไทย'], + ) + + def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) From c496e3c2010cf33be139690d72c26650cf13574c Mon Sep 17 00:00:00 2001 From: seth Date: Mon, 2 Sep 2019 10:22:56 +0700 Subject: [PATCH 29/73] add test for attacut --- tests/test_tokenize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index dcdcc9ef2..a9c14da94 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -12,6 +12,7 @@ ) from pythainlp.tokenize import DEFAULT_DICT_TRIE, Tokenizer from pythainlp.tokenize import deepcut as tokenize_deepcut +from pythainlp.tokenize import attacut from pythainlp.tokenize import ( dict_trie, dict_word_tokenize, From d9888ca2c8dd928035b9137fad4af216c63d19e6 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:00:08 +0700 Subject: [PATCH 30/73] add documentation code document were added for the following functions: - process_thai - fix_html - replace_rep_after - replace_rep_nonum - replace_wrep_post - replace_wrep_post_nonum - remove_space --- docs/api/ulmfit.rst | 18 +++-- pythainlp/ulmfit/__init__.py | 143 +++++++++++++++++++++++++++++++++-- 2 files changed, 149 insertions(+), 12 deletions(-) diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst index 527336c64..9b7bdba28 100644 --- a/docs/api/ulmfit.rst +++ b/docs/api/ulmfit.rst @@ -6,13 +6,21 @@ The :class:`ulmfit.utils` is utils for ULMFit model. Modules ------- - -.. autofunction:: replace_rep_after +.. autoclass:: ThaiTokenizer +.. autofunction:: document_vector +.. autofunction:: process_thai +.. autofunction:: fix_html +.. autofunction:: spec_add_spaces +.. autofunction:: rm_useless_spaces .. autofunction:: rm_useless_newlines .. autofunction:: rm_brackets +.. autofunction:: replace_rep_nonum .. autofunction:: ungroup_emoji .. autofunction:: lowercase_all +.. autofunction:: replace_wrep_post +.. autofunction:: replace_wrep_post_nonum +.. autofunction:: replace_rep_after +.. autofunction:: remove_space .. autofunction:: merge_wgts -.. autofunction:: document_vector -.. 
autoclass:: ThaiTokenizer - :members: tokenizer + +:members: tokenizer diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index 18ce6a377..5db96824e 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -38,7 +38,20 @@ def add_special_cases(self, toks: Collection[str]): pass def fix_html(x: str) -> str: - """List of replacements from html strings in `x`. (code from `fastai`)""" + """ + List of replacements from html strings in `x`. (code from `fastai`) + + :param str x: text to replace html string + + :return: text where html strings are replaced + :rtype: str + + :Example: + + >>> from pythainlp.ulmfit import fix_html + >>> fix_html("Anbsp;amp;nbsp;B @.@ ") + A & B. + """ re1 = re.compile(r' +') x = x.replace('#39;', "'").replace('amp;', '&').replace( '#146;', "'").replace('nbsp;', ' ').replace( @@ -150,6 +163,20 @@ def replace_rep_after(text: str) -> str: Replace repetitions at the character level in `text` after the repetition. This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย' ;instead it will retain the word as 'น้อย xxrep 8' + + :param str text: input text to replace character repetition + + :return: text with repetitive token **xxrep** and the counter + after character repetition + + :rtype: str + :Example: + + >>> from pythainlp.ulmfit import replace_rep_after + >>> + >>> text = "กาาาาาาา" + >>> replace_rep_after(text) + 'กาxxrep7 ' """ def _replace_rep(m): @@ -163,8 +190,23 @@ def _replace_rep(m): def replace_wrep_post(toks: Collection): """ - Replace reptitive words post tokenization; + Replace reptitive words post tokenization; fastai `replace_wrep` does not work well with Thai. + + :param list[str] toks: list of tokens + + :return: list of tokens where **xxwrep** token and the counter + is added in front of repetitive words. + :rtype: list[str] + + :Example: + + >>> from pythainlp.ulmfit import replace_wrep_post_nonum + >>> + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post(toks) + ['กา', 'xxwrep', '3', 'น้ำ'] + """ previous_word = None rep_count = 0 @@ -218,8 +260,23 @@ def lowercase_all(toks: Collection): def replace_rep_nonum(text: str) -> str: """ Replace repetitions at the character level in `text` after the repetition. - This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xrep 8 ย'; - instead it will retain the word as 'น้อย xrep 8' + This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep ย'; + instead it will retain the word as 'น้อย xxrep ' + + :param str text: input text to replace character repetition + + :return: text with repetitive token **xxrep** after + character repetition + :rtype: str + + :Example: + + >>> from pythainlp.ulmfit import replace_rep_nonum + >>> + >>> text = "กาาาาาาา" + >>> replace_rep_nonum(text) + 'กา xxrep ' + """ def _replace_rep(m): c, cc = m.groups() @@ -232,6 +289,21 @@ def replace_wrep_post_nonum(toks: Collection): """ Replace reptitive words post tokenization; fastai `replace_wrep` does not work well with Thai. + + :param list[str] toks: list of tokens + + :return: list of tokens where **xxwrep** token is added in front of + repetitive words. 
+ :rtype: list[str] + + :Example: + + >>> from pythainlp.ulmfit import replace_wrep_post_nonum + >>> + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post_nonum(toks) + ['กา', 'xxwrep', 'น้ำ'] + """ previous_word = None rep_count = 0 @@ -251,6 +323,11 @@ def replace_wrep_post_nonum(toks: Collection): def remove_space(toks: Collection): """ Do not include space for bag-of-word models. + + :param list[str] toks: list of tokens + + :return: list of tokens where space tokens (" ") are filtered out + :rtype: list[str] """ res = [] for t in toks: @@ -289,11 +366,63 @@ def process_thai(text: str, pre_rules: Collection = pre_rules_th_sparse, post_rules: Collection = post_rules_th_sparse) -> Collection[str]: """ Process Thai texts for models (with sparse features as default) + :param str text: text to be cleaned - :param pre_rules List: rules to apply before tokenization - :param tok_func Callable: tokenization function - :param post_rules List: rules to apply after tokenizations + :param list[func] pre_rules: rules to apply before tokenization. + :param func tok_func: tokenization function (by default, **tok_func** is + :func:`pythainlp.tokenize.word_tokenize`) + + :param list[func] post_rules: rules to apply after tokenizations + :return: a list of cleaned tokenized texts + :rtype: list[str] + + + :Note: + - The default **pre-rules** consists of :func:`fix_html`, + :func:`pythainlp.util.normalize`, + :func:`spec_add_spaces`, + :func:`rm_useless_spaces`, + :func:`rm_useless_newlines`, + :func:`rm_brackets` + and :func:`replace_rep_nonum`. + + - The default **post-rules** consists of :func:`ungroup_emoji`, + :func:`lowercase_all`, :func:`replace_wrep_post_nonum`, + and :func:`remove_space`. + + :Example: + + 1. Use default pre-rules and post-rules: + + >>> from pythainlp.ulmfit import process_thai + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text) + [บ้าน', 'xxrep', ' ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'pythainlp', '&'] + + 2. 
Modify pre_rules and post_rules arugments with + rules provided in :mod:`pythainlp.ulmfit`: + + >>> from pythainlp.ulmfit import ( + process_thai, + replace_rep_after, + fix_html, + ungroup_emoji, + replace_wrep_post, + remove_space) + >>> + >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp; " + >>> process_thai(text, + pre_rules=[replace_rep_after, fix_html], + post_rules=[ungroup_emoji, + replace_wrep_post, + remove_space] + ) + ['บ้าน', 'xxrep', '5', '()', 'อยู่', 'xxwrep', '2', 'นาน', '😂', '🤣', + '😃', '😄', '😅', 'PyThaiNLP', '&'] + + """ res = text for pre in pre_rules: From de69a54d839ae4ed2ed9f5280c1bf716fcbb3c81 Mon Sep 17 00:00:00 2001 From: seth Date: Mon, 2 Sep 2019 11:09:39 +0700 Subject: [PATCH 31/73] add test for attacut --- pythainlp/tokenize/attacut.py | 4 ++-- setup.py | 2 ++ tests/test_tokenize.py | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py index 033753ee5..ea6c948b6 100644 --- a/pythainlp/tokenize/attacut.py +++ b/pythainlp/tokenize/attacut.py @@ -1,7 +1,7 @@ """ -Wrapper for AttaCut - a resonable fast thai word segmentation +Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai :See Also: - * `GitHub repository `_ + * `GitHub repository `_ """ from typing import List, Union diff --git a/setup.py b/setup.py index 85739ef8b..364f4901e 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ extras = { "artagger": ["artagger"], + "attacut": ["attacut"], "deepcut": ["deepcut", "keras", "tensorflow"], "icu": ["pyicu"], "ipa": ["epitran"], @@ -21,6 +22,7 @@ "benchmarks": ["numpy", "pandas"], "full": [ "artagger", + "attacut", "deepcut", "epitran", "fastai>=1.0.38", diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index a9c14da94..67da3249f 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -71,6 +71,9 @@ def test_word_tokenize(self): self.assertIsNotNone( word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX") ) + self.assertIsNotNone( + word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut") + ) self.assertIsNotNone(dict_trie(())) self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie"))) From 194851660bf7468b9d6d9604e4c2d13c3a5cfa40 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:10:42 +0700 Subject: [PATCH 32/73] Add Discussion --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5c6d35d8f..920ac227c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -36,7 +36,7 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod # Discussion -- Facebook group: https://www.facebook.com/groups/thainlp +- Facebook group (for Thai NLP Discussion only): https://www.facebook.com/groups/thainlp - GitHub issues: https://github.com/PyThaiNLP/pythainlp/issues Happy hacking! 
(; From a479b0ec7d67f8d689cb410afaf7d3518c4e01dd Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:10:54 +0700 Subject: [PATCH 33/73] Add pythainlp.benchmarks --- pythainlp/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index d5d8e13ad..76b4c0485 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -34,3 +34,4 @@ ) from pythainlp.transliterate import romanize, transliterate from pythainlp.util import collate, thai_strftime +from pythainlp.benchmarks import benchmark \ No newline at end of file From bb340e86b735ccb20045d1cf74128e3be53c0c80 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:14:58 +0700 Subject: [PATCH 34/73] resolve conflict CONTRIBUTING.md --- CONTRIBUTING.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 920ac227c..1a3f56f5e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -36,8 +36,10 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod # Discussion -- Facebook group (for Thai NLP Discussion only): https://www.facebook.com/groups/thainlp -- GitHub issues: https://github.com/PyThaiNLP/pythainlp/issues +- Facebook group (for Thai NLP Discussion only): + https://www.facebook.com/groups/thainlp +- GitHub issues (Problems and suggestions): + https://github.com/PyThaiNLP/pythainlp/issues Happy hacking! (; From 4af3d7316625d77a1de958edef06e942856abe90 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:16:45 +0700 Subject: [PATCH 35/73] resolve conflict CONTRIBUTING.md --- CONTRIBUTING.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a3f56f5e..f43233847 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -36,10 +36,8 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod # Discussion -- Facebook group (for Thai NLP Discussion only): - https://www.facebook.com/groups/thainlp -- GitHub issues (Problems and suggestions): - https://github.com/PyThaiNLP/pythainlp/issues +- Facebook group (for Thai NLP Discussion only): https://www.facebook.com/groups/thainlp +- GitHub issues (Problems and suggestions): https://github.com/PyThaiNLP/pythainlp/issues Happy hacking! 
(; From 7697ced606545d0c04aba02c95ea24a75add516f Mon Sep 17 00:00:00 2001 From: seth Date: Mon, 2 Sep 2019 11:32:00 +0700 Subject: [PATCH 36/73] correct test case for attacut --- tests/test_tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 67da3249f..e4aca4a9e 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -184,7 +184,7 @@ def test_word_tokenize_attacut(self): self.assertEqual(attacut.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"), - ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คนไทย'], + ['ฉัน', 'รัก', 'ภาษา', 'ไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย'], ) From e63136500e380ec24477e2470ff77856449ee151 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:34:58 +0700 Subject: [PATCH 37/73] Removed a modified file from pull request --- notebooks/pythainlp-get-started.ipynb | 2969 +++++++++++++++---------- notebooks/sentiment_analysis.ipynb | 2798 ++++++++++++++--------- notebooks/text_generation.ipynb | 813 +++++-- 3 files changed, 4186 insertions(+), 2394 deletions(-) diff --git a/notebooks/pythainlp-get-started.ipynb b/notebooks/pythainlp-get-started.ipynb index 956a959b5..4bae41755 100644 --- a/notebooks/pythainlp-get-started.ipynb +++ b/notebooks/pythainlp-get-started.ipynb @@ -1,1244 +1,1795 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PyThaiNLP Get Started\n", - "\n", - "Code examples for basic functions in PyThaiNLP https://github.com/PyThaiNLP/pythainlp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Thai Characters\n", - "\n", - "PyThaiNLP provides some ready-to-use Thai character set (e.g. Thai consonants, vowels, tonemarks, symbols) as a string for convenience. There are also few utility functions to test if a string is in Thai or not." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮฤฦะัาำิีึืุูเแโใไๅ็่้๊๋ฯๆฺ์ํ๎๏๚๛๐๑๒๓๔๕๖๗๘๙฿'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "colab": { + "name": "pythainlp-get-started.ipynb", + "version": "0.3.2", + "provenance": [] } - ], - "source": [ - "import pythainlp\n", - "\n", - "pythainlp.thai_characters" - ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ'" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ri5cVDAWIgp7", + "colab_type": "text" + }, + "source": [ + "# PyThaiNLP Get Started\n", + "\n", + "Code examples for basic functions in PyThaiNLP https://github.com/PyThaiNLP/pythainlp" ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.thai_consonants" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" + }, + { + "cell_type": "code", + "metadata": { + "id": "3HsfhZlwInqs", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 187 + }, + "outputId": "c4e91a7c-356c-4d07-802d-530cd62e4a6d" + }, + "source": [ + "# #uncomment if running from colab\n", + "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "# !pip install epitran\n", + "# !pip install sklearn_crfsuite" + ], + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting sklearn_crfsuite\n", + " Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (1.12.0)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.8.3)\n", + "Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)\n", + "\u001b[K |████████████████████████████████| 757kB 4.2MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm>=2.0 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (4.28.1)\n", + "Installing collected packages: python-crfsuite, sklearn-crfsuite\n", + "Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"๔\" in pythainlp.thai_digits" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A6gy4MLGIgp9", + 
"colab_type": "text" + }, + "source": [ + "## Thai Characters\n", + "\n", + "PyThaiNLP provides some ready-to-use Thai character set (e.g. Thai consonants, vowels, tonemarks, symbols) as a string for convenience. There are also few utility functions to test if a string is in Thai or not." ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pythainlp.util\n", - "\n", - "pythainlp.util.isthai(\"ก\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" + }, + { + "cell_type": "code", + "metadata": { + "id": "GAvoeZg3Igp-", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "13509870-fe94-4957-ae37-b86a677d9234" + }, + "source": [ + "import pythainlp\n", + "\n", + "pythainlp.thai_characters" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮฤฦะัาำิีึืุูเแโใไๅ็่้๊๋ฯๆฺ์ํ๎๏๚๛๐๑๒๓๔๕๖๗๘๙฿'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 3 + } ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.util.isthai(\"(ก.พ.)\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" + }, + { + "cell_type": "code", + "metadata": { + "id": "uPwx53A6IgqF", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "7693ee7c-f42f-4503-fc0a-fa2a47e5a374" + }, + "source": [ + "pythainlp.thai_consonants" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.util.isthai(\"(ก.พ.)\", ignore_chars=\".()\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "100.0" + }, + { + "cell_type": "code", + "metadata": { + "id": "5UA7Hwy_IgqI", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "9de7d50e-8499-48d9-bd2f-b025ddab9479" + }, + "source": [ + "\"๔\" in pythainlp.thai_digits" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.util.countthai(\"วันอาทิตย์ที่ 24 มีนาคม 2562\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "67.85714285714286" + }, + { + "cell_type": "code", + "metadata": { + "id": "t3NvXqYFIgqK", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "52d91e75-cfd7-4176-ff3b-a725724a8871" + }, + "source": [ + "import pythainlp.util\n", + "\n", + "pythainlp.util.isthai(\"ก\")" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + 
"execution_count": 7 + } ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.util.countthai(\"วันอาทิตย์ที่ 24 มีนาคม 2562\", ignore_chars=\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Collation\n", - "\n", - "Sorting according to Thai dictionary." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['กรรไกร', 'กระดาษ', 'ไข่', 'ค้อน', 'ผ้าไหม']" + }, + { + "cell_type": "code", + "metadata": { + "id": "sRzSQjugIgqM", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "212049ed-56d2-4b03-aef0-87d05b861ddb" + }, + "source": [ + "pythainlp.util.isthai(\"(ก.พ.)\")" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.util import collate\n", - "\n", - "thai_words = [\"ค้อน\", \"กระดาษ\", \"กรรไกร\", \"ไข่\", \"ผ้าไหม\"]\n", - "collate(thai_words)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ผ้าไหม', 'ค้อน', 'ไข่', 'กระดาษ', 'กรรไกร']" + }, + { + "cell_type": "code", + "metadata": { + "id": "DP5yfJebIgqP", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "0eca64e8-dbfc-479a-ec0c-c4da71ff3b1c" + }, + "source": [ + "pythainlp.util.isthai(\"(ก.พ.)\", ignore_chars=\".()\")" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 9 + } ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collate(thai_words, reverse=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Date and Time Format\n", - "\n", - "Get Thai day and month names with Thai Buddhist Era (B.E.).\n", - "Use formatting directives similar to datetime.strftime()." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'วันพุธที่ 6 ตุลาคม พ.ศ. 2519 เวลา 01:40 น. (พ 06-ต.ค.-19)'" + }, + { + "cell_type": "code", + "metadata": { + "id": "87Z8P9WPIgqS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "0b92019f-9773-49db-e0b0-840ba9f7d8a0" + }, + "source": [ + "pythainlp.util.countthai(\"วันอาทิตย์ที่ 24 มีนาคม 2562\")" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "100.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import datetime\n", - "from pythainlp.util import thai_strftime\n", - "\n", - "fmt = \"%Aที่ %-d %B พ.ศ. %Y เวลา %H:%M น. 
(%a %d-%b-%y)\"\n", - "date = datetime.datetime(1976, 10, 6, 1, 40)\n", - "\n", - "thai_strftime(date, fmt)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tokenization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sentence and Word\n", - "\n", - "Default word tokenizer (\"newmm\") use maximum matching algorithm." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sent_tokenize: ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย']\n", - "word_tokenize: ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' ']\n", - "word_tokenize, without whitespace: ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย']\n" - ] - } - ], - "source": [ - "from pythainlp import sent_tokenize, word_tokenize\n", - "\n", - "text = \"ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย \"\n", - "\n", - "print(\"sent_tokenize:\", sent_tokenize(text))\n", - "print(\"word_tokenize:\", word_tokenize(text))\n", - "print(\"word_tokenize, without whitespace:\", word_tokenize(text, keep_whitespace=False))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Other algorithm can be chosen. We can also create a tokenizer with custom dictionary." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "newmm: ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศ', 'ใช้แล้ว']\n", - "longest: ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศใช้', 'แล้ว']\n", - "custom: ['กฎ', 'หมายแรง', 'งาน', 'ฉบับปรับปรุงใหม่ประกาศใช้แล้ว']\n" - ] - } - ], - "source": [ - "from pythainlp import word_tokenize, Tokenizer\n", - "\n", - "text = \"กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว\"\n", - "\n", - "print(\"newmm:\", word_tokenize(text)) # default engine is \"newmm\"\n", - "print(\"longest:\", word_tokenize(text, engine=\"longest\"))\n", - "\n", - "words = [\"กฎ\", \"งาน\"]\n", - "custom_tokenizer = Tokenizer(words)\n", - "print(\"custom:\", custom_tokenizer.word_tokenize(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Default word tokenizer use a word list from pythainlp.corpus.common.thai_words().\n", - "We can get that list, add/remove words, and create new tokenizer from the modified list." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "newmm: ['ไอแซค', ' ', 'อสิ', 'มอ', 'ฟ']\n", - "custom: ['ไอแซค', ' ', 'อสิมอฟ']\n" - ] - } - ], - "source": [ - "from pythainlp.corpus.common import thai_words\n", - "from pythainlp import word_tokenize, Tokenizer\n", - "\n", - "text = \"ไอแซค อสิมอฟ\"\n", - "\n", - "print(\"newmm:\", word_tokenize(text))\n", - "\n", - "words = set(thai_words()) # thai_words() returns frozenset\n", - "words.add(\"อสิมอฟ\")\n", - "custom_tokenizer = Tokenizer(words)\n", - "print(\"custom:\", custom_tokenizer.word_tokenize(text))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "speedtest_text = \"\"\"\n", - "ครบรอบ 14 ปี ตากใบ เช้าวันนั้น 25 ต.ค. 
2547 ผู้ชุมนุมชายกว่า 1,370 คน\n", - "ถูกโยนขึ้นรถยีเอ็มซี 22 หรือ 24 คัน นอนซ้อนกันคันละ 4-5 ชั้น เดินทางจากสถานีตำรวจตากใบ ไปไกล 150 กิโลเมตร\n", - "ไปถึงค่ายอิงคยุทธบริหาร ใช้เวลากว่า 6 ชั่วโมง / ในอีกคดีที่ญาติฟ้องร้องรัฐ คดีจบลงที่การประนีประนอมยอมความ\n", - "กระทรวงกลาโหมจ่ายค่าสินไหมทดแทนรวม 42 ล้านบาทให้กับญาติผู้เสียหาย 79 ราย\n", - "ปิดหีบและนับคะแนนเสร็จแล้ว ที่หน่วยเลือกตั้งที่ 32 เขต 13 แขวงหัวหมาก เขตบางกะปิ กรุงเทพมหานคร\n", - "ผู้สมัคร ส.ส. และตัวแทนพรรคการเมืองจากหลายพรรคต่างมาเฝ้าสังเกตการนับคะแนนอย่างใกล้ชิด โดย\n", - "ฐิติภัสร์ โชติเดชาชัยนันต์ จากพรรคพลังประชารัฐ และพริษฐ์ วัชรสินธุ จากพรรคประชาธิปัตย์ได้คะแนน\n", - "96 คะแนนเท่ากัน\n", - "เช้าวันอาทิตย์ที่ 21 เมษายน 2019 ซึ่งเป็นวันอีสเตอร์ วันสำคัญของชาวคริสต์\n", - "เกิดเหตุระเบิดต่อเนื่องในโบสถ์คริสต์และโรงแรมอย่างน้อย 7 แห่งในประเทศศรีลังกา\n", - "มีผู้เสียชีวิตแล้วอย่างน้อย 156 คน และบาดเจ็บหลายร้อยคน ยังไม่มีข้อมูลว่าผู้ก่อเหตุมาจากฝ่ายใด\n", - "จีนกำหนดจัดการประชุมข้อริเริ่มสายแถบและเส้นทางในช่วงปลายสัปดาห์นี้ ปักกิ่งยืนยันว่า\n", - "อภิมหาโครงการเชื่อมโลกของจีนไม่ใช่เครื่องมือแผ่อิทธิพล แต่ยินดีรับฟังข้อวิจารณ์ เช่น ประเด็นกับดักหนี้สิน\n", - "และความไม่โปร่งใส รัฐบาลปักกิ่งบอกว่า เวทีประชุม Belt and Road Forum ในช่วงวันที่ 25-27 เมษายน\n", - "ถือเป็นงานการทูตที่สำคัญที่สุดของจีนในปี 2019\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.05 s, sys: 8.68 ms, total: 1.06 s\n", - "Wall time: 1.08 s\n" - ] - } - ], - "source": [ - "# Speed test: Calling \"longest\" engine through word_tokenize wrapper\n", - "%time tokens = word_tokenize(speedtest_text, engine=\"longest\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 11.6 ms, sys: 235 µs, total: 11.8 ms\n", - "Wall time: 11.8 ms\n" - ] - } - ], - "source": [ - "# Speed test: Calling \"newmm\" engine through word_tokenize wrapper\n", - "%time tokens = word_tokenize(speedtest_text, engine=\"newmm\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10.6 ms, sys: 562 µs, total: 11.1 ms\n", - "Wall time: 12.4 ms\n" - ] - } - ], - "source": [ - "# Speed test: Directly call \"newmm\" engine from pythainlp.tokenize.newmm\n", - "%time tokens = pythainlp.tokenize.newmm.segment(speedtest_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['มี|ความ|เป็น|ไป|ได้|อย่าง|ไร|บ้าง|',\n", - " 'มี|ความ|เป็นไป|ได้|อย่าง|ไร|บ้าง|',\n", - " 'มี|ความ|เป็นไปได้|อย่าง|ไร|บ้าง|',\n", - " 'มี|ความเป็นไป|ได้|อย่าง|ไร|บ้าง|',\n", - " 'มี|ความเป็นไปได้|อย่าง|ไร|บ้าง|',\n", - " 'มี|ความ|เป็น|ไป|ได้|อย่างไร|บ้าง|',\n", - " 'มี|ความ|เป็นไป|ได้|อย่างไร|บ้าง|',\n", - " 'มี|ความ|เป็นไปได้|อย่างไร|บ้าง|',\n", - " 'มี|ความเป็นไป|ได้|อย่างไร|บ้าง|',\n", - " 'มี|ความเป็นไปได้|อย่างไร|บ้าง|',\n", - " 'มี|ความ|เป็น|ไป|ได้|อย่างไรบ้าง|',\n", - " 'มี|ความ|เป็นไป|ได้|อย่างไรบ้าง|',\n", - " 'มี|ความ|เป็นไปได้|อย่างไรบ้าง|',\n", - " 'มี|ความเป็นไป|ได้|อย่างไรบ้าง|',\n", - " 'มี|ความเป็นไปได้|อย่างไรบ้าง|']" + }, + { + "cell_type": "code", + "metadata": { + "id": "ukSQP8ZTIgqV", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "9f0bff09-0527-45ca-9f25-65c60f286930" + }, + 
"source": [ + "pythainlp.util.countthai(\"วันอาทิตย์ที่ 24 มีนาคม 2562\", ignore_chars=\"\")" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "67.85714285714286" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get all possible segmentations\n", - "from pythainlp.tokenize.multi_cut import find_all_segment, mmcut, segment\n", - "\n", - "find_all_segment(\"มีความเป็นไปได้อย่างไรบ้าง\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Subword and Thai Character Cluster (TCC)\n", - "\n", - "According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kW89ZW-IIgqX", + "colab_type": "text" + }, + "source": [ + "## Collation\n", + "\n", + "Sorting according to Thai dictionary." ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp import subword_tokenize\n", - "\n", - "subword_tokenize(\"ประเทศไทย\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Low-level TCC operations" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']" + }, + { + "cell_type": "code", + "metadata": { + "id": "hT1Pj52bIgqY", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "b948f6ce-ee51-4f3e-cdad-43b3957155e0" + }, + "source": [ + "from pythainlp.util import collate\n", + "\n", + "thai_words = [\"ค้อน\", \"กระดาษ\", \"กรรไกร\", \"ไข่\", \"ผ้าไหม\"]\n", + "collate(thai_words)" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['กรรไกร', 'กระดาษ', 'ไข่', 'ค้อน', 'ผ้าไหม']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 12 + } ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.tokenize import tcc\n", - "\n", - "tcc.segment(\"ประเทศไทย\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{1, 3, 5, 6, 8, 9}" + }, + { + "cell_type": "code", + "metadata": { + "id": "XgWpZM8hIgqb", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "d9003bd2-e0ee-47c7-aa67-f498e4f47578" + }, + "source": [ + "collate(thai_words, reverse=True)" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ผ้าไหม', 'ค้อน', 'ไข่', 'กระดาษ', 'กรรไกร']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tcc.tcc_pos(\"ประเทศไทย\") # return positions" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ป-ระ-เท-ศ-ไท-ย-" 
- ] - } - ], - "source": [ - "for ch in tcc.tcc(\"ประเทศไทย\"): # generator\n", - " print(ch, end='-')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Transliteration" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'maeo'" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g-czYhLoIgqd", + "colab_type": "text" + }, + "source": [ + "## Date and Time Format\n", + "\n", + "Get Thai day and month names with Thai Buddhist Era (B.E.).\n", + "Use formatting directives similar to datetime.strftime()." ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.transliterate import romanize\n", - "\n", - "romanize(\"แมว\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'mɛːw'" + }, + { + "cell_type": "code", + "metadata": { + "id": "F03_rMWzIgqe", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "ffeda738-0926-4439-d3a0-14869d0d59db" + }, + "source": [ + "import datetime\n", + "from pythainlp.util import thai_strftime\n", + "\n", + "fmt = \"%Aที่ %-d %B พ.ศ. %Y เวลา %H:%M น. (%a %d-%b-%y)\"\n", + "date = datetime.datetime(1976, 10, 6, 1, 40)\n", + "\n", + "thai_strftime(date, fmt)" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'วันพุธที่ 6 ตุลาคม พ.ศ. 2519 เวลา 01:40 น. (พ 06-ต.ค.-19)'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 14 + } ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.transliterate import transliterate\n", - "\n", - "transliterate(\"แมว\")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "#!pip3 install pythainlp[icu]\n", - "#transliterate(\"แมว\", engine=\"icu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Normalization" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8VFPOHyZIgqh", + "colab_type": "text" + }, + "source": [ + "## Tokenization" ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.util import normalize\n", - "\n", - "normalize(\"เเปลก\") == \"แปลก\" # เ เ ป ล ก vs แปลก" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Soundex\n", - "\n", - "\"Soundex is a phonetic algorithm for indexing names by sound.\" ([Wikipedia](https://en.wikipedia.org/wiki/Soundex)). PyThaiNLP provides three kinds of Thai soundex." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "from pythainlp.soundex import lk82, metasound, udom83\n", - "\n", - "# check equivalence\n", - "print(lk82(\"รถ\") == lk82(\"รด\"))\n", - "print(udom83(\"วรร\") == udom83(\"วัน\"))\n", - "print(metasound(\"นพ\") == metasound(\"นภ\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "บูรณะ - lk82: บE400 - udom83: บ930000 - metasound: บ550\n", - "บูรณการ - lk82: บE419 - udom83: บ931900 - metasound: บ551\n", - "มัก - lk82: ม1000 - udom83: ม100000 - metasound: ม100\n", - "มัค - lk82: ม1000 - udom83: ม100000 - metasound: ม100\n", - "มรรค - lk82: ม1000 - udom83: ม310000 - metasound: ม551\n", - "ลัก - lk82: ร1000 - udom83: ร100000 - metasound: ล100\n", - "รัก - lk82: ร1000 - udom83: ร100000 - metasound: ร100\n", - "รักษ์ - lk82: ร1000 - udom83: ร100000 - metasound: ร100\n", - " - lk82: - udom83: - metasound: \n" - ] - } - ], - "source": [ - "texts = [\"บูรณะ\", \"บูรณการ\", \"มัก\", \"มัค\", \"มรรค\", \"ลัก\", \"รัก\", \"รักษ์\", \"\"]\n", - "for text in texts:\n", - " print(\n", - " \"{} - lk82: {} - udom83: {} - metasound: {}\".format(\n", - " text, lk82(text), udom83(text), metasound(text)\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Spellchecking\n", - "\n", - "Default spellchecker uses [Peter Norvig's algorithm](http://www.norvig.com/spell-correct.html) together with word frequency from Thai National Corpus (TNC)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['เหลียม', 'เหลือม']" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SklPJ-DbIgqi", + "colab_type": "text" + }, + "source": [ + "### Sentence and Word\n", + "\n", + "Default word tokenizer (\"newmm\") use maximum matching algorithm." 
] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp import spell\n", - "\n", - "# list possible spellings\n", - "spell(\"เหลืยม\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'เหลียม'" + }, + { + "cell_type": "code", + "metadata": { + "id": "JEbY-MGCIgqi", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "outputId": "ce82fcbe-117f-4e12-db86-f01b4ea988e4" + }, + "source": [ + "from pythainlp import sent_tokenize, word_tokenize\n", + "\n", + "text = \"ฉันรักภาษาไทย เพราะฉันใช้ภาษาไทย \"\n", + "\n", + "print(\"sent_tokenize:\", sent_tokenize(text))\n", + "print(\"word_tokenize:\", word_tokenize(text))\n", + "print(\"word_tokenize, without whitespace:\", word_tokenize(text, keep_whitespace=False))" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "sent_tokenize: ['ฉันรักภาษาไทย', 'เพราะฉันใช้ภาษาไทย']\n", + "word_tokenize: ['ฉัน', 'รัก', 'ภาษาไทย', ' ', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย', ' ']\n", + "word_tokenize, without whitespace: ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'ใช้', 'ภาษาไทย']\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp import correct\n", - "\n", - "# choose the most likely spelling\n", - "correct(\"เหลืยม\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Spellchecking - Custom dictionary and word frequency\n", - "\n", - "Custom dictionary can be provided when creating spellchecker." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['เหลือม']\n", - "เหลือม\n" - ] - } - ], - "source": [ - "from pythainlp.corpus import ttc # Thai Textbook Corpus\n", - "from pythainlp.spell import NorvigSpellChecker\n", - "\n", - "checker = NorvigSpellChecker(custom_dict=ttc.word_freqs())\n", - "print(checker.spell(\"เหลืยม\"))\n", - "print(checker.correct(\"เหลืยม\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('ดวงๆ', 3),\n", - " ('กระพือ', 6),\n", - " ('อุปสมบท', 17),\n", - " ('หาเช้ากินค่ำ', 14),\n", - " ('จะเห็นได้ว่า', 152),\n", - " ('ยวด', 2),\n", - " ('กล่อม', 182),\n", - " ('เต้า', 37),\n", - " ('จัว', 2)]" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e5P_YygrIgqm", + "colab_type": "text" + }, + "source": [ + "Other algorithm can be chosen. We can also create a tokenizer with custom dictionary." ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(checker.dictionary())[1:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also apply conditions and filter function to dictionary when creating spellchecker." 
- ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "39977" + }, + { + "cell_type": "code", + "metadata": { + "id": "mI_Qz3k3Igqm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "outputId": "2d10dc44-fc8d-4c4d-8526-3b5abe9494d5" + }, + "source": [ + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text)) # default engine is \"newmm\"\n", + "print(\"longest:\", word_tokenize(text, engine=\"longest\"))\n", + "\n", + "words = [\"กฎ\", \"งาน\"]\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ], + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "text": [ + "newmm: ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศ', 'ใช้แล้ว']\n", + "longest: ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศใช้', 'แล้ว']\n", + "custom: ['กฎ', 'หมายแรง', 'งาน', 'ฉบับปรับปรุงใหม่ประกาศใช้แล้ว']\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "checker = NorvigSpellChecker() # use default filter (remove any word with number or non-Thai character)\n", - "len(checker.dictionary())" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "30379" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zIXUxXlTIgqo", + "colab_type": "text" + }, + "source": [ + "Default word tokenizer use a word list from pythainlp.corpus.common.thai_words().\n", + "We can get that list, add/remove words, and create new tokenizer from the modified list." 
] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "checker = NorvigSpellChecker(min_freq=5, min_len=2, max_len=15)\n", - "len(checker.dictionary())" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "76706" + }, + { + "cell_type": "code", + "metadata": { + "id": "RblqNckGIgqp", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "b0c50208-55ce-4f63-8e99-8f98bbd31733" + }, + "source": [ + "from pythainlp.corpus.common import thai_words\n", + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"ไอแซค อสิมอฟ\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text))\n", + "\n", + "words = set(thai_words()) # thai_words() returns frozenset\n", + "words.add(\"อสิมอฟ\")\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "text": [ + "newmm: ['ไอแซค', ' ', 'อสิ', 'มอ', 'ฟ']\n", + "custom: ['ไอแซค', ' ', 'อสิมอฟ']\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "checker_no_filter = NorvigSpellChecker(dict_filter=None) # use no filter\n", - "len(checker_no_filter.dictionary())" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "76700" + }, + { + "cell_type": "code", + "metadata": { + "id": "4L2kRMY5Igqr", + "colab_type": "code", + "colab": {} + }, + "source": [ + "speedtest_text = \"\"\"\n", + "ครบรอบ 14 ปี ตากใบ เช้าวันนั้น 25 ต.ค. 2547 ผู้ชุมนุมชายกว่า 1,370 คน\n", + "ถูกโยนขึ้นรถยีเอ็มซี 22 หรือ 24 คัน นอนซ้อนกันคันละ 4-5 ชั้น เดินทางจากสถานีตำรวจตากใบ ไปไกล 150 กิโลเมตร\n", + "ไปถึงค่ายอิงคยุทธบริหาร ใช้เวลากว่า 6 ชั่วโมง / ในอีกคดีที่ญาติฟ้องร้องรัฐ คดีจบลงที่การประนีประนอมยอมความ\n", + "กระทรวงกลาโหมจ่ายค่าสินไหมทดแทนรวม 42 ล้านบาทให้กับญาติผู้เสียหาย 79 ราย\n", + "ปิดหีบและนับคะแนนเสร็จแล้ว ที่หน่วยเลือกตั้งที่ 32 เขต 13 แขวงหัวหมาก เขตบางกะปิ กรุงเทพมหานคร\n", + "ผู้สมัคร ส.ส. 
และตัวแทนพรรคการเมืองจากหลายพรรคต่างมาเฝ้าสังเกตการนับคะแนนอย่างใกล้ชิด โดย\n", + "ฐิติภัสร์ โชติเดชาชัยนันต์ จากพรรคพลังประชารัฐ และพริษฐ์ วัชรสินธุ จากพรรคประชาธิปัตย์ได้คะแนน\n", + "96 คะแนนเท่ากัน\n", + "เช้าวันอาทิตย์ที่ 21 เมษายน 2019 ซึ่งเป็นวันอีสเตอร์ วันสำคัญของชาวคริสต์\n", + "เกิดเหตุระเบิดต่อเนื่องในโบสถ์คริสต์และโรงแรมอย่างน้อย 7 แห่งในประเทศศรีลังกา\n", + "มีผู้เสียชีวิตแล้วอย่างน้อย 156 คน และบาดเจ็บหลายร้อยคน ยังไม่มีข้อมูลว่าผู้ก่อเหตุมาจากฝ่ายใด\n", + "จีนกำหนดจัดการประชุมข้อริเริ่มสายแถบและเส้นทางในช่วงปลายสัปดาห์นี้ ปักกิ่งยืนยันว่า\n", + "อภิมหาโครงการเชื่อมโลกของจีนไม่ใช่เครื่องมือแผ่อิทธิพล แต่ยินดีรับฟังข้อวิจารณ์ เช่น ประเด็นกับดักหนี้สิน\n", + "และความไม่โปร่งใส รัฐบาลปักกิ่งบอกว่า เวทีประชุม Belt and Road Forum ในช่วงวันที่ 25-27 เมษายน\n", + "ถือเป็นงานการทูตที่สำคัญที่สุดของจีนในปี 2019\n", + "\"\"\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qMF_0xyOIgqs", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "7b914ac8-456f-4af7-f62d-e4cdf61409aa" + }, + "source": [ + "# Speed test: Calling \"longest\" engine through word_tokenize wrapper\n", + "%time tokens = word_tokenize(speedtest_text, engine=\"longest\")" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 842 ms, sys: 1.15 ms, total: 843 ms\n", + "Wall time: 849 ms\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def remove_yamok(word):\n", - " return False if \"ๆ\" in word else True\n", - "\n", - "checker_custom_filter = NorvigSpellChecker(dict_filter=remove_yamok) # use custom filter\n", - "len(checker_custom_filter.dictionary())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Part-of-Speech Tagging" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('การ', 'FIXN'), ('เดินทาง', 'VACT')]" + }, + { + "cell_type": "code", + "metadata": { + "id": "NlCSHylIIgqv", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "c270e307-6804-4dc6-93a4-5b64776d01e7" + }, + "source": [ + "# Speed test: Calling \"newmm\" engine through word_tokenize wrapper\n", + "%time tokens = word_tokenize(speedtest_text, engine=\"newmm\")" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 10.5 ms, sys: 0 ns, total: 10.5 ms\n", + "Wall time: 10.4 ms\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.tag import pos_tag, pos_tag_sents\n", - "\n", - "pos_tag([\"การ\",\"เดินทาง\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[('ประกาศสำนักนายกฯ', 'NCMN'),\n", - " (' ', 'PUNC'),\n", - " ('ให้', 'JSBR'),\n", - " (' ', 'PUNC'),\n", - " (\"'พล.ท.สรรเสริญ แก้วกำเนิด'\", 'NCMN'),\n", - " (' ', 'PUNC'),\n", - " ('พ้นจากตำแหน่ง', 'NCMN'),\n", - " (' ', 'PUNC'),\n", - " ('ผู้ทรงคุณวุฒิพิเศษ', 'NCMN'),\n", - " ('กองทัพบก', 'NCMN'),\n", - " (' ', 'PUNC'),\n", - " ('กระทรวงกลาโหม', 'NCMN')],\n", - " [('และ', 'JCRG'),\n", - " ('แต่งตั้ง', 'VACT'),\n", - " ('ให้', 'JSBR'),\n", - " ('เป็น', 'VSTA'),\n", - " (\"'อธิบดีกรมประชาสัมพันธ์'\", 'NCMN')]]" + }, + { + "cell_type": 
"code", + "metadata": { + "id": "SaSlNna8Igqx", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "b87c8dd1-2e86-4b81-807b-4c75ee87217d" + }, + "source": [ + "# Speed test: Directly call \"newmm\" engine from pythainlp.tokenize.newmm\n", + "%time tokens = pythainlp.tokenize.newmm.segment(speedtest_text)" + ], + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 6.36 ms, sys: 0 ns, total: 6.36 ms\n", + "Wall time: 6.83 ms\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sents = [[\"ประกาศสำนักนายกฯ\", \" \", \"ให้\",\n", - " \" \", \"'พล.ท.สรรเสริญ แก้วกำเนิด'\", \" \", \"พ้นจากตำแหน่ง\",\n", - " \" \", \"ผู้ทรงคุณวุฒิพิเศษ\", \"กองทัพบก\", \" \", \"กระทรวงกลาโหม\"],\n", - " [\"และ\", \"แต่งตั้ง\", \"ให้\", \"เป็น\", \"'อธิบดีกรมประชาสัมพันธ์'\"]]\n", - "\n", - "pos_tag_sents(sents)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Named-Entity Tagging\n", - "\n", - "The tagger use BIO scheme:\n", - "- B - beginning of entity\n", - "- I - inside entity\n", - "- O - outside entity" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('15', 'NUM', 'B-DATE'),\n", - " (' ', 'PUNCT', 'I-DATE'),\n", - " ('ก.ย.', 'NOUN', 'I-DATE'),\n", - " (' ', 'PUNCT', 'I-DATE'),\n", - " ('61', 'NUM', 'I-DATE'),\n", - " (' ', 'PUNCT', 'O'),\n", - " ('ทดสอบ', 'VERB', 'O'),\n", - " ('ระบบ', 'NOUN', 'O'),\n", - " ('เวลา', 'NOUN', 'O'),\n", - " (' ', 'PUNCT', 'O'),\n", - " ('14', 'NOUN', 'B-TIME'),\n", - " (':', 'PUNCT', 'I-TIME'),\n", - " ('49', 'NUM', 'I-TIME'),\n", - " (' ', 'PUNCT', 'I-TIME'),\n", - " ('น.', 'NOUN', 'I-TIME'),\n", - " (' ', 'PUNCT', 'O'),\n", - " ('เดินทาง', 'VERB', 'O'),\n", - " ('จาก', 'ADP', 'O'),\n", - " ('กทม.', 'NOUN', 'B-LOCATION'),\n", - " ('ไป', 'AUX', 'O'),\n", - " ('จังหวัด', 'NOUN', 'B-LOCATION'),\n", - " ('กำแพงเพชร', 'NOUN', 'I-LOCATION'),\n", - " (' ', 'PUNCT', 'I-MONEY'),\n", - " ('ตั๋ว', 'NOUN', 'I-MONEY'),\n", - " ('ราคา', 'NOUN', 'I-MONEY'),\n", - " (' ', 'PUNCT', 'I-MONEY'),\n", - " ('297', 'NUM', 'I-MONEY'),\n", - " (' ', 'PUNCT', 'I-MONEY'),\n", - " ('บาท', 'NOUN', 'I-MONEY')]" + }, + { + "cell_type": "code", + "metadata": { + "id": "qFTYqAB1Igq1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "outputId": "607ea178-c070-48c6-c12e-1a62c09f8847" + }, + "source": [ + "# Get all possible segmentations\n", + "from pythainlp.tokenize.multi_cut import find_all_segment, mmcut, segment\n", + "\n", + "find_all_segment(\"มีความเป็นไปได้อย่างไรบ้าง\")" + ], + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['มี|ความ|เป็น|ไป|ได้|อย่าง|ไร|บ้าง|',\n", + " 'มี|ความ|เป็นไป|ได้|อย่าง|ไร|บ้าง|',\n", + " 'มี|ความ|เป็นไปได้|อย่าง|ไร|บ้าง|',\n", + " 'มี|ความเป็นไป|ได้|อย่าง|ไร|บ้าง|',\n", + " 'มี|ความเป็นไปได้|อย่าง|ไร|บ้าง|',\n", + " 'มี|ความ|เป็น|ไป|ได้|อย่างไร|บ้าง|',\n", + " 'มี|ความ|เป็นไป|ได้|อย่างไร|บ้าง|',\n", + " 'มี|ความ|เป็นไปได้|อย่างไร|บ้าง|',\n", + " 'มี|ความเป็นไป|ได้|อย่างไร|บ้าง|',\n", + " 'มี|ความเป็นไปได้|อย่างไร|บ้าง|',\n", + " 'มี|ความ|เป็น|ไป|ได้|อย่างไรบ้าง|',\n", + " 'มี|ความ|เป็นไป|ได้|อย่างไรบ้าง|',\n", + " 'มี|ความ|เป็นไปได้|อย่างไรบ้าง|',\n", + " 'มี|ความเป็นไป|ได้|อย่างไรบ้าง|',\n", + " 'มี|ความเป็นไปได้|อย่างไรบ้าง|']" + ] + }, + "metadata": { + 
"tags": [] + }, + "execution_count": 22 + } ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.tag.named_entity import ThaiNameTagger\n", - "\n", - "ner = ThaiNameTagger()\n", - "ner.get_ner(\"15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น. เดินทางจากกทม.ไปจังหวัดกำแพงเพชร ตั๋วราคา 297 บาท\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Word Vector" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English\n", - "INFO:gensim.models.utils_any2vec:loading projection weights from /Users/arthit/pythainlp-data/thai2vec.bin\n", - "INFO:gensim.models.utils_any2vec:loaded (60001, 400) matrix from /Users/arthit/pythainlp-data/thai2vec.bin\n", - "/usr/local/lib/python3.7/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", - " if np.issubdtype(vec.dtype, np.int):\n" - ] - }, - { - "data": { - "text/plain": [ - "0.99259853" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fWXiNUoBIgq4", + "colab_type": "text" + }, + "source": [ + "### Subword and Thai Character Cluster (TCC)\n", + "\n", + "According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)." ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pythainlp.word_vector\n", - "\n", - "pythainlp.word_vector.similarity(\"คน\", \"มนุษย์\")" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors\n" - ] - }, - { - "data": { - "text/plain": [ - "'แมว'" + }, + { + "cell_type": "code", + "metadata": { + "id": "Z1wdMmdmIgq6", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "c74fa03a-a288-47b8-9c49-7ed42c3545c9" + }, + "source": [ + "from pythainlp import subword_tokenize\n", + "\n", + "subword_tokenize(\"ประเทศไทย\")" + ], + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 23 + } ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pythainlp.word_vector.doesnt_match([\"คน\", \"มนุษย์\", \"บุคคล\", \"เจ้าหน้าที่\", \"แมว\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number Spell Out" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'หนึ่งล้านสองแสนสามหมื่นสี่พันห้าร้อยหกสิบเจ็ดล้านแปดแสนเก้าหมื่นหนึ่งร้อยยี่สิบสามบาทสี่สิบห้าสตางค์'" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RfO1WbneIgq9", + "colab_type": "text" + }, + "source": [ + "Low-level TCC operations" ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pythainlp.util import 
bahttext\n", - "\n", - "bahttext(1234567890123.45)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'หนึ่งบาทเก้าสิบเอ็ดสตางค์'" + }, + { + "cell_type": "code", + "metadata": { + "id": "3Gyig20XIgq-", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "da9484b7-4dc4-4159-9b5f-df2771805ac9" + }, + "source": [ + "from pythainlp.tokenize import tcc\n", + "\n", + "tcc.segment(\"ประเทศไทย\")" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "cF-zQJU1IgrA", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "fd966eb3-fd8b-46b6-ea61-93427431cdf9" + }, + "source": [ + "tcc.tcc_pos(\"ประเทศไทย\") # return positions" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{1, 3, 5, 6, 8, 9}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aL2PiPUvIgrE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "a04d64f4-b174-4337-8859-ecac7e660f29" + }, + "source": [ + "for ch in tcc.tcc(\"ประเทศไทย\"): # generator\n", + " print(ch, end='-')" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + "ป-ระ-เท-ศ-ไท-ย-" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jxvgbdlhIgrG", + "colab_type": "text" + }, + "source": [ + "## Transliteration" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ujAsMHwyIgrH", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "4a14e9df-9699-4ede-848b-a280ac0ba5d1" + }, + "source": [ + "from pythainlp.transliterate import romanize\n", + "\n", + "romanize(\"แมว\")" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'maeo'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LlDosHqXIgrJ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "a7a88a3a-a04e-4e90-aef4-c6b357ab04c7" + }, + "source": [ + "from pythainlp.transliterate import transliterate\n", + "\n", + "transliterate(\"แมว\")" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'mɛːw'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "m5A6dPOWIgrL", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#!pip3 install pythainlp[icu]\n", + "#transliterate(\"แมว\", engine=\"icu\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4UwQtF3oIgrM", + "colab_type": "text" + }, + "source": [ + "## Normalization" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WXPq5bqfIgrN", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "42a7a5be-7d7d-4997-811c-424c79ce3169" + }, 
+ "source": [ + "from pythainlp.util import normalize\n", + "\n", + "normalize(\"เเปลก\") == \"แปลก\" # เ เ ป ล ก vs แปลก" + ], + "execution_count": 32, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 32 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rlDji6ecIgrP", + "colab_type": "text" + }, + "source": [ + "## Soundex\n", + "\n", + "\"Soundex is a phonetic algorithm for indexing names by sound.\" ([Wikipedia](https://en.wikipedia.org/wiki/Soundex)). PyThaiNLP provides three kinds of Thai soundex." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I4JyUCRJIgrP", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "outputId": "6af3c11c-3f9a-4154-b7f2-c899312846dc" + }, + "source": [ + "from pythainlp.soundex import lk82, metasound, udom83\n", + "\n", + "# check equivalence\n", + "print(lk82(\"รถ\") == lk82(\"รด\"))\n", + "print(udom83(\"วรร\") == udom83(\"วัน\"))\n", + "print(metasound(\"นพ\") == metasound(\"นภ\"))" + ], + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XTznoTg5IgrS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + }, + "outputId": "8178cd7b-735d-4ccc-c36b-a8c67ea2ddb2" + }, + "source": [ + "texts = [\"บูรณะ\", \"บูรณการ\", \"มัก\", \"มัค\", \"มรรค\", \"ลัก\", \"รัก\", \"รักษ์\", \"\"]\n", + "for text in texts:\n", + " print(\n", + " \"{} - lk82: {} - udom83: {} - metasound: {}\".format(\n", + " text, lk82(text), udom83(text), metasound(text)\n", + " )\n", + " )" + ], + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "text": [ + "บูรณะ - lk82: บE400 - udom83: บ930000 - metasound: บ550\n", + "บูรณการ - lk82: บE419 - udom83: บ931900 - metasound: บ551\n", + "มัก - lk82: ม1000 - udom83: ม100000 - metasound: ม100\n", + "มัค - lk82: ม1000 - udom83: ม100000 - metasound: ม100\n", + "มรรค - lk82: ม1000 - udom83: ม310000 - metasound: ม551\n", + "ลัก - lk82: ร1000 - udom83: ร100000 - metasound: ล100\n", + "รัก - lk82: ร1000 - udom83: ร100000 - metasound: ร100\n", + "รักษ์ - lk82: ร1000 - udom83: ร100000 - metasound: ร100\n", + " - lk82: - udom83: - metasound: \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "spFQD8QsIgrT", + "colab_type": "text" + }, + "source": [ + "## Spellchecking\n", + "\n", + "Default spellchecker uses [Peter Norvig's algorithm](http://www.norvig.com/spell-correct.html) together with word frequency from Thai National Corpus (TNC)" ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "metadata": { + "id": "GAz0q6lWIgrU", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "73427202-cdfe-47d9-8925-9596baafd9d3" + }, + "source": [ + "from pythainlp import spell\n", + "\n", + "# list possible spellings\n", + "spell(\"เหลืยม\")" + ], + "execution_count": 35, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['เหลียม', 'เหลือม']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "I_fDSYEmIgrV", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + 
"height": 34 + }, + "outputId": "e9b6f2eb-37b6-4189-8cfd-1273caf48f38" + }, + "source": [ + "from pythainlp import correct\n", + "\n", + "# choose the most likely spelling\n", + "correct(\"เหลืยม\")" + ], + "execution_count": 36, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'เหลียม'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N-dOvexUIgrX", + "colab_type": "text" + }, + "source": [ + "## Spellchecking - Custom dictionary and word frequency\n", + "\n", + "Custom dictionary can be provided when creating spellchecker." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ixx-8YtfIgrY", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "0dbcf0dc-287e-4c00-f005-c371c81211cd" + }, + "source": [ + "from pythainlp.corpus import ttc # Thai Textbook Corpus\n", + "from pythainlp.spell import NorvigSpellChecker\n", + "\n", + "checker = NorvigSpellChecker(custom_dict=ttc.word_freqs())\n", + "print(checker.spell(\"เหลืยม\"))\n", + "print(checker.correct(\"เหลืยม\"))" + ], + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['เหลือม']\n", + "เหลือม\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "H7TxgdwbIgra", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + }, + "outputId": "3709f50f-3541-41d1-d8c6-7dfd090f3c1f" + }, + "source": [ + "list(checker.dictionary())[1:10]" + ], + "execution_count": 38, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('ข้ออ้าง', 2),\n", + " ('ชนิด', 1744),\n", + " ('ด้นดั้น', 2),\n", + " ('ตบ', 112),\n", + " ('ล้วนๆ', 15),\n", + " ('ลำธาร', 281),\n", + " ('เชือด', 36),\n", + " ('กระดุม', 13),\n", + " ('โกสุม', 2)]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 38 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "70sKEAlGIgrc", + "colab_type": "text" + }, + "source": [ + "We can also apply conditions and filter function to dictionary when creating spellchecker." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gT8G4cFzIgrc", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "ad4dd927-4c65-4164-a6cb-d123a08c9ee2" + }, + "source": [ + "checker = NorvigSpellChecker() # use default filter (remove any word with number or non-Thai character)\n", + "len(checker.dictionary())" + ], + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "39963" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "w6qI7M92Igre", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "862b3111-e83d-4662-f643-e013e2fc8cd5" + }, + "source": [ + "checker = NorvigSpellChecker(min_freq=5, min_len=2, max_len=15)\n", + "len(checker.dictionary())" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "30376" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "cTkFjK8IIgrh", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "7c2e3d09-aa49-4ee0-edfa-49fd4876a968" + }, + "source": [ + "checker_no_filter = NorvigSpellChecker(dict_filter=None) # use no filter\n", + "len(checker_no_filter.dictionary())" + ], + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "66209" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "70ZHCbBQIgrm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "0dc68873-9e46-4578-bd22-aa3fc1cfb198" + }, + "source": [ + "def remove_yamok(word):\n", + " return False if \"ๆ\" in word else True\n", + "\n", + "checker_custom_filter = NorvigSpellChecker(dict_filter=remove_yamok) # use custom filter\n", + "len(checker_custom_filter.dictionary())" + ], + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "66204" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 42 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1hoODyDrIgro", + "colab_type": "text" + }, + "source": [ + "## Part-of-Speech Tagging" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "39JixRHsIgro", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "cff28d16-3fa7-4b66-df2e-beef601ec41d" + }, + "source": [ + "from pythainlp.tag import pos_tag, pos_tag_sents\n", + "\n", + "pos_tag([\"การ\",\"เดินทาง\"])" + ], + "execution_count": 43, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('การ', 'FIXN'), ('เดินทาง', 'VACT')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 43 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qrSDelkrIgrq", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 306 + }, + "outputId": "8cce2c89-7599-4020-b5a6-771c0fa0c005" + }, + "source": [ + "sents = [[\"ประกาศสำนักนายกฯ\", \" \", \"ให้\",\n", + " \" \", \"'พล.ท.สรรเสริญ แก้วกำเนิด'\", \" \", \"พ้นจากตำแหน่ง\",\n", + " \" \", \"ผู้ทรงคุณวุฒิพิเศษ\", \"กองทัพบก\", \" \", \"กระทรวงกลาโหม\"],\n", + " [\"และ\", 
\"แต่งตั้ง\", \"ให้\", \"เป็น\", \"'อธิบดีกรมประชาสัมพันธ์'\"]]\n", + "\n", + "pos_tag_sents(sents)" + ], + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[('ประกาศสำนักนายกฯ', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('ให้', 'JSBR'),\n", + " (' ', 'PUNC'),\n", + " (\"'พล.ท.สรรเสริญ แก้วกำเนิด'\", 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('พ้นจากตำแหน่ง', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('ผู้ทรงคุณวุฒิพิเศษ', 'NCMN'),\n", + " ('กองทัพบก', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('กระทรวงกลาโหม', 'NCMN')],\n", + " [('และ', 'JCRG'),\n", + " ('แต่งตั้ง', 'VACT'),\n", + " ('ให้', 'JSBR'),\n", + " ('เป็น', 'VSTA'),\n", + " (\"'อธิบดีกรมประชาสัมพันธ์'\", 'NCMN')]]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 44 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6ShDKpHIgrs", + "colab_type": "text" + }, + "source": [ + "## Named-Entity Tagging\n", + "\n", + "The tagger use BIO scheme:\n", + "- B - beginning of entity\n", + "- I - inside entity\n", + "- O - outside entity" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TVso09S7Igrv", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 561 + }, + "outputId": "f801ac2c-d013-4243-ba0e-b8f88bc69efd" + }, + "source": [ + "from pythainlp.tag.named_entity import ThaiNameTagger\n", + "\n", + "ner = ThaiNameTagger()\n", + "ner.get_ner(\"15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น. เดินทางจากกทม.ไปจังหวัดกำแพงเพชร ตั๋วราคา 297 บาท\")" + ], + "execution_count": 47, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Download: thainer-1-2\n", + "thainer-1-2 1.2\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "100%|██████████| 1886304/1886304 [00:00<00:00, 20387384.78it/s]\n" + ], + "name": "stderr" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('15', 'NUM', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('ก.ย.', 'NOUN', 'B-DATE'),\n", + " (' ', 'PUNCT', 'I-DATE'),\n", + " ('61', 'NUM', 'I-DATE'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('ทดสอบ', 'VERB', 'O'),\n", + " ('ระบบ', 'NOUN', 'O'),\n", + " ('เวลา', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('14', 'NOUN', 'B-TIME'),\n", + " (':', 'PUNCT', 'I-TIME'),\n", + " ('49', 'NUM', 'I-TIME'),\n", + " (' ', 'PUNCT', 'I-TIME'),\n", + " ('น.', 'NOUN', 'I-TIME'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('เดินทาง', 'VERB', 'O'),\n", + " ('จาก', 'ADP', 'O'),\n", + " ('กทม.', 'NOUN', 'B-LOCATION'),\n", + " ('ไป', 'AUX', 'O'),\n", + " ('จังหวัด', 'NOUN', 'B-LOCATION'),\n", + " ('กำแพงเพชร', 'NOUN', 'I-LOCATION'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('ตั๋ว', 'NOUN', 'O'),\n", + " ('ราคา', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('297', 'NUM', 'B-MONEY'),\n", + " (' ', 'PUNCT', 'I-MONEY'),\n", + " ('บาท', 'NOUN', 'I-MONEY')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 47 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6cF88wN2Igry", + "colab_type": "text" + }, + "source": [ + "## Word Vector" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GshCfJiBIgrz", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "outputId": "921340b3-4962-41a5-f550-f463360fb3b8" + }, + "source": [ + "import pythainlp.word_vector\n", + "\n", + "pythainlp.word_vector.similarity(\"คน\", \"มนุษย์\")" + ], + "execution_count": 48, + "outputs": [ + { + "output_type": "stream", + "text": [ + 
"INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Download: thai2fit_wv\n", + "thai2fit_wv 0.1\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "100%|██████████| 62452646/62452646 [00:01<00:00, 45562176.71it/s]\n", + "INFO:gensim.models.utils_any2vec:loading projection weights from /root/pythainlp-data/thai2vec.bin\n", + "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n", + "INFO:gensim.models.utils_any2vec:loaded (51358, 300) matrix from /root/pythainlp-data/thai2vec.bin\n", + "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ], + "name": "stderr" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.2504981" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qJP9As-_Igr0", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 122 + }, + "outputId": "7f528d29-0edf-4b3c-9c31-138d7b85e83a" + }, + "source": [ + "pythainlp.word_vector.doesnt_match([\"คน\", \"มนุษย์\", \"บุคคล\", \"เจ้าหน้าที่\", \"ไก่\"])" + ], + "execution_count": 50, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/gensim/models/keyedvectors.py:895: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n", + " vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n", + "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ], + "name": "stderr" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'ไก่'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 50 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iS7iwPoiIgr3", + "colab_type": "text" + }, + "source": [ + "## Number Spell Out" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "F9PEEvWLIgr4", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "a5782efd-aceb-4c5e-d746-df69ba9cad8d" + }, + "source": [ + "from pythainlp.util import bahttext\n", + "\n", + "bahttext(1234567890123.45)" + ], + "execution_count": 52, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'หนึ่งล้านสองแสนสามหมื่นสี่พันห้าร้อยหกสิบเจ็ดล้านแปดแสนเก้าหมื่นหนึ่งร้อยยี่สิบสามบาทสี่สิบห้าสตางค์'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 52 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y6DLJYOEIgr7", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "eac48468-ab8b-4e67-acad-5d6560a18979" + }, + "source": [ + "# bahttext() will round the satang part\n", + "bahttext(1.909)" + ], + "execution_count": 53, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'หนึ่งบาทเก้าสิบเอ็ดสตางค์'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 53 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dPnFMQ2QIgr8", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] } - ], - "source": [ - "# bahttext() will round the satang part\n", - "bahttext(1.909)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + ] +} \ No newline at end of file diff --git a/notebooks/sentiment_analysis.ipynb b/notebooks/sentiment_analysis.ipynb index a1ab56694..c1af928cb 100644 --- a/notebooks/sentiment_analysis.ipynb +++ b/notebooks/sentiment_analysis.ipynb @@ -1,999 +1,1805 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sentiment Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook details the steps taken to create a sentiment analyzer using data from [WISESIGHT Sentiment Analysis](https://www.kaggle.com/c/wisesight-sentiment/) competition. Competition metric is overall accuracy across `neg`ative, `pos`itive, `neu`tral and `q`uestion classes. 
We give examples using logistic regression and ULMFit.\n", - "\n", - "The results for logistic regression, FastText, ULMFit, ULMFit with semi-supervised data are as follows:\n", - "\n", - "| Model | Public Accuracy | Private Accuracy |\n", - "|---------------------|-----------------|------------------|\n", - "| Logistic Regression | 0.72781 | 0.7499 |\n", - "| FastText | 0.63144 | 0.6131 |\n", - "| ULMFit | 0.71259 | 0.74194 |\n", - "| ULMFit Semi-supervised | 0.73119 | 0.75859 |\n", - "| ULMFit Semi-supervised Repeated One Time | **0.73372** | **0.75968** |\n", - "\n", - "For more information about the competition, see [1st Place Solution](https://www.kaggle.com/c/wisesight-sentiment/discussion/83564)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# #uncomment if you are running from google colab\n", - "# !pip install sklearn_crfsuite\n", - "# !pip install emoji\n", - "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "# !pip install fastai==1.0.45\n", - "# !wget https://github.com/PyThaiNLP/wisesight-sentiment/raw/master/wisesight-sentiment.zip; unzip wisesight-sentiment.zip\n", - "# !mkdir wisesight_data; ls" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "import emoji\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from pythainlp import word_tokenize\n", - "from tqdm import tqdm_notebook\n", - "\n", - "#viz\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Processor for Logistic Regression" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def replace_url(text):\n", - " URL_PATTERN = r\"\"\"(?i)\\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\\s()<>{}\\[\\]]+|\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\))+(?:\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’])|(?:(?\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   category   texts   processed   wc   uwc
0   pos   เค้าอยากกินเอมเคคค   เค้า|อยาก|กิน|เอม|เค|xxrep   6   6
1   neu   ถ้าเป็นมาสด้า....ต้องมาสด้าเชียงใหม่95   ถ้า|เป็น|มาส|ด้า.|xxrep|ต้อง|มาส|ด้า|เชียงใหม่|95   10   9
2   neu   มาเที่ยวเวียดนาม ในทริปมีพี่นุชด้วยอะ ที่รู้เพ...   มา|เที่ยว|เวียดนาม|ใน|ทริป|มี|พี่|นุช|ด้วย|อะ|...   49   42
3   pos   รีวิวรองพื้น Marc Jacobซะหน่อย เมื่อวานไปสอยตั...   รี|วิว|รองพื้น|marc|jacob|ซะ|หน่อย|เมื่อวาน|ไป...   136   104
4   neu   D. รถ Hilux รุ่น Rocco #HiluxRevoThailand   d|.|รถ|hilux|รุ่น|rocco|#|hiluxrevothailand   8   8
\n", - "" - ], - "text/plain": [ - " category texts \\\n", - "0 pos เค้าอยากกินเอมเคคค \n", - "1 neu ถ้าเป็นมาสด้า....ต้องมาสด้าเชียงใหม่95 \n", - "2 neu มาเที่ยวเวียดนาม ในทริปมีพี่นุชด้วยอะ ที่รู้เพ... \n", - "3 pos รีวิวรองพื้น Marc Jacobซะหน่อย เมื่อวานไปสอยตั... \n", - "4 neu D. รถ Hilux รุ่น Rocco #HiluxRevoThailand \n", - "\n", - " processed wc uwc \n", - "0 เค้า|อยาก|กิน|เอม|เค|xxrep 6 6 \n", - "1 ถ้า|เป็น|มาส|ด้า.|xxrep|ต้อง|มาส|ด้า|เชียงใหม่|95 10 9 \n", - "2 มา|เที่ยว|เวียดนาม|ใน|ทริป|มี|พี่|นุช|ด้วย|อะ|... 49 42 \n", - "3 รี|วิว|รองพื้น|marc|jacob|ซะ|หน่อย|เมื่อวาน|ไป... 136 104 \n", - "4 d|.|รถ|hilux|รุ่น|rocco|#|hiluxrevothailand 8 8 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "valid_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "neu 0.585560\n", - "neg 0.233992\n", - "pos 0.160827\n", - "q 0.019620\n", - "Name: category, dtype: float64\n" - ] - } - ], - "source": [ - "#prevalence\n", - "print(train_df[\"category\"].value_counts() / train_df.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "neu 0.582147\n", - "neg 0.238499\n", - "pos 0.161281\n", - "q 0.018072\n", - "Name: category, dtype: float64\n" - ] - } - ], - "source": [ - "#prevalence\n", - "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Logistic Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Features" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "#dependent variables\n", - "y_train = train_df[\"category\"]\n", - "y_valid = valid_df[\"category\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((20693, 4841), (3652, 4841))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#text faetures\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", - "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", - "text_train = tfidf_fit.transform(train_df[\"texts\"])\n", - "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", - "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", - "text_train.shape, text_valid.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[22.2505648 18.41499281] [1108.92025281 497.704347 ]\n" - ] - }, - { - "data": { - "text/plain": [ - "((20693, 2), (3652, 2))" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#word count and unique word counts; actually might not be so useful\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "scaler = StandardScaler()\n", - "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", - "print(scaler_fit.mean_, scaler_fit.var_)\n", - "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_valid = 
scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", - "num_train.shape, num_valid.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((20693, 4843), (3652, 4843))" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#concatenate text and word count features\n", - "X_train = np.concatenate([num_train,text_train.toarray()],axis=1)\n", - "X_valid = np.concatenate([num_valid,text_valid.toarray()],axis=1)\n", - "X_test = np.concatenate([num_test,text_test.toarray()],axis=1)\n", - "X_train.shape, X_valid.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit Model" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7305585980284776" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#fit logistic regression models\n", - "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", - "model.fit(X_train,y_train)\n", - "model.score(X_valid,y_valid)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### See Results" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "probs = model.predict_proba(X_valid)\n", - "probs_df = pd.DataFrame(probs)\n", - "probs_df.columns = model.classes_\n", - "probs_df[\"preds\"] = model.predict(X_valid)\n", - "probs_df[\"category\"] = valid_df.category\n", - "probs_df[\"texts\"] = valid_df.texts\n", - "probs_df[\"processed\"] = valid_df.processed\n", - "probs_df[\"wc\"] = valid_df.wc\n", - "probs_df[\"uwc\"] = valid_df.uwc\n", - "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", - "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.7305585980284776\n" - ] - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXUAAAEKCAYAAADticXcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XeYFFXWx/HvmUCSLGAAFRBd1oQKIkFAAUkqhkVXzKjLa1pz1l0VdY2YE6CggIoYAQOIiiAoSVAEFEQxkCSLgoSZOe8fXQxD7mGmupri9/GpZ7tu3657unY4c+fWrVvm7oiISDxkRB2AiIgUHyV1EZEYUVIXEYkRJXURkRhRUhcRiREldRGRGFFSFxGJESV1EZEYUVIXEYmRrKgD2JqbanbWra6BEWvnRR1C2pi8eFbUIaQN/QPZIGftXCvqMdYt/jHpU5pdpXaR2wuLeuoiIjGStj11EZGUysuNOoJioaQuIgKQmxN1BMVCSV1EBHDPizqEYqGkLiICkKekLiISH+qpi4jEiC6UiojEiHrqIiLx4Zr9IiISI7pQKiISIxp+ERGJEV0oFRGJEfXURURiRBdKRURiRBdKRUTiw11j6iIi8aExdRGRGNHwi4hIjKinLiISI7nroo6gWCipi4iAhl9ERGJFwy/xcNPoJ1jz5194Xh55OXk82fE29jpoP0679yKySmaTl5PH2//pzZyvf+Cg4+vT5tozcE/UHdKtLz9NnBH1VygW1fauyp2P30blqpXAnbf7D+G1F97k3ufuYL/99wGgbPmy/LniT845/mIaNm/A5bd2JTs7m3Xr1vHk3c8ycczkiL9F8evVszsdOrRm4aLFHHFEKwD+859ruejCs1i8eCkAt//nfoYO/STKMFOuV8/unBCcl8OD87LTU089Pnp2vodVy/7I3+9w81l89PibzPj0a/527OF0uOUsep55N7PGTGX68C8B2LPuvpz99JV0b3V9VGEXq9ycXB7v9jQzvvmeMruVpu/QXowfNZHbLrkrv85V/72MP/9YCcDypb9z3fm3sPi3JdT+Wy2eeOUhTqzfKarwQ/NS34E880wfevd5fKPyx5/oxaOP9ogoquj1Dc5Ln03Oy04tJkk9I+oA0pNTsmxpAEqVL8OK35YBsHbVmvwaJcqUBI8kuFAsWbiUGd98D8CqlX8xe9bPVN2r6kZ1Wnc8jg/f+QiAmVO/Z/FvSwD4ccZsSpYqSXaJ7NQGnQKjR49j6bLlUYeRdj6L4Xnx3HVJb+lMPXV3Lu53C+7OuFc+ZvyrnzDkrr5c1PcWTrj1HCzDeOYfd+RXP7htA9rdeCZld69AnwsfjDDw8OxVY0/+dsgBTJs0Pb/siKMPY+mipfw6e+5m9Vue0IIZU2eybm16/7AXp8su7cK553Tiyy+ncMON3Vi+/PeoQ5KiismYeqg9dTP7w8xWbLL9amZvm1ntMNtO1rOd7uSJE2+l9wUP0Pi8NtRqWJdG5xzPkLv7cV+TK3j37n50eqBrfv1pwybSvdX19O3anTbXnh5h5OEoXaY09z/fjUf++yQr/1yVX97mlNYMe+fjzerXPrAmV9z2f9x3Y/dUhhmpHj368re6TajfoA3zFyzkoQf/G3VIUhzy8pLf0ljYwy+PATcA1YEawPXAK8AAoPemlc2sq5lNNLOJX/0xK+TQEtYPraxcsoJpwyawT739qf+P5kwdOh6AKe+NZZ96+2/2udnjv6PyvtUoU6lcSuJMhcysTB54vhvD3vqITz/4bEN5ZibHdmjGR4NHbFS/2l5VefCFe7jzqv8x9+d5qQ43MgsXLiYvLw9354UXXqbBUYdHHZIUB89LfktjYSf1ju7ew93/cPcV7t4TaOvurwGVNq3s7j3dvYG7Nzi8XJ2QQ4Ps0iUpsVup/NcHNjuMBTPnsGLhMmo3+jsA+zc5mMU/LQBg9/32yP/s3gfXJKtE9kYXWHd2/+l+E7O//5lXeg7cqPyoZvX5edYvLJy/KL+sbPmyPNr3fp76Xw+mTJia6lAjteee1fJfn3Jye6ZNi8cMqF1eTHrqYY+przKzM4A3gv1OwOrgdeSXGctVqcC5Pa8FEr3RyYPGMHPk17y5cjUn3XEeGVmZ5KxZx1u3PA/AIe0bUv+05uTm5LBu9VpeueKJKMMvVvUaHkqH09vy/fQf6D888X2fua8Xn38yjjYnt+TDTYZezuhyKjVqVefia8/n4mvPB+DfZ17PsiXxunjWr9/TtGjemCpVKjP7x4l06/YwLVo0oV69g3B3fvp5DpdddlPUYaZc/wLn5acfJ3JXt4fp8+KAqMMqmjTvgSfL3MPLrcG4+eNAYxJJfCxwDTAXqO/uo7f22Ztqdo486aeLEWt3naGN7Zm8ODXDcjsD/QPZIGftXCvqMf5677GkT2npE64ucnthCbWn7u4/Aidt5e2tJnQRkZSLSU897NkvB5rZx2Y2Ndg/zMxuD7NNEZEdEpMx9bAvlPYCbgHWAbj7FODMkNsUESm8Ypz9Yma9zWzh+g5tgfJ/m9l3ZjbNzB4sUH6Lmc0ysxlm1rZAebugbJaZ3ZzM1wj7QmkZdx9vttHwUzye7ioi8VK8PfAXgaeAvusLzOw44GSgnruvMbNqQflBJDq7BwN7Ax+Z2YHBx54GjgfmABPMbLC7b7grcAvCTuqLzWx/gms6ZtYJmB9ymyIihVeMY+ruPsrMam5SfClwv7uvCeosDMpPBgYE5bPNbBbQMHhvVnBtEjMbENSNNKlfDvQE6prZXGA2cHbIbYqIFF5O6IMIBwLNzOxeElO7r3f3CSRuzhxboN6coAzg103Kj95eI2En9blAH2AEUBlYAZwPdAu5XRGRwinE9G4z6wp0LVDUM7i5cluySOTBRsBRwMAwlksJO6kPApYDkwBNthaR9FWIMfUggW8viW9qDvCWJ24OGm9meUAVEp3ffQrUqxGUsY3yrQo7qddw93YhtyEiUnThT1V8BzgOGBFcCC0BLAYGA6+Y2SMkLpQeAIwHDDjAzGqRSOZnAmdtr5Gwk/rnZnaou38TcjsiIkVTjBdKzexV4FigipnNAe4gsYhh72Ca41rg/KDXPs3MBpK4AJoDXO7uucFxrgCGAZlAb3eftr22w07qxwAXmNlsYA2J3zzu7oeF3K6ISOHk5hbbody981beOmcr9e8F7t1C+fvA+4VpO+yk3j7k44uIFI80v1M0WWGv/fJzmMcXESk2SuoiIjESkwW9lNRFRADPi8dixkrqIiKg4RcRkVgpxtkvUVJSFxEB9dRFRGJFSV1EJEZCfF5zKimpi4iAeuoiIrGiKY3henmF1gBbb/bMwVGHkDYq7tsy6hDSxpqcdVGHEC+a/SIiEh+u4RcRkRjR8IuISIxo7RcRkRhRT11EJEZydKFURCQ+NPwiIhIjGn4REYkPTWkUEYkT9dRFRGJESV1EJEa0TICISHzoGaUiInGipC4iEiOa/SIiEiPqqYuIxIiSuohIfHiuhl9EROJDPXURkfjQlEYRkThRUhcRiZF4DKkrqYuIAHhOPLL6Lp
3UH37yblq3ac7ixUtp3fRUAK6/9Qratm9JXl4eixcv5drLb+O3BYu45N9dOLXTCQBkZmVywIG1qXdAM5YvXxHlVyiS2//3CKPGjKdypYq80/85AL77/kfufuhJVv21mr33qsYDd9xI2d1245vpM7jzgScAcJzLLjyb1i2a5h8rNzeXf150JdWqVuGZh+6K5PuEZfq3o/nzjz/JzcsjJyeHZsd05NRTO3DrbVdTt24dmjc/mcmTvok6zEhkZGQwbuwHzJu7gJNPPT/qcIomHjmdjKgDiNLrr7zDOadfslHZc0/24fhmp9G2RSc+HjaSq2+4NL+8bYtOtG3Rifu7PcbYMRN36oQOcEqH43nukXs2Krvj/se4+tIuvN3vWVo1b0Kfl98EoE7t/XjthSd486Wn6dH9Hro9+CQ5BR7/1f/1QdSuuW9K40+l9u0707hRB5od0xGA6dNncFbnSxg9enzEkUXryn9fzHfffR91GMXC8zzpLZ3t0kl93BdfsnzZ7xuV/fnHyvzXpcuUxn3z/wNP+UcHBr31fujxha3B4YdSoXy5jcp+/nUuDQ4/FIDGRx3J8JGjAShdqhRZWZkArFm7FszyP7Ng4SJGfT6ef5zUNkWRR2/GjB/4/vsfow4jUtWr70WH9q3o3fvVqEMpHnmF2LbDzHqb2UIzm1qg7CEz+87MppjZ22ZWscB7t5jZLDObYWZtC5S3C8pmmdnNyXyNXTqpb82Nt13J+G8+4tTTT+Dh+57a6L1SpUtxbKtjeH/w8IiiC9f+tfbjk8++AODDEZ+x4LfF+e9NmfYdJ5/9f5x63qX894Yr8pP8A4/34NrLLsIsnj9O7s7gIf0YPWYIXS7sHHU4aeOR7ndx8y33kBeTNVOKuaf+ItBuk7LhwCHufhgwE7gFwMwOAs4EDg4+84yZZZpZJvA00B44COgc1N2mUP8VmtkfZrYi2FabWa6Zpf2YxYP3PkHDQ1vz9uvv0eVfZ2303vHtjmXCuMk7/dDL1tx96zUMeOtdzrjw36xc9RfZ2Rsuuxx2cF0GvdyDAc8/zvP9BrJmzVo+HTOOypUqcnDdAyKMOlytW3eiaZMTOfWUC/i/rufRtGnDqEOK3AkdWrNw4WImTY7RtYRi7Km7+yhg6SZlH7p7TrA7FqgRvD4ZGODua9x9NjALaBhss9z9R3dfCwwI6m5TqBdK3T3/b3szsyCgRlurb2Zdga4AFcvsxW4lK4cZ3na9/fq79B34LN3vfzq/7ORT2zPozZ1/6GVrau+3D70e+x8AP/0yh1Gfbz5mvH/NfSlTujTf//gTk6dM59PRY/nsiwmsWbuOlStXcdNdD/LAHTemOvTQzJ/3GwCLFi1h8JBhNGhQjzFjdu2x9CZNGnDSiW1o364lpUqVpHz5crz04hOcf8GVUYe2w/LTbWpcCLwWvK5OIsmvNycoA/h1k/Kjt3fglP297AnvAFsdeHX3nu7ewN0bRJXQa9XecLGvbYeW/PD97Pz9cuXK0qhpA4Z9MCKK0FJiybLlAOTl5dHjpQGccUoHAObMW5B/YXTegt+Y/fOvVN9rD665tAsfv9OfD998iYfuupmG9evFKqGXKVOasmV3y3/dqlUzpk+fGXFU0bvt9vupWbsBdQ5sxNnnXMaIEWN26oQO4HnJb2bW1cwmFti6JtuOmd0G5AAvh/E9Qu2pm9lpBXYzgAbA6jDbLIynej1I46ZHUXn3ikyY+hHd73+Glsc3o3admnieM+fXedxyXbf8+u1ObMXIEZ/z16q/Ioy6+Nxwx/1MmDyF5ctX0OqUc7jsonNZ9ddfDHjrXQBat2jCqSe0AWDSlGm80G8gWVlZZGQYt19/OZUqVogy/JSoVq0KAwb0BBJTWQcOHMTw4SM5qWNbune/kypVKvPWm72ZMuVbTj75vIijlSIpxKUBd+8J9CxsE2Z2AXAi0Mo3zMKYC+xToFqNoIxtlG+9jS3N7iguZtanwG4O8BPQy90Xbu+zNSofkt7zhlJo9szBUYeQNiru2zLqENLGmpx1UYeQNnLWzrXt19q2Rce3SDrnVB0+crvtmVlN4F13PyTYbwc8ArRw90UF6h0MvEJiDH1v4GPgAMBIXFBtRSKZTwDOcvdp22o37DH1LmEeX0SkuHgxTuIxs1eBY4EqZjYHuIPEbJeSwPDEJUbGuvsl7j7NzAYC00l0fi9399zgOFcAw4BMoPf2EjqEP/xyIPAssIe7H2JmhwEd3f2e7XxURCSlPLfInf0Nx3Lf0tzXF7ZR/17g3i2Uvw8UamZG2BdKe5H47bQOwN2nkJiPKSKSVgpzoTSdhb32Sxl3H2+20W/A1E4cEhFJgucVX089SmEn9cVmtj/gAGbWCZgfcpsiIoWW7j3wZIWd1C8nMe2nrpnNBWYDZ4fcpohIobmrp56MuUAfYARQGVgBnA9029aHRERSTT315AwClgOTgHkhtyUissPyinH2S5TCTuo13H3TlcpERNJOXC6Uhj2l8XMzOzTkNkREiszzLOktnW21p25mQwhmrWyJu3dM4vjHABeY2WxgDYnbXj1YT1hEJG2EuGJKSm1r+OXhYjh++2I4hohI6NK9B56srSZ1dx9Z1IO7+89FPYaISCrsMlMazewA4D4Sj1Mqtb7c3WuHGJeISErlxmT2SzIXSvuQWJQrBzgO6Av0DzMoEZFUc7ekt3SWTFIv7e4fk1h7/Wd3vxM4IdywRERSK/azXwpYY4nHxH8frO07FygbblgiIqkVl9kvyfTUrwLKAFcC9YFzSdzqLyISG7tMT93dJwQv/wT0JCMRiaXcvLDvxUyNZGa/jGALNyG5ux4WKSKxEZfhl2TG1K8v8LoU8A/0oAsRiZm8NJ/Vkqxkhl++3KRojJmNDykeEZFIpPtUxWQlM/xSucBuBomLpRVCi0hEJAK70vDLlyTG1I3EsMts4KIwgwJYtvrPsJvYaXQ68sqoQ0gbu5cqF3UIaWPen0ujDiFWdpnhF+Dv7r66YIGZlQwpHhGRSMRl9ksy3+LzLZR9UdyBiIhEyQuxpbNtrae+J1AdKG1mR5AYfgEoT+JmJBGR2NgVhl/aAhcANYDubEjqK4Bbww1LRCS1Yj/7xd1fAl4ys3+4+5spjElEJOXyog6gmCQzpl7fzCqu3zGzSmZ2T4gxiYiknGNJb+ksmaTe3t2Xr99x92VAh/BCEhFJvRy3pLd0lsyUxkwzK+nuawDMrDSgKY0iEivp3gNPVjJJ/WXgYzPrQ+Ji6QXAS2EGJSKSanEZU09m7ZcHzOxroDWJKZrDgP3CDkxEJJV2pZ46wG8kEvrpJJYJ0GwYEYmV2PfUzexAoHOwLQZeI/Gc0uNSFJuISMrk7gI99e+Az4AT3X0WgJldk5KoRERSLM2fUpe0bU1pPA2YD4wws15m1gpi8qtMRGQTeVjSWzrbalJ393fc/UygLjACuBqoZmbPmlmbVAUoIpIKxbmgl5ldY2bTzGyqmb1qZqXMrJaZjTOzWWb2mpmVCOqWDPZnBe/XLMr32O7NR+6+0t1fcfeTSKwDMxm4qSiNioikm7xCbNtiZtWBK
4EG7n4IkAmcCTwAPOrudYBlbHguxUXAsqD80aDeDivUAsLuvszde7p7q6I0KiKSbvLMkt6SkEVihdssEqvazgdaAm8E778EnBK8PpkN9/68AbQyS66RLYnHqvAiIkWUW4htW9x9LvAw8AuJZP47iSfILXf3nKDaHBJLmxP876/BZ3OC+rvv6PdQUhcRITH7JdnNzLqa2cQCW9f1xzGzSiR637WAvYHdgHap+h7J3nwkIhJrhZnV4u49gZ5bebs1MNvdFwGY2VtAU6CimWUFvfEawNyg/lxgH2BOMFxTAViyQ18C9dRFRIBinf3yC9DIzMoEY+OtgOkkZhF2CuqcDwwKXg8O9gne/8Tdd/ipeeqpi4hQfDcfufs4M3sDmATkkJgx2BN4DxgQPI9iMvBC8JEXgH5mNgtYSmKmzA5TT72A6d+OZvz4oXwx9n0+Gz0YgEqVKjBkSD++njKCIUP6UbFi+YijDEd2yWweGvwIjw19kic/eprO154FwKFNDuOR9x7jieFPc9Uj15CRufGPTJ3DDuCtHwfRpEPTKMIOxUNPdmPSjE8ZPuatzd771+Xn8cvSb6hUOfHcmEZNGzD1p8/5YOTrfDDyda664ZJUhxuZtm2OZdrUUXw3fTQ33nB51OEUWXFNaQRw9zvcva67H+Lu57r7Gnf/0d0bunsddz99/XLm7r462K8TvP9jUb6Hkvom2rfvTONGHWh2TEcArrvuUj799HPqHXYcn376Odddd1nEEYZj3Zp1/OfMW7m63b+5ut2VHNmiPnXr1+XqR67h4Sse5MrjL2fRnIW07LRhNmtGRgbn33IBk0dNjjDy4vf6K4M47/RLNyvfq/oeND+uCXN+nbdR+YQvJtG+xem0b3E6jz/0XKrCjFRGRgZPPH4vJ550DofWO45//vMU/v73A6IOq0hyLfktnSmpb8cJJx7Pyy8nppa+/PIbnHjS8RFHFJ7Vq1YDkJmVRWZWJnm5eaxbl8O82Ykk9tXor2jcfkOP/IQuJ/LFB5/z+5LlWzzezmr8F1+yfNnvm5Xfce+N/O+ORyjCcGdsNDzqCH744Sdmz/6FdevWMXDgIDqe1DbqsIqkOHvqUQo1qZvZg2ZW3syyzexjM1tkZueE2WZRuDuDh/Rj9JghdLmwMwDVqlVlwYJFACxYsIhq1apGGWKoMjIyePSDJ+g7uT9fjf6KmV/NJDMzkzqH1QGgSYemVNm7CgCV99idRm0b80G/96MMOWWOb38cC+Yv5NtpMzd778ij6jF01Bu8NPBZDqy7fwTRpd7e1ffk1zkb/mKZM3c+e++9Z4QRFV1cknrYF0rbuPuNZnYq8BOJRcJGAf23VDmY69kVoER2ZbKyyoUc3sZat+7E/Hm/UbXq7gwZ0p+ZM37YrE6ce2l5eXlc0/5Kdiu/G7f0vI19D9yPh694kAv/+y+yS2Tz1ahJ5OUmfqQvvvNfvHTfi7E+H+uVKl2KK669mHNO+7/N3ps65Vsa12vDqpV/cVzrZvTq9zgtjjoxgiilqNL80aNJCzuprz/+CcDr7v77tu5+LTj3c7cyNVOeLebP+w2ARYuWMHjIMBo0qMfChYvYc89Eb33PPauyaNHiVIeVcitXrOSbL6Zw5LFH8k7Pt7m1U2Kpn8ObHcHetRM3wdU5tA7XP3UjAOUrl6f+cQ3Izcll3IdjI4s7LPvV3Id99q3O0M8Sw3B77b0H7386kI6tO7No4YbpxCM++ox7Hr6NSpUrsmxpvIakNjVv7gL2qbF3/n6N6nsxb96CCCMqunTvgScr7DH1d83sO6A+ieecVgVWh9zmDilTpjRly+6W/7pVq2ZMnz6T99/7iLPPTkwtPfvsTrz37vAowwxN+crl2a184vuXKFmCes2OYM4Pc6iwewUAskpkcdplnRja/wMAuh5zMV2bXkTXphfx+ftj6HH7s7FM6AAzvv2eI/92LE0Pb0fTw9sxf95vdDj2DBYtXELVahvu5q535CFkZGTEPqEDTJj4FXXq1KJmzX3Izs7mjDNOZsi7H0YdVpEU1zIBUQu1p+7uN5vZg8Dv7p5rZitJ3D6bdqpVq8KAAYkbxDKzMhk4cBDDh4/kyy+/pl+/pznv/DP49Ze5nHvuzj91a0sqVavM1cGURcvIYMy7nzHx4wlccGsXGrRqSEaG8UH/9/nm8ylRhxq6J3s9QOOmR1Fp94qMm/oRj9z/NK/1f3uLdTt0bMO5F55BTk4uq1ev5oqLb0hxtNHIzc3lqqtv5/33XiEzI4MXX3qN6dM3v96wM4nLQzIszDFRM8sGLgWaB0Ujgefcfd32PhvF8Eu6al3lkKhDSBuT//w56hDSxrw/l0YdQtrIWTu3yCn50X3PSTrnXPNL/7T9FRD2mPqzQDbwTLB/blB2ccjtiogUSlzG1MNO6ke5e70C+5+Y2dchtykiUmhxGRoI+0JprpnlT9w1s9qk/3UGEdkFFWbp3XQWdk/9BhIPrl6/lkFNoEvIbYqIFFpcepth99THAD1IDFctDV5/EXKbIiKFlocnvaWzsHvqfYEVwN3B/llAP+D0kNsVESkUXShNziHuflCB/RFmNj3kNkVECi29+9/JC3v4ZZKZNVq/Y2ZHAxNDblNEpNC0oFdy6gOfm9kvwf6+wAwz+wZwdz8s5PZFRJKSY/Hoq4ed1FP2BG0RkaKIR0oPf+0X3dMtIjuFdB9WSZYePC0iAmk/VTFZSuoiImj4RUQkVjT8IiISI7kx6asrqYuIoJ66iEisuHrqIiLxoZ66iEiMaEqjiEiMxCOlK6mLiACQE5O0rqQuIoIulIZuTc66qENIGx8vnhZ1CGljdc7aqEOQmNKFUhGRGFFPXUQkRtRTFxGJkVxXT11EJDY0T11EJEbiMqYe9oOnRUR2CsX94GkzyzSzyWb2brBfy8zGmdksM3vNzEoE5SWD/VnB+zWL8j2U1EVESAy/JLsl6Srg2wL7DwCPunsdYBlwUVB+EbAsKH80qLfDlNRFREgMvyT73/aYWQ3gBOD5YN+AlsAbQZWXgFOC1ycH+wTvtwrq7xCNqYuIUOyzXx4DbgTKBfu7A8vdPSfYnwNUD15XB34FcPccM/s9qL94RxpWT11EhMINv5hZVzObWGDruv44ZnYisNDdv4zie6inLiJC4W4+cveeQM+tvN0U6GhmHYBSQHngcaCimWUFvfUawNyg/lxgH2COmWUBFYAlO/AVAPXURUSA4htTd/db3L2Gu9cEzgQ+cfezgRFAp6Da+cCg4PXgYJ/g/U/cd3wsSEldRIRQZr9s6ibgWjObRWLM/IWg/AVg96D8WuDmonwPDb+IiABF6Bxv65ifAp8Gr38EGm6hzmrg9OJqU0ldRATIjckdpUrqIiJo7RcRkVgJY/glCkrqIiKopy4iEitxWaVRSV1EBD0kQ0QkVjT8IiISI3FJ6rqjdAtKlizJF2Pe5cuJw/n6q0+447/XRR1Syk379jPGjf+Az8e+x6jRg/LLL7nkfCZN/ogJE4dx9z1FuvFtp9SrZ3fmzfmaryZ/HHUokatQoTyvDejJ1G9G8s2UT2l0dP2oQyoS
d096S2fqqW/BmjVraN3mDFauXEVWVhajPn2boUNHMG78pKhDS6kO7c9iyZJl+fvNmzfihBNb0+joDqxdu5aqVXePMLpo9O07kGee6UOfPo9HHUrkHn2kG8OGjeCfZ3YlOzubMmVKRx1SkcSlpx5qUjeza7f1vrs/Emb7RbFy5SoAsrOzyMrOTvvfzqlw8b/OoXv351i7di0Aixbt8EJyO63PRo9jv/1qRB1G5MqXL0ezY47mwouuBmDdunX8/vu6iKMqmrjMfgl7+KUBcCmJReCrA5cAR5JYOL7cNj4XuYyMDCZO+JD5c6fw8cejGD9hctQhpZS7M2hIXz4bM5guF3YGoM4BtWja9ChGjHybocMGcGT9wyKOUqJSq9a+LF68hBeef5QJ44fR47mHdvqeeq7nJb2ls7CTeg3gSHe/zt2vA+oD+7r7Xe5+V8htF0leXh4Njmo9j1C0AAAILklEQVTDfrUacFSDIzj44L9FHVJKHd/6dI5pchKnndKFrl3PpWnThmRlZlKpUkWOa3Eqt912H337PRV1mBKRrMxMjjjiUHr06MtRDduycuUqbrrxiqjDKpK4jKmHndT3ANYW2F8blG1RwaeJ5OWtDDm05Pz++wo+HTmGtm2OjTqUlJo/7zcgMcQyZMgw6jeox9x5Cxg8aCgAX078mry8PKpUqRxlmBKROXPnM2fO/Py/YN966z2OOPzQiKMqmhQsvZsSYSf1vsB4M7vTzO4ExgEvbq2yu/d09wbu3iAjY7eQQ9u6KlUqU6FCeQBKlSpF61bNmTHjh8jiSbUyZUpTtuxu+a9btmrG9OkzeHfIhzRv0RiAOnVqUaJENosXL40yVInIb78tYs6ceRx44P4AtGx5DN9+OzPiqIqmOB88HaVQL5S6+71m9gHQLCjq4u5pPzi911570PuFx8jMzCAjI4M33hjCe+9/FHVYKVOtWhVeHdADgKysTAYOHMxHw0eRnZ3Ns889yPgJQ1m7bh3/96/rI4409fr3e5oWzRtTpUplfvpxInd1e5g+Lw6IOqxIXHXNf+j70pOUKJHN7Nm/cNHF25wXkfby0nxYJVmWruNDWSWqp2dgESiVVSLqENLG6py1268ku5yctXOtqMc4eI+jk845034bV+T2wqJ56iIikPazWpKlpC4iQnyGX5TURUSIz81HSuoiIqinLiISK+qpi4jESK7nRh1CsVBSFxFBD54WEYmVdL/9P1lK6iIiqKcuIhIrmv0iIhIjmv0iIhIjWiZARCRGNKYuIhIjGlMXEYkR9dRFRGJE89RFRGJEPXURkRjR7BcRkRiJy4XSjKgDEBFJB+6e9LY9ZtbOzGaY2SwzuzkF4edTUhcRIXFHabL/bYuZZQJPA+2Bg4DOZnZQCr4CoKQuIgIUa0+9ITDL3X9097XAAODk0L9AQGPqIiIU65h6deDXAvtzgKOL6+Dbk7ZJPWftXIs6BgAz6+ruPaOOIx3oXGygc7FBXM5FYXKOmXUFuhYo6pku50DDL9vXdftVdhk6FxvoXGywy50Ld+/p7g0KbAUT+lxgnwL7NYKylFBSFxEpXhOAA8yslpmVAM4EBqeq8bQdfhER2Rm5e46ZXQEMAzKB3u4+LVXtK6lvX1qMk6UJnYsNdC420LnYhLu/D7wfRdsWl/UOREREY+oiIrGipC4iEiNK6iIiMbJLJ3Uzq2lm35pZLzObZmYfmllpM9vfzIaa2Zdm9pmZ1Q3q729mY83sGzO7x8z+jPo7FJcdOBcvmlmnAp+PzbmA/PPxnZm9HJyXN8ysjJm1MrPJwc9AbzMrGdS/38ymm9kUM3s46vhTwcxuM7OZZjbazF41s+ujjkl28aQeOAB42t0PBpYD/yBxNf/f7l4fuB54Jqj7OPC4ux9K4tbfuCnMudgV/A14xt3/DqwArgVeBP4Z/AxkAZea2e7AqcDB7n4YcE9E8aaMmdUnMf/6cKADcFS0Ecl6Suow292/Cl5/CdQEmgCvm9lXQA9gr+D9xsDrwetXUhlkihTmXOwKfnX3McHr/kArEudoZlD2EtAc+B1YDbxgZqcBq1Ieaeo1A95291XuvoIU3lwj26Z56rCmwOtcYA9gubsfHlE8USrMucgh6BSYWQZQIvzwUm7T+b7Lgd03q5S42aQhiaTfCbgCaBl+eCKbU099cyuA2WZ2OoAl1AveG0tiSAISf3rG3bbOxU9A/eB1RyA79eGFbl8zaxy8PguYCNQ0szpB2bnASDMrC1QIbji5Bqi3+aFiZxRwSnDdpRxwUtQBSYKS+padDVxkZl8D09iwFvLVwLVmNgWoQ+LP7rjb2rnoBbQIyhsDKyOKL0wzgMvN7FugEvAo0IXEcNQ3QB7wHFAOeDf4uRhNYuw91tx9EvAa8DXwAYn1TiQN6I7SQjCzMsBf7u5mdibQ2d1Ttvi9pI6Z1QTedfdDIg5lp2BmdwJ/uvsuMfMnnWlMvXDqA0+ZmZEYX70w4nhERDainrqISIxoTF1EJEaU1EVEYkRJXUQkRpTUpdiZWa6ZfWVmU83s9WDW0I4e61gzezd43dHMbt5G3YpmdtkOtHGn1i2RuFBSlzD85e6HB9MB1wKXFHwzuImp0D977j7Y3e/fRpWKQKGTukicKKlL2D4D6gSrHs4ws77AVGAfM2tjZl+Y2aSgR18WwMzaBSskTgJOW38gM7vAzJ4KXu9hZm+b2dfB1gS4H9g/+CvhoaDeDWY2IVg98a4Cx8pfYZDEwl0isaB56hIaM8sC2gNDg6IDgPPdfayZVQFuB1q7+0ozu4nE3boPkrhbtSUwi8Rdi1vyBDDS3U81s0ygLHAzcMj6tWrMrE3QZkPAgMFm1pzE3a/rVxjMAiaRWMBMZKenpC5hKB2s6giJnvoLwN7Az+4+NihvBBwEjEncy0UJ4AugLomVEL8HMLP+QNcttNESOA/A3XOB382s0iZ12gTb5GC/LIkkX45ghcGgDa0wKLGhpC5h+GvTlR2DxF1wfRgDhrt7503qFefqmAbc5+49Nmnj6mJsQyStaExdojIWaLp+xUMz283MDgS+I7ES4v5Bvc5b+fzHwKXBZzPNrALwB4le+HrDgAsLjNVXN7NqaIVBiTEldYmEuy8CLgBeDVY3/AKo6+6rSQy3vBdcKF24lUNcBRwXrJb4JXCQuy8hMZwz1cwecvcPSTzM5Iug3htAOa0wKHGmtV9ERGJEPXURkRhRUhcRiREldRGRGFFSFxGJESV1EZEYUVIXEYkRJXURkRhRUhcRiZH/B3GenKZMCOitAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "\n", - "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "print(model.score(X_valid,y_valid))\n", - "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", - " xticklabels=model.classes_, yticklabels=model.classes_)\n", - "plt.ylabel(\"Actual\")\n", - "plt.xlabel(\"Predicted\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## [ULMFit](https://github.com/cstorm125/thai2fit) Model" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from fastai.text import *\n", - "from fastai.callbacks import CSVLogger, SaveModelCallback\n", - "from pythainlp.ulmfit import *\n", - "\n", - "model_path = \"wisesight_data/\"\n", - "all_df = pd.read_csv(\"all_df.csv\")\n", - "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finetune Language Model" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", - "\n", - "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", - " .random_split_by_pct(valid_pct = 0.01, seed = 1412)\n", - " .label_for_lm()\n", - " .databunch(bs=48))\n", - "data_lm.sanity_check()\n", - "# data_lm.save('wisesight_lm.pkl')" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(24102, 243)" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_lm.sanity_check()\n", - "len(data_lm.train_ds), len(data_lm.valid_ds)" - ] - }, - { - "cell_type": "code", - "execution_count": 274, - "metadata": {}, - "outputs": [], - "source": [ - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", - " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", - "trn_args = dict(drop_mult=1., clip=0.12, alpha=2, beta=1)\n", - "\n", - "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", - "\n", - "#load pretrained models\n", - "learn.load_pretrained(**_THWIKI_LSTM)" - ] - }, - { - "cell_type": "code", - "execution_count": 275, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training frozen\n" - ] - }, - { - "data": { - "text/html": [ - "Total time: 00:44

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epoch   train_loss   valid_loss   accuracy
1       4.584492     4.061413     0.377183
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "#train frozen\n", - "print(\"training frozen\")\n", - "learn.freeze_to(-1)\n", - "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" - ] - }, - { - "cell_type": "code", - "execution_count": 276, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training unfrozen\n" - ] - }, - { - "data": { - "text/html": [ - "Total time: 04:50

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epoch   train_loss   valid_loss   accuracy
1       4.132607     3.788304     0.405754
2       3.942456     3.627463     0.424405
3       3.716612     3.527803     0.436210
4       3.534183     3.482676     0.439385
5       3.438706     3.465490     0.440675
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "#train unfrozen\n", - "print(\"training unfrozen\")\n", - "learn.unfreeze()\n", - "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# learn.save('wisesight_lm')\n", - "learn.save_encoder(\"wisesight_enc\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train Text Classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "28234\n" - ] - } - ], - "source": [ - "#lm data\n", - "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", - "data_lm.sanity_check()\n", - "\n", - "#classification data\n", - "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", - "\n", - "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", - " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", - " .label_from_df(\"category\")\n", - " .databunch(bs=50)\n", - " )\n", - "data_cls.sanity_check()\n", - "print(len(data_cls.vocab.itos))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "#model\n", - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,\n", - " output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)\n", - "trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)\n", - "\n", - "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", - "#load pretrained finetuned model\n", - "learn.load_encoder(\"wisesight_enc\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# #train unfrozen\n", - "# learn.freeze_to(-1)\n", - "# learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))\n", - "# learn.freeze_to(-2)\n", - "# learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))\n", - "# learn.freeze_to(-3)\n", - "# learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))\n", - "# learn.unfreeze()\n", - "# learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7),\n", - "# callbacks=[SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Training takes about 20 minutes so we use the script `train_model.py` to do it with the following results (validation run):\n", - "\n", - "```\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.812156 0.753478 0.687532\n", - "Total time: 00:56\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.740403 0.699093 0.714394\n", - "Total time: 00:57\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.727394 0.668807 0.723011\n", - "Total time: 01:34\n", - "epoch train_loss valid_loss accuracy\n", - "1 0.722163 0.675351 0.723517\n", - "2 0.675266 0.654477 0.738723\n", - "3 0.669178 0.641070 0.737962\n", - "4 0.612528 0.637456 0.744551\n", - "5 0.618259 0.635149 0.749366\n", - "6 0.572621 0.651169 0.749873\n", - "7 0.561985 0.661739 
0.747593\n", - "8 0.534753 0.673563 0.738469\n", - "9 0.530844 0.688871 0.746072\n", - "10 0.522788 0.670024 0.743031\n", - "Total time: 23:42\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### See Results" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "learn.load(\"bestmodel\")\n", - "\n", - "#get predictions\n", - "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", - "classes = learn.data.train_ds.classes\n", - "y_true = np.array([classes[i] for i in y_true.numpy()])\n", - "preds = np.array([classes[i] for i in probs.argmax(1).numpy()])\n", - "prob = probs.numpy()\n", - "loss = loss.numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.8392661555312158" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", - "probs_df = pd.DataFrame(to_df)\n", - "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", - "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", - "probs_df[\"texts\"] = valid_df.texts\n", - "(y_true==preds).mean()" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "colab": { + "name": "sentiment_analysis.ipynb", + "version": "0.3.2", + "provenance": [] + }, + "accelerator": "GPU" }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXUAAAEKCAYAAADticXcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xd8FVX6x/HPE0JJsNCkSJBuwYYKKqh0qQqo2AuI+0MRLLjW1bV3cS2rroQFVCyoLCqCgojSlN6LIAiCBKQpIAlIyvn9cYcQkHJDMncuk+/b17z2zpm5c547e+/DyZkzZ8w5h4iIhENC0AGIiEjhUVIXEQkRJXURkRBRUhcRCREldRGREFFSFxEJESV1EZEQUVIXEQkRJXURkRBJDDqA/Xmr6nW61dXTZ8uUoEOIG9t2bg86hLihH8huWTvTrKDHyNy4POpTWrxCrQLX5xe11EVEQiRuW+oiIjGVkx10BIVCSV1EBCA7K+gICoWSuogI4FxO0CEUCiV1ERGAHCV1EZHwUEtdRCREdKFURCRE1FIXEQkPp9EvIiIhogulIiIhou4XEZEQ0YVSEZEQUUtdRCREdKFURCREdKFURCQ8nFOfuohIeKhPXUQkRNT9IiISImqpi4iESHZm0BEUCiV1ERFQ94uISKio++Xwd1TtKjT7T+/c9SOOq8icvkM55qy6HF27CgAljkpm59YMhrd+MHe/0seWp/O455jz4jAW9vsi5nHHQs9eN3J9tyvAORYtXEKvW+7jhm5Xcsut3ahVuzq1qzfkt02/Bx2m7/qnvkj79q1Yv2EjZ5zREoBHH72Hjhe3JifHsX79Rm76Wx/Wrl0XcKSxVbJkScZ98z9KlCxJYmIxhg0byWOPvxh0WAUTkpa6OeeCjmGf3qp6XUwDswTjipn/ZsRFj5Cetim3vMHD15C5NYO5L3+aW9Ys9XZwjg2zfopJUu+zZYrvdeRVpUolvhwzhHMbtGXHjj8Z+M6rjBk9jgXzF7N58xZGfPkezZtcEkhS37Zze0zrO//8c0jfls7AQa/kJvUjjzyCP/7YBkDvXt056aTj6dX7/pjGBRD0L7d06WTS0zNITExkwrhP6HPXI0ydNiuQWLJ2pllBj7Fj4uCoT2mpC64vcH1+KdIt9byqnH8yW1eu3yOhA9S8+BxGXfF07vpxbc5i26oNZGX8GesQYyoxMZFSSaXIzMwiOakUv65dz/x5i4IOK+YmTZpK9eope5TtSugAyaWTideGkd/S0zMAKF48kcTixQ/78+BCcqE0IegA4kXNTo1Y8enkPcoqnXMC2zds4Y8VkT+tE5NLckqvi5jzr2FBhBgza9eu49+v/pf5P0xg8U+T2br1D779ZlLQYcWVxx+/j+U/Tefqqy/h0cdeCDqcQCQkJDBj+lesTZvH2LETmDZ9dtAhFYzLiX6JY74mdTP7w8y27rX8YmafmFktP+vOj4TixajW+kx+HjF1j/KanRux4rPdib7+3y9lUf9RoW+lH13mKNp3aEX9U5pzUp3GJCcnc8WVnYIOK648/PBz1KrdkA8++IRbb70x6HACkZOTQ4OGraleswENG5zBySefEHRIBZOTE/0Sx/xuqb8M3ANUBVKAu4H3gSHAwL13NrMeZjbDzGaMS1/qc2i7VW1+Opvm/8yOjVt3x1IsgertGrJi+O5Ef8wZdWjw4FV0mfIS9f7WhtNu68iJ3S6MWZyx0qz5eaz8eTWbNv5GVlYWnw8fzdnnnhl0WHHpgw+Gcckl7YMOI1Bbtmxl3PjvaNO6WdChFExIWup+96l3dM6dnmc91czmOOfuM7N/7L2zcy4VSIXYXiit1fmvXS/HXnAKW5atIWPtb7llX176RO7r+nddSmb6Dha/NSZWYcbM6l/W0ODs+iQllWL79h00bdaY2bPnBx1W3KhTpybLlq0AoOPFbViy5KeAI4q9ChXKkZmZxZYtWylVqhStWjbhhb5vBB1WwcR5Czxafif1DDO7AhjqrXcBdniv4+KqSmJSSao0OYXv79vzD4eanc7do+ulKJk5Yy7DPx3FuO8+Izsrm3lzF/H2wA/p0fMGbr+zB5UqVWDSlBGMGT2eO3r/5d/mUBk8+HWaNmlEhQrlWLF8Bo8/3pe27Vpw/PG1cTk5rFyVRq9esR/5ErQqVSoxcMDLFCuWQEJCAkOHfs7IL74OOqyCifMWeLR8HdLo9Zu/AjQiksSnAH2ANOAs59x+r77FekhjPIv1kMZ4FushjfFMP5DdCmNI4/aRL0d9SpM63Fk0hzQ655YDF+9ns4ZTiEj8CElL3e/RL8eb2VgzW+Ctn2ZmD/lZp4jIIdHol6j0Bx4AMgGcc/OAq3yuU0Qk/0Iy+sXvpJ7snJu2V1k4nu4qIuFSiC11MxtoZut39VLkKb/NzBab2UIzez5P+QNmtszMlphZmzzlbb2yZWYW1RV5v0e/bDSz2njXdMysC7DW5zpFRPKvcFvgbwGvAe/sKjCz5kAn4HTn3J9mVtErr0ekB+Nk4FjgazM73nvb68CFwGpgupkNd84dcL4Ov5N6LyLjzk80szRgBXCtz3WKiORfVuF1IjjnJphZjb2KewLPOuf+9PZZ75V3AoZ45SvMbBlwtrdtmTfgBDMb4u17wKTud/dLGjAIeIrIXaRjgK4+1ykikn/ORb8cmuOBC8xsqpmNN7OGXnlV4Jc8+632yvZXfkB+t9Q/AzYDs4A1PtclInLo8jGqxcx6AD3yFKV6d8QfSCJQDjgXaAh85MccWH4n9RTnXFuf6xARKbh8JPW8U5rkw2pgmIvc8TnNzHKACkR6NKrl2S/FK+MA5fvld/fL92Z2qs91iIgUnP9DGj8FmkPkHh6gBLARGA5cZWYlzawmUBeYBkwH6ppZTTMrQeRi6vCDVeJ3S/18oJuZrQD+BAxwzrnTfK5XRCR/srML7VBm9gHQDKhgZquBR4jMTDvQG+a4E+jqtdoXmtlHRC6AZgG9nHPZ3nF6A6OBYsBA59zCg9Xtd1Jv5/PxRUQKRyHeKeqcu3o/m67bz/5PERlQsnf5F0C+npnp99wvK/08vohIoYnz2/+jpWeUiohA3N/+Hy0ldRERwOWEYzJjJXUREVD3i4hIqBTi6JcgKamLiIBa6iIioaKkLiISIj4+rzmWlNRFREAtdRGRUNGQRn/dsWVy0CHEjY0/jwk6hLhRpZYm/dxly470oEMIF41+EREJD6fuFxGREFH3i4hIiGjuFxGREFFLXUQkRLJ0oVREJDzU/SIiEiLqfhERCQ8NaRQRCRO11EVEQkRJXUQkRDRNgIhIeOgZpSIiYaKkLiISIhr9IiISImqpi4iEiJK6iEh4uGx1v4iIhIda6iIi4aEhjSIiYaKkLiISIuHoUldSFxEBcFnhyOoJQQcQL+rUrcnE7z/PXX5ZM4eet3bL3d77tpvYsu0nypUvG1yQheyhp/9Fkw5X0fm6W3LLFi9dzrU9+nDJ9T3pde8jbEtP3+M9a39dT8NWlzDo/aGR9XUbuLH3fXS8tgedrr2ZwR99GtPPEAs9et
7AxCkjmDR1JDff2jW3/G83X8/kGaOYNHUkjzx+T4ARxkb/1BdJWz2X2bPH/mXbnXfeTObONMofzr+PnHwscUwtdc+ypSu4oPHFACQkJLB46feM+PwrAKpWrUKLluezalVakCEWus7tL+Sayzryjyf65pY98uzL3N37bzQ84zSGjRjNoPf+x209bsjd/vy/U7ng3Aa564nFinHPbf9HvRPqkJ6ewRU33U7jhmdQu2b1mH4Wv5x4Ul2u73oFrZt3YefOTD4aNoCvRn1L1apVaNe+JU0bX8zOnZlUqFAu6FB99/Y7H/HGG4MYOOiVPcpTUo7lwlZNWLlydUCRFY6wXChVS30fmjVrzIrlq/jllzUAPPPcgzz80HM4F47/03dpUP9Ujj7qyD3KVv6SRoP6pwLQqOGZjBk/KXfb2AnfU7VK5T0S9jEVylHvhDoAlC6dTK3q1Vi3YVMMoo+N40+ozcwZc9m+fQfZ2dl8/900Lrq4Nd1uuppXXkpl585MADZu/C3gSP03adJUfvt981/K+/Z9lAf+8dTh//sISUtdSX0fLu1yEUOHfg5A+w6tWLNmHQsWLA44qtioXbM630ycDMBX307k13UbAcjI2M7Adz/m1u7X7ve9aWvX8cPSnzjt5BNiEmss/LBoKY0aN6BsuTIkJZWiVeumHJtShdp1atKocQNGf/Mxw794lzPOPDXoUANx8cWtWZO2lnnzFgUdSoG5HBf1cjBmNtDM1pvZgjxlL5jZYjObZ2afmFmZPNseMLNlZrbEzNrkKW/rlS0zs/uj+Ry+JnUz+8PMtnrLDjPLNrOtftZZUMWLF6d9h5Z8+skXJCWV4u939+TpJ18KOqyYeeIffRgybARXdL+N9IztFC8e6aF7feC7XH/lJSQnJ+3zfRkZ2+nz4JPcd/vNHFG6dCxD9tXSH3/i1Zf6M/STgXw0bAAL5v1AdnY2iYnFKFP2aNq0uJxH/vk8/33r5aBDjbmkpFLcf99tPPpY34PvfDgo3Jb6W0DbvcrGAKc4504DfgQeADCzesBVwMnee94ws2JmVgx4HWgH1AOu9vY9IF/71J1zuX/bm5kBnYBz97e/mfUAegCUKlGBEsWP8jO8fbqwdVPmzlnIhvWbqHfy8VSvUY1Jk0cCULVqZSZMGk6Lppewfv3GmMcWC7WqV6P/y08D8POq1Uz4fhoA8xcuYcy3k/jXGwP4Y1s6ZkbJEiW4pktHMrOyuPPBJ+nQujkXNjsvyPB98d7gobw3OHJh+MGH72LNml+pe3wtRg6PXHOZPXMeOc5RvnxZNm36PchQY6p27RrUqHEcM2eMASAlpQrTpo6m8XkdWLduQ8DR5Z/LKsRjOTfBzGrsVfZVntUpQBfvdSdgiHPuT2CFmS0Dzva2LXPOLQcwsyHevgf8syhmF0pdpMPtUzN7BNjnnxHOuVQgFeDoI2oH0kHX5fKLGfpxpOtl0cIfqVPz7Nxt8xaOp1mTzvwW4h/upt83U75sGXJycuj39hCu6NwegHf+s7s19vqAd0lOKsU1XTrinOPhZ16mVvVqdL3q0qDC9lWFCuXYuPE3qqZU4aKOrWnT8nJcTg7nNzmHSROnUrtODUoUL16kEjrAggWLqZpyeu760h+ncG6jdofteXD56CvP2wD1pHr5K1rdgQ+911WJJPldVntlAL/sVX7OwQ7sa1I3s7y/8gSgAbDDzzoLIjk5iebNz+PO2x8MOpSYuOeRZ5k+ex6bN2+lZefruPWm68nYvp0hw0YA0KppYy7p0PqAx5g9byGfjxpL3do1uKxrLwDuuLkrTRqffcD3HU4Gvfsa5cqVITMzi3v//hhbt/zBe4P/x6tvPM3EKSPI3JlJ71vuCzpM3w0e/DpNmzSiQoVyrFg+g8cf78ugt4YEHVbhyUdSz9sAzS8zexDIAt47lPcf9Ph+XrE2s0F5VrOAn4H+zrn1B3tvUC31eLTx5zFBhxA3qtTau5uy6NqyI/3gOxURmTvTrKDH2HBh06hzzjFjxh+0Pq/7ZYRz7pQ8Zd2Am4GWzrkMr+wBAOfcM976aOBR7y2POufa7Gu//fG7T/1GP48vIlJY8tP9cijMrC1wL9B0V0L3DAfeN7N/AccCdYFpgAF1zawmkEbkYuo1B6vH79Evx5vZ2F3DeszsNDN7yM86RUQOhcu2qJeDMbMPgMnACWa22sxuAl4DjgTGmNkcM3sTwDm3EPiIyAXQUUAv51y2cy4L6A2MBn4APvL2PXDdPne/jAfuAfo5587wyhbk/XNkf9T9spu6X3ZT98tu6n7ZrTC6X35t0izqnFN5wrgC1+cXv0e/JDvnpkVGM+YqxIFDIiKFw+XEbZ7OF7+T+kYzqw04ADPrAqz1uU4RkXzzu089VvxO6r2IDPs50czSgBXA/u8zFxEJiHNqqUcjDRgEfAuUA7YCXYHHfa5XRCRf1FKPzmfAZmAWsMbnukREDllOFKNaDgd+J/UU55yGK4hI3AvLhVK/p9793syK5pykInJYcTkW9RLP9ttSN7PP8Uat7ItzrmMUxz8f6GZmK4A/idwh5bypJ0VE4sbh/oyPXQ7U/VIYkyS3K4RjiIj4Lt5b4NHab1J3zo0v6MGdcysLegwRkVgoMkMazawu8AyRJ2+U2lXunKvlY1wiIjGVHZLRL9FcKB0E/IfI7f3NgXeAd/0MSkQk1pyzqJd4Fk1ST3LOjSUy+ddK59yjQAd/wxIRia3Qj37J408zSwCWmllvIneJHuFvWCIisRWW0S/RtNTvAJKB24GzgOuJ3OovIhIaRaal7pyb7r3cBuhJRiISStk5ft+LGRvRjH75ln3chOSca+FLRCIiAQhL90s0fep353ldCrgMPehCREImJ85HtUQrmu6XmXsVfWdm03yKR0QkEPE+VDFa0XS/lMuzmkDkYunRvkUkIhKAotT9MpNIn7oR6XZZAdzkZ1AAO7J2+l3FYaPRqRpstMsJR6YEHULcmLpjSdAhhEqR6X4BTnLO7chbYGYlfYpHRCQQYRn9Es2n+H4fZZMLOxARkSC5fCzx7EDzqVcGqgJJZnYGke4XgKOI3IwkIhIaRaH7pQ3QDUgBXmR3Ut8K/MPfsEREYiv0o1+cc28Db5vZZc65/8UwJhGRmMsJOoBCEk2f+llmVmbXipmVNbMnfYxJRCTmHBb1Es+iSertnHObd604534H2vsXkohI7GU5i3qJZ9EMaSxmZiWdc38CmFkSoCGNIhIq8d4Cj1Y0Sf09YKyZDSJysbQb8LafQYmIxFpY+tSjmfvlOTObC7QiMkRzNFDd78BERGKpKLXUAdYRSeiXE5kmQKNhRCRUQt9SN7Pjgau9ZSPwIZHnlDaPUWwiIjGTXQRa6ouBicBFzrllAGbWJyZRiYjEWJw/pS5qBxrSeCmwFvjWzPqbWUsIyT9lIiJ7ycGiXuLZfpO6c+5T59xVwInAt8CdQEUz+4+ZtY5VgCIisRCWCb0OevORcy7dOfe+c+5iIvPAzAbu8z0yEZEYysnHcjBm1sfMFprZAjP7wMxKmVlNM5tqZsvM7EMzK+HtW9JbX+Ztr1GQz
5GvCYSdc78751Kdcy0LUqmISLzJMYt6ORAzqwrcDjRwzp0CFAOuAp4DXnLO1QF+Z/fDhm4CfvfKX/L2O2ThmBVeRKSAsvOxRCGRyLTliUSmKl8LtACGetvfBjp7rzux+4bOoUBLs4P8y3EASuoiIkRGv0S7mFkPM5uRZ+mx6zjOuTSgL7CKSDLfQuSxoJudc1nebquJPK8C739/8d6b5e1f/lA/R7Q3H4mIhFp+RrU451KB1H1tM7OyRFrfNYHNwMdA20IIMSpqqYuIUKijX1oBK5xzG5xzmcAw4DygjNcdA5FBJ2ne6zSgGoC3/Whg06F+DiV1ERHy1/1yEKuAc80s2esbbwksIjI0vIu3T1fgM+/1cG8db/s3zrlDHjmp7hdPSkoVBgx4mUoVK+CcY8CA93nt9YE89FAfut94DRs3Rv7hfPjh5xg1+tuAoy18lY6tyGOvPki5Y8rhnOOTd4cz5L+RazpXdr+My2+8hOzsHL77ejKvPvmf3e+rWpGPxw8mte8g3n1zSFDhF6qKxx7DP1+5n7IVyoKDz94bwccDhlGnXi3uebYPSclJrF29jsd6P0XGtgxaX9KSa3pemfv+2ifVonvbm1m68KcAP4W/SpYsybhv/keJkiVJTCzGsGEjeezxF4MOq0AKa+4X59xUMxsKzAKyiAwDTwVGAkO8hwzNBgZ4bxkADDazZcBvREbKHDIrwD8IvipZqlpMA6tcuSKVK1dkzpwFHHFEaaZM/oIul/+NLl0uIn1bBi+93C+W4ezh1LI1fK+jfMXyVKhUniXzfyS5dBKDRw/g7u7/oFyFsnS/4wbuvP5eMndmUrZ8GX7flPvMFJ7r/wTOORbMWhSTpF4yobjvdZSvWI7yFcvz44KlJJdOYsCoN3mg+8M89PJ9vPbEm8yZMo8OV7bl2OOq0P+FQXu8t9aJNXl2wBNccd51vsc5dcMS3+s4kNKlk0lPzyAxMZEJ4z6hz12PMHXarEBiydqZVuDbPAekXBd1zrlp9btxe1upul88v/66njlzFgCwbVs6ixcvo2rVygFHFTub1m9iyfwfAchI387PS3+mYuUKdOnambdfe5fMnZkAeyT0pm0vIG3VWpYvWRFIzH7ZtP43flywFIici5VLV3FM5QpUq5XCnCnzAJg+cSZN21/wl/de2LkFXw//JqbxBiU9PQOA4sUTSSxenHhtIEarMG8+CpKvSd3Mnjezo8ysuJmNNbMNZuZ/E6aAqldP4fT6JzNt2mwAbunZlRnTv6Jfv76UKXN0wNH5r0pKZU449XgWzFrEcbWqUf+c03lrZD/6Dfs39U4/EYCk5CS69rqG/i8OOsjRDm+VUypR95Q6LJz9Ayt+XMkFbc4DoPlFTal0bMW/7N/y4uaM+bRoJPWEhARmTP+KtWnzGDt2AtOmzw46pAJRUo9Oa+fcVuAi4GegDnDP/nbOO/YzO3ubz6HtW+nSyQz5oB933/0of/yxjdTUwZx00vk0PLsNv/66nuee+2cgccVKUnISzw94khcffpX0bRkkJhbj6DJH0a3Dzbz6+Bs8k/oYAD3uvpH3Uz9ie8b2gCP2T1JyKZ7q/xivPvIGGdsyePqu57m0aycGfPkmyaWTyczM3GP/emecyI7tO1ix5OdgAo6xnJwcGjRsTfWaDWjY4AxOPvmEoEMqEGfRL/HM7wulu47fAfjYObflQDdK5R37Ges+dYDExEQ+HJLKkCGf8tlnowBYv35j7vaBA9/nk2FvxTqsmCmWWIznBzzJqGFj+PaLCQCsW7uBb74YD8DCOT/gchxlypfhlDPr0fKiZtz+z54cedQR5OQ4dv65k48GDQvyIxSaYonFeKr/Y3z1ydeM/3IiAKt++oU+19wLQLVaKTRuee4e72nVqQVff1Y0Wul5bdmylXHjv6NN62YsXBhsP39BxHsLPFp+J/URZrYY2A70NLNjgB0+13nI+vV7gcWLl/LKq/1zyypXrsivv64HoFPHtof1l/ZgHv7X/axY+jPv9fswt2z8qIk0OO9MZn4/m+NqVSOxeCKbN23m/zr3zt2nx99vJCN9e2gSOsADL97DymWr+DB1aG5ZmfJl2LxpM2ZG1zuu49PBw3O3mRktLmrGrZfeEUS4MVehQjkyM7PYsmUrpUqVolXLJrzQ942gwyqQKG//j3u+JnXn3P1m9jywxTmXbWbpRO60ijuNGzfkumu7MH/+D0ybGmmlP/zwc1xxZSdOP+1knHOsXLmaXr3vDzhSf5x+9ql0uLwtSxf9xHtjBgLwxjOpfPbBSB5+6QE+/PZtMjOzePSOpwOO1H+nNTyFdl1as2zRT7z1VeSmwX7PDiClZlUu7Rb5+o7/YhIjPxyV+576557G+rXrWbNqbSAxx1qVKpUYOOBlihVLICEhgaFDP2fkF18HHVaBhOUhGb4OaTSz4kBPoIlXNB5407vL6oCC6H6JV7EY0ni4iMWQxsNF0EMa40lhDGl86bjohzT2WRW/Qxr97n75D1Ac2PV32fVe2d98rldEJF/Upx6dhs650/Osf2Nmc32uU0Qk38LSNeD3kMZsM6u9a8XMahGe6xEiEiKFOPdLoPxuqd9D5MHVy731GsCNPtcpIpJvYWlt+t1S/w7oR6S76jfv9WSf6xQRybccXNRLPPO7pf4OsBV4wlu/BhgMXO5zvSIi+aILpdE5xTlXL8/6t2a2yOc6RUTyLb7b39Hzu/tllpnl3kttZucAM3yuU0Qk38IyoZffLfWzgO/NbJW3fhywxMzmA845d5rP9YuIRCXLwtFW9zupx+xhqyIiBRGOlO7/3C8r/Ty+iEhhifdulWjpGaUiIhD3QxWjpaQuIoK6X0REQkXdLyIiIZIdkra6krqICGqpi4iEilNLXUQkPNRSFxEJEQ1pFBEJkXCkdCV1EREAskKS1pXURUTQhVLfZeeE5bJFwc3dtPzgOxUR4fjZFY6SicWDDiFUwpJx4japi4jEklrqIiIhopa6iEiIZDu11EVEQkPj1EVEQiQsfep+P3haROSwUNgPnjazYmY228xGeOs1zWyqmS0zsw/NrIRXXtJbX+Ztr1GQz6GkLiJCpPsl2iVKdwA/5Fl/DnjJOVcH+B24ySu/CfjdK3/J2++QKamLiBDpfon2v4MxsxSgA/Bfb92AFsBQb5e3gc7e607eOt72lt7+h0R96iIiFProl5eBe4EjvfXywGbnXJa3vhqo6r2uCvwC4JzLMrMt3v4bD6VitdRFRMhf94uZ9TCzGXmWHruOY2YXAeudczOD+BxqqYuIkL+bj5xzqUDqfjafB3Q0s/ZAKeAo4BWgjJkleq31FCDN2z8NqAasNrNE4Ghg0yF8BEAtdRERoPD61J1zDzjnUpxzNYCrgG+cc9cC3wJdvN26Ap95r4d763jbv3Hu0PuClNRFRPBl9Mve7gPuMrNlRPrMB3jlA4DyXvldwP0F+RzqfhERAQrQOD7QMccB47zXy4Gz97HPDuDywqpTSV1EBMgOyR2lSuoiImjuFxGRUPGj+yUISuoiIqilLiISKmGZpVFJXUQEPSRDRCRU1P0iIhIi
YUnquqN0P/qnvsia1XOZM3ts0KHEXP/UF0lbPZfZeT77s888xPz545k1cwwff/xfjj76qAAjDE5R/l4ALPphEtOmjWLylC+YOGk4AE899QCzZo9l6tQv+WBIv8P2u+Gci3qJZ0rq+/HOOx/R4aJrgw4jEG+/8xEX7fXZvx47gfr1W3DmWReydOly7ruvd0DRBasofy92adfuahqd254Lzu8IwDffTKJhg9acc047li1dwd133xpwhIcmBtMExISv3S9mdteBtjvn/uVn/QUxcdJUqldPCTqMQEzax2f/+usJua+nTp3FZZd2iHVYcaEofy/2Z+zYibmvp02fzSWd2wUYzaELy+gXv1vqDYCeRCaBrwrcApxJZOL4Iw/wPolj3bpdxajR3wYdhgTAOcfwzwcz6bvPubH71X/ZfsMNl/PVV+NiH1ghyHY5US/xzO8LpSnAmc6VeIlVAAAH60lEQVS5PwDM7FFgpHPuOp/rFZ/cf//tZGVl8f77w4IORQLQqlUX1q5ZxzHHlOfzz9/lxyU/8d130wC4595eZGVlM2TIpwFHeWjiva88Wn631CsBO/Os7/TK9inv00RyctJ9Dk3y64brr6BD+1bccEPR7E8XWLtmHQAbNmxi+OejadDgdACuu64L7dq1pPuNdwQZXoGoTz067wDTzOwTb70z8Nb+ds77NJHEElXj+8wVMa1bN+Pvd/ekZcvL2L59R9DhSACSk5NISEhg27Z0kpOTaNnyAp595lUuvLApd/a5mbZtrjysvxth6VM3v//kMLMzgQu81QnOudnRvC/opP7u4Ndp2qQRFSqUY926jTz2eF8GvTUkkFgO+bHih2jwXp/98cf7cu+9vSlZsiS//fY7ELlY2qt3gebyPyRB/+zi6XtRMrF4TOurUaMaQ4ZEnuBWLLEYH330GS88/zrz5o+jZMkS/PbbZgCmTZvNHbc/GNPY0jN+LvDP5JRK50b99Vqwbkqsf5ZR8z2pH6qgk3o8idtvTwD0pdgt1kk9nhVGUj+50jlRf70Wrpsatz9L3VEqIgJxP6olWkrqIiJATpz2WuSXkrqICOG5UKqkLiKCWuoiIqGilrqISIhku+ygQygUSuoiIoRnmgAldRERwvOQDCV1ERHUUhcRCRWNfhERCRGNfhERCRFNEyAiEiLqUxcRCRH1qYuIhIha6iIiIaJx6iIiIaKWuohIiGj0i4hIiITlQmlC0AGIiMQD51zUy8GYWVszW2Jmy8wspk9oV1IXESFyR2m0/x2ImRUDXgfaAfWAq82sXgw+AqCkLiICFGpL/WxgmXNuuXNuJzAE6OT7B/CoT11EhELtU68K/JJnfTVwTmEd/GDiNqln7UyzoGMAMLMezrnUoOOIBzoXu+lc7BaWc5GfnGNmPYAeeYpS4+UcqPvl4HocfJciQ+diN52L3YrcuXDOpTrnGuRZ8ib0NKBanvUUrywmlNRFRArXdKCumdU0sxLAVcDwWFUet90vIiKHI+dclpn1BkYDxYCBzrmFsapfSf3g4qKfLE7oXOymc7GbzsVenHNfAF8EUbeFZb4DERFRn7qISKgoqYuIhIiSuohIiBTppG5mNczsBzPrb2YLzewrM0sys9pmNsrMZprZRDM70du/tplNMbP5ZvakmW0L+jMUlkM4F2+ZWZc87w/NuYDc87HYzN7zzstQM0s2s5ZmNtv7Dgw0s5Le/s+a2SIzm2dmfYOOPxbM7EEz+9HMJpnZB2Z2d9AxSRFP6p66wOvOuZOBzcBlRK7m3+acOwu4G3jD2/cV4BXn3KlEbv0Nm/yci6LgBOAN59xJwFbgLuAt4ErvO5AI9DSz8sAlwMnOudOAJwOKN2bM7Cwi46/rA+2BhsFGJLsoqcMK59wc7/VMoAbQGPjYzOYA/YAq3vZGwMfe6/djGWSM5OdcFAW/OOe+816/C7Qkco5+9MreBpoAW4AdwAAzuxTIiHmksXcB8IlzLsM5t5UY3lwjB6Zx6vBnntfZQCVgs3OufkDxBCk/5yILr1FgZglACf/Di7m9x/tuBsr/ZafIzSZnE0n6XYDeQAv/wxP5K7XU/2orsMLMLgewiNO9bVOIdElA5E/PsDvQufgZOMt73REoHvvwfHecmTXyXl8DzABqmFkdr+x6YLyZHQEc7d1w0gc4/a+HCp0JQGfvusuRwMVBByQRSur7di1wk5nNBRayey7kO4G7zGweUIfIn91ht79z0R9o6pU3AtIDis9PS4BeZvYDUBZ4CbiRSHfUfCAHeBM4EhjhfS8mEel7DzXn3CzgQ2Au8CWR+U4kDuiO0nwws2Rgu3POmdlVwNXOuZhNfi+xY2Y1gBHOuVMCDuWwYGaPAtucc0Vi5E88U596/pwFvGZmRqR/tXvA8YiI7EEtdRGREFGfuohIiCipi4iEiJK6iEiIKKlLoTOzbDObY2YLzOxjb9TQoR6rmZmN8F53NLP7D7BvGTO79RDqeFTzlkhYKKmLH7Y75+p7wwF3Arfk3ejdxJTv755zbrhz7tkD7FIGyHdSFwkTJXXx20Sgjjfr4RIzewdYAFQzs9ZmNtnMZnkt+iMAzKytN0PiLODSXQcys25m9pr3upKZfWJmc72lMfAsUNv7K+EFb797zGy6N3viY3mOlTvDIJGJu0RCQePUxTdmlgi0A0Z5RXWBrs65KWZWAXgIaOWcSzez+4jcrfs8kbtVWwDLiNy1uC+vAuOdc5eYWTHgCOB+4JRdc9WYWWuvzrMBA4abWRMid7/ummEwEZhFZAIzkcOekrr4Icmb1REiLfUBwLHASufcFK/8XKAe8F3kXi5KAJOBE4nMhLgUwMzeBXrso44WwA0AzrlsYIuZld1rn9beMttbP4JIkj8Sb4ZBrw7NMCihoaQufti+98yOXuLOOz+MAWOcc1fvtV9hzo5pwDPOuX571XFnIdYhElfUpy5BmQKct2vGQzMrbWbHA4uJzIRY29vv6v28fyzQ03tvMTM7GviDSCt8l9FA9zx99VXNrCKaYVBCTEldAuGc2wB0Az7wZjecDJzonNtBpLtlpHehdP1+DnEH0NybLXEmUM85t4lId84CM3vBOfcVkYeZTPb2GwocqRkGJcw094uISIiopS4iEiJK6iIiIaKkLiISIkrqIiIhoqQuIhIiSuoiIiGipC4iEiJK6iIiIfL/x6gv8DoA708AAAAASUVORK5CYII=\n", - "text/plain": [ - "

" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8AF08ssF6z5K", + "colab_type": "text" + }, + "source": [ + "# Sentiment Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o-tHtMyc6z5L", + "colab_type": "text" + }, + "source": [ + "This notebook details the steps taken to create a sentiment analyzer using data from [WISESIGHT Sentiment Analysis](https://www.kaggle.com/c/wisesight-sentiment/) competition. Competition metric is overall accuracy across `neg`ative, `pos`itive, `neu`tral and `q`uestion classes. We give examples using logistic regression and ULMFit.\n", + "\n", + "The results for logistic regression, FastText, ULMFit, ULMFit with semi-supervised data are as follows:\n", + "\n", + "| Model | Public Accuracy | Private Accuracy |\n", + "|---------------------|-----------------|------------------|\n", + "| Logistic Regression | 0.72781 | 0.7499 |\n", + "| FastText | 0.63144 | 0.6131 |\n", + "| ULMFit | 0.71259 | 0.74194 |\n", + "| ULMFit Semi-supervised | 0.73119 | 0.75859 |\n", + "| ULMFit Semi-supervised Repeated One Time | **0.73372** | **0.75968** |\n", + "\n", + "For more information about the competition, see [1st Place Solution](https://www.kaggle.com/c/wisesight-sentiment/discussion/83564)." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bNjkuQK46z5M", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "e04e1073-d8ca-4bd2-ac1c-2240826a75dc" + }, + "source": [ + "#uncomment if you are running from google colab\n", + "!pip install sklearn_crfsuite\n", + "!pip install emoji\n", + "!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "!pip install fastai\n", + "!wget https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip\n", + "!unzip master.zip\n", + "!mkdir wisesight_data; ls" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting sklearn_crfsuite\n", + " Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.8.3)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (1.12.0)\n", + "Requirement already satisfied: tqdm>=2.0 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (4.28.1)\n", + "Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)\n", + "\u001b[K |████████████████████████████████| 757kB 3.8MB/s \n", + "\u001b[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite\n", + "Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6\n", + "Collecting emoji\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/d7/2746b4dd67375ce253e777ba54869545d24d2b0249ebcf83735c99df68d5/emoji-0.5.3.tar.gz (43kB)\n", + "\u001b[K |████████████████████████████████| 51kB 2.0MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: emoji\n", + " Building wheel for emoji (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for emoji: filename=emoji-0.5.3-cp36-none-any.whl size=42175 sha256=208a827318503334b8daef569a30c9a2bf390eb8da0c6e8326ada209ae4708cf\n", + " Stored in directory: /root/.cache/pip/wheels/86/09/26/f944015841423cd516e8a97f30e29be59e53461aea8b7d3458\n", + "Successfully built emoji\n", + "Installing collected packages: emoji\n", + "Successfully installed emoji-0.5.3\n", + "Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "\u001b[?25l Downloading https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "\u001b[K | 15.7MB 322kB/s\n", + "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.3.0)\n", + "Collecting marisa-trie==0.7.4 (from pythainlp==2.1.dev2)\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/5f/21295ebb1feb1abde1e7652c0a4c182b4c25bdd5dda5a0f5b34d4e88bcc3/marisa_trie-0.7.4-cp36-cp36m-manylinux1_x86_64.whl (870kB)\n", + "\u001b[K |████████████████████████████████| 880kB 2.7MB/s \n", + "\u001b[?25hRequirement already satisfied: nltk>=3.2.2 in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.2.5)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2018.9)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2.21.0)\n", + "Collecting tinydb (from pythainlp==2.1.dev2)\n", + " Downloading https://files.pythonhosted.org/packages/d7/f9/0e871cbf0da678cf1780609dc6aef26a5ed544c86733fc1ceaf134fce52c/tinydb-3.13.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (4.28.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk>=3.2.2->pythainlp==2.1.dev2) (1.12.0)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (1.24.3)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2019.6.16)\n", + "Building wheels for collected packages: pythainlp\n", + " Building wheel for pythainlp (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for pythainlp: filename=pythainlp-2.1.dev2-cp36-none-any.whl size=11014043 sha256=edfa71c88f221b4c0428a99429b522cdd69c512db0dbe00ed8910da6114e2c44\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-foefq95n/wheels/79/4e/1e/26f3198c6712ecfbee92928ed1dde923a078da3d222401cc78\n", + "Successfully built pythainlp\n", + "Installing collected packages: marisa-trie, tinydb, pythainlp\n", + "Successfully installed marisa-trie-0.7.4 pythainlp-2.1.dev2 tinydb-3.13.0\n", + "Collecting fastai==1.0.45\n", + "\u001b[31m ERROR: Could not find a version that satisfies the requirement fastai==1.0.45 (from versions: 0.6, 0.7.0, 1.0.0b7, 1.0.0b8, 1.0.0, 1.0.1, 1.0.2, 1.0.3, 1.0.4, 1.0.5, 1.0.6, 1.0.7, 1.0.9, 1.0.10, 1.0.11, 1.0.12, 1.0.13, 1.0.14, 1.0.15, 1.0.16, 1.0.17, 1.0.18, 1.0.19, 1.0.20, 1.0.21, 1.0.22, 1.0.24, 1.0.25, 1.0.26, 1.0.27, 1.0.28, 1.0.29, 1.0.30, 1.0.31, 1.0.32, 1.0.33, 1.0.34, 1.0.35, 1.0.36, 1.0.36.post1, 1.0.37, 1.0.38, 1.0.39, 1.0.40, 1.0.41, 1.0.42, 1.0.43.post1, 1.0.44, 1.0.46, 1.0.47, 1.0.47.post1, 1.0.48, 1.0.49, 1.0.50, 1.0.50.post1, 1.0.51, 1.0.52, 1.0.53, 1.0.53.post1, 1.0.53.post2, 1.0.53.post3, 1.0.54, 1.0.55, 1.0.57)\u001b[0m\n", + "\u001b[31mERROR: No matching distribution found for fastai==1.0.45\u001b[0m\n", + "--2019-08-20 07:49:28-- https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip\n", + "Resolving github.com (github.com)... 192.30.253.112\n", + "Connecting to github.com (github.com)|192.30.253.112|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://codeload.github.com/PyThaiNLP/wisesight-sentiment/zip/master [following]\n", + "--2019-08-20 07:49:29-- https://codeload.github.com/PyThaiNLP/wisesight-sentiment/zip/master\n", + "Resolving codeload.github.com (codeload.github.com)... 140.82.113.9\n", + "Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: unspecified [application/zip]\n", + "Saving to: ‘master.zip’\n", + "\n", + "master.zip [ <=> ] 3.95M 2.65MB/s in 1.5s \n", + "\n", + "2019-08-20 07:49:31 (2.65 MB/s) - ‘master.zip’ saved [4137118]\n", + "\n", + "Archive: master.zip\n", + "a2b5c41957bc799fe61cd407e7dc191ca6122dcb\n", + " creating: wisesight-sentiment-master/\n", + " inflating: wisesight-sentiment-master/.gitignore \n", + " inflating: wisesight-sentiment-master/README.md \n", + " inflating: wisesight-sentiment-master/exploration.ipynb \n", + " creating: wisesight-sentiment-master/kaggle-competition/\n", + " inflating: wisesight-sentiment-master/kaggle-competition/README.md \n", + " inflating: wisesight-sentiment-master/kaggle-competition/competition.ipynb \n", + " inflating: wisesight-sentiment-master/kaggle-competition/test.txt \n", + " inflating: wisesight-sentiment-master/kaggle-competition/test_label.txt \n", + " inflating: wisesight-sentiment-master/kaggle-competition/test_majority.csv \n", + " inflating: wisesight-sentiment-master/kaggle-competition/test_solution.csv \n", + " inflating: wisesight-sentiment-master/kaggle-competition/text_generation.ipynb \n", + " inflating: wisesight-sentiment-master/kaggle-competition/train.txt \n", + " inflating: wisesight-sentiment-master/kaggle-competition/train_label.txt \n", + " inflating: wisesight-sentiment-master/kaggle-competition/train_model.py \n", + " inflating: wisesight-sentiment-master/neg.txt \n", + " inflating: wisesight-sentiment-master/neu.txt \n", + " inflating: wisesight-sentiment-master/pos.txt \n", + " inflating: wisesight-sentiment-master/q.txt \n", + " creating: wisesight-sentiment-master/word-tokenization/\n", + " inflating: wisesight-sentiment-master/word-tokenization/README.md \n", + " inflating: wisesight-sentiment-master/word-tokenization/data-preparation-and-post-processing.ipynb \n", + " inflating: wisesight-sentiment-master/word-tokenization/wisesight-160-samples-tokenised.label \n", + " inflating: wisesight-sentiment-master/word-tokenization/wisesight-160-samples-tokenised.txt \n", + "master.zip sample_data wisesight_data wisesight-sentiment-master\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "O-eB6ovn_UgH", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "outputId": "6a6b59e2-e0ed-4184-a0fe-ee55cd66f3b2" + }, + "source": [ + "!cd wisesight-sentiment-master/kaggle-competition; ls" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "competition.ipynb test_majority.csv text_generation.ipynb train.txt\n", + "README.md\t test_solution.csv train_label.txt\n", + "test_label.txt\t test.txt\t train_model.py\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Zs8wtP0m6z5O", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import re\n", + "\n", + "import emoji\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from pythainlp import word_tokenize\n", + "from tqdm import tqdm_notebook\n", + "\n", + "#viz\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cRXpcJp16z5R", + "colab_type": "text" + }, + "source": [ + "## Text Processor for Logistic Regression" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "exvcf5XV6z5R", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def replace_url(text):\n", + " 
URL_PATTERN = r\"\"\"(?i)\\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\\s()<>{}\\[\\]]+|\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\))+(?:\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|\\([^\\s]+?\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’])|(?:(?\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
category  texts  processed  wc  uwc\n",
+        "0  neu  เห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั...  เห็น|คน|ลบ|แอพ|viu|ก็|เห็นใจ|และ|เข้าใจ|เขา|นะ...  46  41\n",
+        "1  neu  ไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! ..........  ไป|ชม|ไม้คิว|ของ|แชมป์|และ|รอง|แชมป์|กัน|จ้า|!...  41  39\n",
+        "2  neg  กลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว...  กลุ่ม|รถ|ซีวิค|เป็น|กลุ่ม|ที่|น่า|รำ|คาน|มาก|x...  46  35\n",
+        "3  neu  อยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน...  อยาก|สวย|เหมือน|เจ้าของ|แบรนด์|สิ|คะ|เนย|โชติ|...  72  56\n",
+        "4  neg  ข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก...  ข้าว|โถ|ละ|ร้อย|แพง|เพราะ|ตัก|เป็น|จาน|ๆ|ละ|15...  379  218
\n", + "" + ], + "text/plain": [ + " category texts ... wc uwc\n", + "0 neu เห็นคนลบแอพ viu ก็เห็นใจและเข้าใจเขานะคะ แผลมั... ... 46 41\n", + "1 neu ไปชมไม้คิวของแชมป์ และรองแชมป์ กันจ้า! .......... ... 41 39\n", + "2 neg กลุ่มรถซีวิคเป็นกลุ่มที่น่ารำคานมากกกกกกกกก อว... ... 46 35\n", + "3 neu อยากสวยเหมือนเจ้าของแบรนด์สิคะ เนย โชติกา ใบหน... ... 72 56\n", + "4 neg ข้าวโถละร้อย แพง เพราะตักเป็นจานๆละ15 เต็มที่ก... ... 379 218\n", + "\n", + "[5 rows x 5 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GojEjj2k6z5m", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 + }, + "outputId": "1596bac3-8a47-49bf-ed6c-59745c422145" + }, + "source": [ + "#prevalence\n", + "print(train_df[\"category\"].value_counts() / train_df.shape[0])" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "text": [ + "neu 0.544957\n", + "neg 0.253557\n", + "pos 0.180071\n", + "q 0.021415\n", + "Name: category, dtype: float64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "sH1t3bal6z5o", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 + }, + "outputId": "0c50fe48-3671-4940-d4fa-294161e84fa0" + }, + "source": [ + "#prevalence\n", + "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "neu 0.542659\n", + "neg 0.264266\n", + "pos 0.170914\n", + "q 0.022161\n", + "Name: category, dtype: float64\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fhsWvG9c6z5q", + "colab_type": "text" + }, + "source": [ + "## Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oUAqMvNe6z5q", + "colab_type": "text" + }, + "source": [ + "### Create Features" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5eI-DEzW6z5r", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#dependent variables\n", + "y_train = train_df[\"category\"]\n", + "y_valid = valid_df[\"category\"]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ry4GTGaC6z5t", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "30ae3a5e-b10b-4ec6-d907-5323714b1017" + }, + "source": [ + "#text faetures\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", + "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", + "text_train = tfidf_fit.transform(train_df[\"texts\"])\n", + "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", + "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", + "text_train.shape, text_valid.shape" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((20453, 4563), (3610, 4563))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7MnvOFdC6z5v", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "a80b5dc6-7973-4be4-8ab6-1be4945f5f03" + }, + "source": [ + "#word count and unique word counts; actually might not be so 
useful\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler()\n", + "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", + "print(scaler_fit.mean_, scaler_fit.var_)\n", + "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_valid = scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_train.shape, num_valid.shape" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[21.55059635 17.94551802] [1081.91655857 490.1667113 ]\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((20453, 2), (3610, 2))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bj5PA95S6z5w", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "925ecd72-0444-48d0-baa9-1e724d0242f5" + }, + "source": [ + "#concatenate text and word count features\n", + "X_train = np.concatenate([num_train,text_train.toarray()],axis=1)\n", + "X_valid = np.concatenate([num_valid,text_valid.toarray()],axis=1)\n", + "X_test = np.concatenate([num_test,text_test.toarray()],axis=1)\n", + "X_train.shape, X_valid.shape" + ], + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((20453, 4565), (3610, 4565))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 21 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W4prkGZr6z5y", + "colab_type": "text" + }, + "source": [ + "### Fit Model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_2IKPcUL6z5z", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "b8e58817-b2ad-440c-9eee-d20e5468a557" + }, + "source": [ + "#fit logistic regression models\n", + "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", + "model.fit(X_train,y_train)\n", + "model.score(X_valid,y_valid)" + ], + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.7257617728531855" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 22 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jFGtgFHF6z51", + "colab_type": "text" + }, + "source": [ + "### See Results" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7cub1c5S6z51", + "colab_type": "code", + "colab": {} + }, + "source": [ + "probs = model.predict_proba(X_valid)\n", + "probs_df = pd.DataFrame(probs)\n", + "probs_df.columns = model.classes_\n", + "probs_df[\"preds\"] = model.predict(X_valid)\n", + "probs_df[\"category\"] = valid_df.category\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "probs_df[\"processed\"] = valid_df.processed\n", + "probs_df[\"wc\"] = valid_df.wc\n", + "probs_df[\"uwc\"] = valid_df.uwc\n", + "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", + "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "pa4Q0nPS6z54", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "outputId": "3e392b32-54c6-4fda-e819-af43e9ba0fd7" + }, + "source": [ + "from sklearn.metrics import 
confusion_matrix\n", + "\n", + "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", + "print(model.score(X_valid,y_valid))\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", + " xticklabels=model.classes_, yticklabels=model.classes_)\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.show()" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.7257617728531855\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAEKCAYAAADticXcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XeYFFXWx/HvmYDMkCTnJWNGVDCh\ngqIIGFjjK2vCsKyIrrqiYlizggnXhIqiYsSwawbDuqgYABGQHEYxMIBIljjMzHn/6AJHhKGH6epu\nit/Hpx66b1fXPV0Ohzunbt02d0dERKIhI9UBiIhI4iipi4hEiJK6iEiEKKmLiESIkrqISIQoqYuI\nRIiSuohIhCipi4hEiJK6iEiEZKU6gK0Z3Pgs3eoaGF68INUhpI2xS2anOoS0UVRcnOoQ0kZhQb6V\n9xgbFn8Xd87JrtW83P2FRSN1EZEISduRuohIUhUXpTqChFBSFxEBKCpMdQQJoaQuIgK4R+MahZK6\niAhARC48K6mLiABopC4iEiG6UCoiEiEaqYuIRIdr9ouISIToQqmISISo/CIiEiG6UCoiEiEaqYuI\nRIgulIqIRIgulIqIRIe7auoiItGhmrqISISo/CIiEiEaqYuIREjRhlRHkBBK6iIioPKLiEikqPwS\nDRWq5nLk3RdSY7dG4M7/+j3BzxPyANi3dzc6/PNMnmpzEeuWrWKXarkceW9vqjWpQ+H6DYzq9wRL\nZ81L8SdIjDoNanPdA/2pUas67s7bL7zLa0P/Q8u9WnDlwMupsEsFigqLuP+6B5gxaRaHdTmUC646\nj2IvpqiwiIduGsyUr6am+mMkXKNG9Rk69F/UrVMLd2fo0Bd5+JGnGHDn9Rx33NEUFGzgu+9+4K+9\nr2TFipWpDjepju3SiUGDbiUzI4Onnn6Ju+95JNUhlU9ERurm7qmOYYsGNz4rKYEdNehvLBg3ixnD\nPyYjO5OsnF0oWLmGyvVr0OmeC6neogGvdr+BdctWccj1Pdmweh3j//U6u7aozxG39+KtngNCj3F4\n8YLQ+6hZpwY169Rk9tQ55FTK4cn3HuO682/k77dczCtP/Juxo8Zx8FEH0rPP/3HZaVeSk1uRtWvW\nAdB8j+bc8tg/ObvjeaHHOXbJ7ND7KKlevTrUq1eHSZOmUrlyJcZ8OYJTT7uQRo3qM2rU5xQVFXHH\n7dcCcP0N4f8slFSUwiSUkZHBjGmj6dq9J/PmLWDMlyM46+yLmTFjTkriKSzIt/IeY93o5+LOORUP\nP7vc/YUlI9UBpFKFKjk0OGg3Zgz/GIDiDUUUrFwDQIebzuLLO4ZT8h+9Gq0akv/FdACWf7uAKo1r\nkVOratLjDsOSRUuZPTX2F3Lt6rX8MOcHateLjU4rVckFoFKVSiz+eUlsnyChA+TkVoQ0HRyU18KF\ni5g0KfYbyKpVq5k5M4+GDevx3/9+SlFR7GaVseMm0rBR/VSGmXQHtt+Pb7/9nrlzf2TDhg288sqb\nnHjCsakOq1y8aEPcWzrbqZN6lca1Wbv0V44a1JvTRt5Op7svJCtnF5p22Z/VC5exZMaPv9t/8Ywf\nad6tHQB12janSsNaVK5fIxWhh6peo7q02rsl0yfO4KGbBtPnht689tVLXPzPixgy4MlN+x3etQPP\nffI0dw27g4FX3pvCiJOjSZNG7Nt2L8aNm/i79l7nns77749KUVSp0aBhPX6aN3/T83n5C2jQoF4K\nI0oAL45/2wYze8rMFpnZ1BJtN5tZvplNCrbuJV671szyzGyWmR1bor1r0JZnZv3j+RihJnUz+9XM\nVm62/WRmr5tZ8zD7jkdGVia1927K1Gc/4tVuN1C4Zj3t/3EyB1xyIuPue+0P+0945G0qVK3E6e/d\nwT69urB42g8UF0WjDrdRTm5FbnviZh66aTBrVq2hxzkn8PDNj3Jq+548fMtgrrmv36Z9R7/3OWd3\nPI/rL7iRC67qlbqgk6BSpVyGv/Q4/frdzK+/rtrUfs01l1JYWMRLL72ewugkIYqL49+27Rmg6xba\n73f3tsE2AsDM9gTOAPYK3jPYzDLNLBN4BOgG7An0DPYtVdgj9X8BVwENgUZAP+BFYDjw1OY7m1lv\nMxtvZuM/WxV+bW7VgqWsWrCURZO+BeDbEeOovXdTqjSuzenv38lZX9xP5fo1OG3k7eTUrsaGVWsZ\ndeUQXul6PR9d/hgVa1Rh5Y+/hB5nsmRmZXLbEzfz4esf8enIzwDoeloXPhkxGoBRb3/CHm13/8P7\nvhk7hQZ/qk+16tEoRW0uKyuLl4cPYfjwN3jzzfc2tZ999ml079aZc3tdmsLoUmN+/kIaN2qw6Xmj\nhvWZP39hCiNKgASO1N39U2BpnD33AIa7+3p3nwvkAQcGW567f+fuBcTyZo9tHSzspH6iuz/u7r+6\n+0p3HwIc6+4vA9U339ndh7h7O3dvd1jlViGHBmt/WcGqBUvZtXmsHtqow178MvV7ntmvL88fegXP\nH3oFqxYs5dVuN7D2lxVUqJpLRnYmAHv07MSCsTPZsGpt6HEmyzX39eOHvB95Zchvv6Us+XkJbQ/Z\nF4D9D9uPeXPzAWjY9Le/0K33bkV2hQqsWBbN2R+PP34PM2fO4YEHn9jU1uWYTlz5j4s45dTzWbt2\nXSnvjqavxk+iZctmNG3amOzsbE4/vQdvv/NBqsMqn8SO1LfmEjObHJRnNubAhsBPJfaZF7Rtrb1U\nYU9pXGNmpwMbs8SpwMa/AWlxZW30P4dx9EN9yMzOYsWPixh15ZCt7lu9ZQM63/833GHZ7HmMuuqJ\nre67o9mn/d50PbUL307/jqEfPA7AEwOHcvdVg/j7rX3JzMqkYF0B91w9CICO3Y/g2FOPobCwkPXr\nCri5z22pDD80hx7anrPOPJUpU2YwbmxslH7jjXcxaNCtVNilA
iPefRGAceMmcMml16Uy1KQqKiri\nsstvYMS7L5KZkcEzw15m+vTkzkxKuDLMUzez3kDvEk1DgkFraR4FbiOW+24D7gPOL2OU244tzCmN\nQd38AeAQYh9kDHAFkA8c4O6fbe29yZrSuCNIxpTGHUWypzSms1ROaUw3iZjSuPbdf8Wdc3KOu3yb\n/ZlZU+Add9+7tNfM7FoAdx8QvPY+cHOw683ufmzQ/rv9tibUkbq7fwecsJWXt5rQRUSSLuQ7Ss2s\nvrtvHKGdBGycGfMW8KKZDQIaAK2AcYABrcysGbGB8BnAX7bVT6hJ3cxaE/uVo27wL1IbYnX228Ps\nV0SkzBL4m4+ZvQR0AmqZ2TzgJqCTmbUlVrX4HvgbgLtPM7NXgOlAIdDXg2/sMLNLgPeBTOApd5+2\nrb7Drqk/QWz2y+MA7j7ZzF4ElNRFJL0kcKTu7j230Dy0lP3vAO7YQvsIYERZ+g47qee6+ziz35Wf\novHtriISLRG5RhF2Ul9sZi0IZrqY2amArvqJSPrRKo1x6QsMAXY3s3xgLnBmyH2KiJRdYTSKCGEn\n9XzgaWAUUANYCZwL3BpyvyIiZRORRenCTupvAsuBCcD8bewrIpI6qqnHpZG7b2lRGxGR9BKRpB72\n2i9fmNk+IfchIlJ+CVzQK5XCHqkfBvQys7nAemJ3SLm7twm5XxGRsgm+9GRHF3ZS7xby8UVEEiMi\n5Zew1375Iczji4gkjJK6iEiEpHmtPF5K6iIigBdrnrqISHSo/CIiEiGa/SIiEiEaqYuIRIiSuohI\nhGhBLxGRCNFIXUQkQjSlMVx3rZmc6hDSRt6sN1IdQtqo31yLfm60fN3qVIcQLZr9IiISHa7yi4hI\nhKj8IiISIVr7RUQkQjRSFxGJkEJdKBURiQ6VX0REIkTlFxGR6NCURhGRKInISD0j1QGIiKSFYo9/\n2wYze8rMFpnZ1BJt95jZTDObbGavm9muJV671szyzGyWmR1bor1r0JZnZv3j+RhK6iIiEFsmIN5t\n254BNl/T4kNgb3dvA8wGrgUwsz2BM4C9gvcMNrNMM8sEHgG6AXsCPYN9S6WkLiJC7DtK4922eSz3\nT4Glm7V94O6FwdMxQKPgcQ9guLuvd/e5QB5wYLDluft37l4ADA/2LZWSuogIlKn8Yma9zWx8ia13\nGXs7HxgZPG4I/FTitXlB29baS6ULpSIiUKb11N19CDBke7oxs+uBQuCF7Xn/tiipi4hAUma/mFkv\n4Higs/umr1rKBxqX2K1R0EYp7Vul8ouICCR09suWmFlX4GrgRHdfU+Klt4AzzGwXM2sGtALGAV8B\nrcysmZlVIHYx9a1t9aORuogI4EWJu/nIzF4COgG1zGwecBOx2S67AB+aGcAYd7/I3aeZ2SvAdGJl\nmb7uXhQc5xLgfSATeMrdp22rbyV1ERFIaPnF3XtuoXloKfvfAdyxhfYRwIiy9K2kLiICcU1V3BEo\nqYuIQGSWCVBSFxEBiMZ6XkrqIiIAXhiNrL5TJ/V7HryFo7p0ZMnipXQ57GQArry2L8d0O5Li4mKW\nLF7KlZf8k0ULf6FqtSrc89CtNGnamPXr13PVpTcxe2Zeij9B+dxw5yA+/XwcNarvyhvPPwbAlf8c\nwPc/zgPg11WrqFK5Mv8e9ghTps/i5rseBMBxLj7/TI7u2IH16ws4t+9VFGzYQFFhEccceRiXXHh2\nyj5TGC7q24uzzjkNd2fG9Nlc2qc/d913E2332wcz+Dbvey7t05/Vq9ds+2ARk5GRwdgxI5mfv5Ae\nJ52b6nDKJxo5Hftt/nt6aVKzTeiBHXjIAaxZvYZBg+/YlNQrV6nEql9XA9Cr919o1bo51/e7netu\n/gerV6/hgXseo0Wrptx29/X85aS/hh0iAHmz3gjluOMnTSE3J4frbrt3U1Iv6Z6HnqBypVz6nH8m\na9etIzsrm6ysTH5ZvJRTzr2Y/735ApmZGaxdu47c3Bw2FBZyTp9+9L/sb+y79x6hxFy/+eZrJIWr\nXv26vPv+i3Q4sDvr1q3nyWf+xX8/+IR33v5g08/JbXdeyy+/LOHB+7frBsPttnzd6qT2tyWXX9ab\nAw5oQ9UqVVKa1AsL8q28x1h2Wqe4c071Vz8ud39h2alvPhr35dcsX7bid20b/6IC5ObmsPH/cqvd\nmvPF6HEAfDvnexo1bkCt2jWSFWoo2rXdh2pVq2zxNXfnvf99SvdjOgGQU7EiWVmZAKwvKIDYPFvM\njNzcHAAKCwspLCwkmIMbGVlZWVTMqUhmZia5uTksXLjodz8nFSvuQroOjsLUsGF9unfrzFNPvZTq\nUBKjuAxbGtupk/rWXHX9pXw5+QP+fOpxDBrwCADTp82m6/GdAdh3/71p2Lg+9RrUTWWYofr6m6nU\nrF6dJo1/Wz9o8rSZ9Djzb5x0Th9uvOqSTUm+qKiIU87tyxHH9+SQ9vvRZq/dUxV2wi1c8DOPPDSU\nSdM+Ztqcz1m58lc+/t/nADw4eADT876gVevmPPn4cymONPkG3XcL/a+9neKIfGNQIldpTKVQk7qZ\n/WpmK4NtnZkVmdnKMPtMhHvueIhD2nThjdfe5dwLY/cQPPrAUKpWq8KIj1+h1197Mm3KTIoTeAda\nuhnx4cd0P6bj79ra7LU7b77wOMOffIAnn3uF9esLAMjMzOTfwx7ho9efY8r02cz57vsURByOartW\npVv3zhywz1Hs3fowcnNzOe3/TgTg7xdfy96tD2P27G/588ndUxxpch3X/WgWLVrMhIlTUh1K4mik\nvm3uXsXdq7p7VSAHOAUYvLX9Sy5nuWrd0q3tljRvvPou3U44GoiVZa669Ea6dzqdK/pcT42a1fnx\nh3kpjjAchYVF/PeTL+ja+Ygtvt6i6Z/Izcn5Q/KuWqUyB+7fhs/GjE9ClMnRsdOh/PDDPJYsWUZh\nYSHvvP0B7Q/ab9PrxcXFvP7au5zQ49hSjhI9hx7ajhOO70Le7DG88PxgjjyyA8OeeTDVYZWLF8a/\npbOklV885g1gqz/97j7E3du5e7vKFVNTr27a/E+bHnfpfiTfzpkLQNWqVcjOjk0WOuPsUxj35YTf\n1VWjZMz4iTRv0oh6dWpvaps3fyGFhbFvfJm/8Gfm/vATDevXZemy5az8dRUA69av58uvJtKsSeMt\nHndHNG/efNq1b0tOTkUAjuh4CLNnfUezEj8nXbt3Zs7s71IVYkpcf8NAmjZvR8vWB3PmWRczatTn\nnNvr76kOq1y8OP4tnYU6pdHMTi7xNANoB6wLs8+yeHDIXRzSoR3Va+7KmCkfcv/AwRx5zOE0b9mU\n4uJi8n9awHX9bgOgZetm3PfI7TgwZ2YeV/39ptQGnwBX3TSQryZOZvnylXT+81lcfMHZnHLCsYz8\n7yd0O7rT7/adMHkaQ597haysLDIyjBv69aX6rtWYlTeX62+/l6LiYrzYOfaow+nU4aDUfKAQTBg/\nmbfffJ//jX6DwsJC
pkyewbNPD+f1d56lSpXKmBnTps6k3xU7/s/DTi/Nk3W8Qp3SaGZPl3haCHwP\nPOHui7b13mRMadxRhDWlcUeU7CmN6SwdpjSmi0RMafzlmI5x55zaH36StlO8Qh2pu/t5YR5fRCRR\n0r2sEq+wZ7+0NrOPzGxq8LyNmd0QZp8iItvDiyzuLZ2FfaH0CWILw28AcPfJxL69Q0QkrehCaXxy\n3X3cZncYpvmEIBHZGXlxeo/A4xV2Ul9sZi0gdre9mZ0KLAi5TxGRMkv3EXi8wk7qfYEhwO5mlg/M\nBc4MuU8RkTJz10g9HvnA08AooAawEjgXuDXkfkVEykQj9fi8CSwHJgDzQ+5LRGS7Faf5rJZ4hZ3U\nG7m77hYRkbQXlQulYU9p/MLM9gm5DxGRcvNii3tLZ1sdqZvZ28BWb5t19xPjOP5hQC8zmwusByz2\nVm9T1kBFRMIUle85Ka38cm8Cjt8tAccQEQlduo/A47XVpO7un5T34O7+Q3mPISKSDDvNlEYzawUM\nAPYEKm5sd/fmIcYlIpJURRGZ/RLPhdKngUeJ3d5/JPAs8HyYQYmIJJu7xb2ls3iSeo67f0Rs7fUf\n3P1m4LhwwxIRSa5Ezn4xs8vMbKqZTTOzy4O2Gmb2oZnNCf6sHrSbmT1oZnlmNtnM9i/P54gnqa83\nswxgjpldYmYnAZXL06mISLpxj38rjZntDfwVOBDYFzjezFoC/YGP3L0V8FHwHGITSloFW29ilZHt\nFk9SvwzIBf4OHACcTexWfxGRyEjgSH0PYKy7r3H3QuAT4GSgBzAs2GcY8OfgcQ/g2eB7nMcAu5pZ\n/e39HNu8UOruXwUPVwH6JiMRiaSi4oTdizkVuMPMagJrge7AeKCuu29cpXYhUDd43BD4qcT75wVt\n27WibTyzX0axhZuQ3P2o7elQRCQdleXmIzPrTaxUstEQdx8SO47PMLO7gA+A1cAkoOj3fbmbWSi3\nO8Wz9ku/Eo8rAqegL7oQkYgpLsOsliCBDynl9aHAUAAzu5PY6PtnM6vv7guC8sqiYPd8oHGJtzcK\n2rZLPOWXrzdr+tzMxm1vhyIi6SiRUxXNrI67LzKzPxGrpx8MNCN2PXJg8Oebwe5vAZeY2XDgIGBF\niTJNmcVTfqlR4mkGsYul1ba3QxGRdJTgtV/+HdTUNwB93X25mQ0EXjGzC4AfgNODfUcQq7vnAWso\n57XLeMovXxOrqRuxsstc4ILydBqP5etXh93FDuP4/fqmOoS00bRS3W3vtJOYtO67VIcQKWUpv2yL\nux++hbYlQOcttDuxb4lLiHiS+h7uvq5kg5ntkqgARETSQQJnv6RUPJ/iiy20fZnoQEREUsnLsKWz\n0tZTr0dsrmSOme1HrPwCUJXYzUgiIpGRyPJLKpVWfjkW6EVses19/JbUVwLXhRuWiEhypftCXfEq\nbT31YcAwMzvF3f+dxJhERJKuONUBJEg8NfUDzGzXjU/MrLqZ3R5iTCIiSedY3Fs6iyepd3P35Ruf\nuPsyYnMqRUQio9At7i2dxTOlMdPMdnH39QBmlgNoSqOIREq6j8DjFU9SfwH4yMyeJnaxtBe/LR8p\nIhIJUampx7P2y11m9g1wNLEpmu8DTcIOTEQkmXamkTrAz8QS+mnElgnQbBgRiZTIj9TNrDXQM9gW\nAy8T+57SI5MUm4hI0hTtBCP1mcBo4Hh3zwMwsyuSEpWISJLF8X3SO4TSpjSeTOzrlEaZ2RNm1hki\n8k+ZiMhmirG4t3S21aTu7m+4+xnA7sAo4HKgjpk9amZdkhWgiEgyRGVBr23efOTuq939RXc/gdg6\nMBOBa0KPTEQkiYrLsKWzeGe/AJvuJi31u/lERHZExZbeZZV4lSmpi4hEVVGqA0gQJXUREaIz+0VJ\nXUQE0n5WS7yU1EVESP9ZLfFSUhcRQeWXSKpWrQoPPTKAPfZsjbvTt09/vho3EYBLLr2AOwZcR7Mm\n7Vi6ZFmKI0287F2yue+1e8iukE1mZiajR3zGc4Oe55oHr6ZVm1YUFRYya9JsHuj/IEWFsUtKfW65\niAOPas+6teu57x/3kTf12xR/isSo26AOtzx4PTVq18Ddef35txj+5GtcdPUFdDz2cIqLi1m2ZBk3\nX3Yni39eQsdjD+Oiqy+kuLiYoqIi7rvxQb4ZNyXVHyN0x3bpxKBBt5KZkcFTT7/E3fc8kuqQyiXd\npyrGy9zT85eOapVbJD2wRx+/hy+/+Ipnh71CdnY2ubkVWbHiVxo2rM9Dj9xJq9Yt6Hh4j6Qn9YOr\nt05KPxVzK7JuzToyszIZ9J97efSmx6myaxW+GvUVAP0fvoapY6fyznPv0v7I9vQ47wRuOOdGdt9v\nd/rc8jcuOzH8VSSWFK4KvY+adWpSq25NZk2ZTW6lHJ57fyj9zr+ORfMXsXrVGgD+74JTaN66KQOu\nuY+c3BzWrlkLQMs9WjBwyC2cevhZocc5acl3ofexNRkZGcyYNpqu3Xsyb94Cxnw5grPOvpgZM+ak\nJJ7Cgvxyj7OHNjor7pxzwbzn03ZcH883H+0UqlatTIcO7Xl22CsAbNiwgRUrfgVgwF3Xc+MNd5Gu\n/wAmyro16wDIysoiMysLd9+U0AFmTZpFrfq1ADiky8H8998fATBz4kwqVa1MjTrVkx90CJYsWsKs\nKbMBWLN6Ld/P+Z469WptSugAObk5bPxx2JjQY+0VI/9zAnBg+/349tvvmTv3RzZs2MArr7zJiScc\nm+qwyiUqNx+FmtTN7G4zq2pm2Wb2kZn9YmbhD2G2Q5MmjVm8eCmDH7ub0Z+/xUMP30lubg7djzua\n+fN/ZurUmakOMXQZGRkMfu9hXp70EhNHT2TWpFmbXsvMyqTzyZ0Z//F4AGrVq8kv8xdven3xgsXU\nrFcr6TGHrX6jeuy2T2umTpgOwMX9/8o741+j28nH8Ng9Qzft16nb4bw2+nn+9dzd3HrFwFSFmzQN\nGtbjp3nzNz2fl7+ABg3qpTCi8lNSj08Xd18JHA98D7QErtrazmbW28zGm9n4gg0rQw7t97Kysti3\n7V4MffIFDu9wIqvXrOXa6y7jyn59uPP2+5MaS6oUFxdzcddLOPPAs9mtbWua7Pbbd6Fcekdfpo6d\nytRx01IYYXLl5OZw99Dbue/GBzeN0gcPfILj253KyP98yOnnnbxp349HjubUw8+i3/nXcdHVF6Yq\nZCkHt/i3dBZ2Ut94IfY44FV3X1Hazu4+xN3buXu7CtlVQw7t9/LzF5Cfv5Cvx38DwJtvjGTftnvR\npGljPvvyXSZP+4SGDevx6WdvUadO9EakJa1euZpvvphM+07tADjz8r9QrWY1Hr/1t9UhFi9cQu0G\nv52HWvVrsWTh4j8ca0eVmZXJ3UNv573/fMioEZ/+4fWR//mAzsd1/EP7xDHf0LBJA6rVqJaMMFNm\nfv5CGjdqsOl5o4b1mT9/YQojKj+N1OPzjpnNBA4g9j2ntYF1Ife5XRYtWkx+/
gJatmoGQMdOh/LN\npGm0bHYgbfbqSJu9OpKfv5AjDjuRRYuik7w2qlajGpWqVgKgQsUK7H/EfvyU9xNdzziWdh0PYMAl\nv7+mMObDMRx9SmcAdt9vd9b8upqli6IzK+jGQf2ZO+d7Xnj85U1tjZs12vS407GH833ejwA0atpw\nU/tu+7SmQoVsViwtdfyyw/tq/CRatmxG06aNyc7O5vTTe/D2Ox+kOqxyKSrDls5CndLo7v3N7G5g\nhbsXmdlqoEeYfZbH1VfewpND7ye7Qjbfz/2Jvn2uTnVISVOjTnX63d+PjMwMMjKMT98ezdiPxjFi\n7jv8nL+If70xCIDPR37BCw+8yLj/fUX7o9rz9GdPsX7tOu67Mjolqn0P3IfjTuvKnOnf8sKHTwEw\neMAQevzlOJq0+BPFxc6CeQsZcM29AHQ+riPdT+tK4YZC1q9bz7UX3ZTK8JOiqKiIyy6/gRHvvkhm\nRgbPDHuZ6dNnpzqscknkPHUz2xV4Etib2H1N5wOziH2DXFNi5ejT3X2ZmRnwANAdWAP0cvcJ2913\nmFfqzSwb6AMcETR9Ajzm7hu29d5UTGlMV8ma0rgjSMaUxh1FKqc0pptETGm8/0/xT2m84sfSpzSa\n2TBgtLs/aWYVgFzgOmCpuw80s/5AdXe/xsy6A5cSS+oHAQ+4+0Hb+znCLr88Sqz0MjjY9g/aRETS\nSqJq6mZWjdhAdiiAuxe4+3JiVYphwW7DgD8Hj3sAz3rMGGBXM6u/vZ8j7DtK27v7viWe/8/Mvgm5\nTxGRMktgaaAZ8AvwtJntC3wNXAbUdfcFwT4LgbrB44bATyXePy9oW8B2CHukXmRmLTY+MbPmpP91\nBhHZCRVb/FvJ6dfB1rvEobIIqhLuvh+wGuhfsi+P1b1DKTGHPVK/itgXV28s/jUFzgu5TxGRMivL\naNPdS/sGuHnAPHcfGzx/jVhS/9nM6rv7gqC8sih4PR9oXOL9jYK27RL2SP1z4HFiZailweMvQ+5T\nRKTMivG4t9K4+0LgJzPbLWjqDEwH3gLODdrOBd4MHr8FnGMxBxObLbhdpRcIf6T+LLASuC14/hfg\nOeC0kPsVESmTBN9UdCnwQjDz5TtiFYoM4BUzuwD4ATg92HcEsZkvecSmNJarmhF2Ut/b3fcs8XyU\nmU0PuU8RkTJLZIHb3ScB7bbwUuct7OtA30T1HXb5ZULw6wQAZnYQMD7kPkVEyiwqywSEPVI/APjC\nzH4Mnv8JmGVmU4j9A9Um5P4Jw+0sAAAL5UlEQVRFROJSaNG43zHspN415OOLiCRENFJ6+Gu//BDm\n8UVEEiXdyyrx0neUiojANqcq7iiU1EVEUPlFRCRSVH4REYmQooiM1ZXURUTQSF1EJFJcI3URkejQ\nSF1EJEI0pVFEJEKikdKV1EVEACiMSFpXUhcRQRdKQ7e6YF2qQ0gbY5bNTnUIaUM/FxIWXSgVEYkQ\njdRFRCJEI3URkQgpco3URUQiQ/PURUQiRDV1EZEIUU1dRCRCVH4REYkQlV9ERCJEs19ERCJE5RcR\nkQjRhVIRkQiJSk09I9UBiIikg2I87q00ZlbRzMaZ2TdmNs3Mbgnam5nZWDPLM7OXzaxC0L5L8Dwv\neL1peT6HkrqICODucW/bsB44yt33BdoCXc3sYOAu4H53bwksAy4I9r8AWBa03x/st92U1EVEgCI8\n7q00HrMqeJodbA4cBbwWtA8D/hw87hE8J3i9s5nZ9n4OJXURERJXfgEws0wzmwQsAj4EvgWWu3th\nsMs8oGHwuCHwE0Dw+gqg5vZ+DiV1ERHKVn4xs95mNr7E1nuzYxW5e1ugEXAgsHuyPodmv4iIULZ5\n6u4+BBgSx37LzWwUcAiwq5llBaPxRkB+sFs+0BiYZ2ZZQDVgSRnD30QjdRERYlMa4/2vNGZW28x2\nDR7nAMcAM4BRwKnBbucCbwaP3wqeE7z+P4/jauzWaKQuIkJClwmoDwwzs0xiA+dX3P0dM5sODDez\n24GJwNBg/6HAc2aWBywFzihP50rqIiIkbpkAd58M7LeF9u+I1dc3b18HnJaQzlFSFxEBtPbLTiEj\nI4OxY0YyP38hPU46d9tviJBq1arw0CMD2GPP1rg7ffv056txE+l90Tn8tfdZFBUV8cF7H3PjP8t1\nn8QOJ2/2GH5dtYqiomIKCws5+JDuqQ4pZaJ2LspRxk4rSuql+PulFzJz5hyqVqmS6lCSbuDdN/Lf\nDz/lnLMuITs7m9zcihx+xMEcd9zRdDj4eAoKCqhVe7un0u7Qjj7mNJYsWZbqMNJClM6FRupxMLN/\nlPa6uw8Ks//yaNiwPt27dWbAwAe5/LLe235DhFStWpkOHdrT529XAbBhwwZWrNjABRf+hfvve4yC\nggIAFv+y3bOuRNKOFvSKTzugD7E7phoCFwH7A1WCLW0Nuu8W+l97O8XFUVmQM35NmjRm8eKlDH7s\nbkZ//hYPPXwnubk5tGjZjEM6tOejUf/m3fdeZP/990l1qEnn7owc8RJjx4zkwgvOTHU4KRW1c1Hk\nxXFv6Szs8ksjYH93/xXAzG4G3nX3s0Lut1yO6340ixYtZsLEKXQ84pBUh5N0WVlZ7Nt2L67qdwtf\nj/+GgXf/kyuuvIisrCyqV9+Vzkeewv4HtOGZZx+izd6dUh1uUnU88iTmz19I7do1eW/kcGbNymP0\nZ2NTHVZKRO1cRKWmHvZIvS5QUOJ5QdC2RSVvvS0uXh1yaFt36KHtOOH4LuTNHsMLzw/myCM7MOyZ\nB1MWT7Ll5y8gP38hX4//BoA33xjJvvvuxfz8hbz91vsATPh6MsXFxdSsVSOVoSbd/PkLAfjllyW8\n+eZI2rdvm+KIUidq5yKRa7+kUthJ/VlgnJndHIzSxwLPbG1ndx/i7u3cvV1GRqWQQ9u6628YSNPm\n7WjZ+mDOPOtiRo36nHN7/T1l8STbokWLyc9fQMtWzQDo2OlQZs3M4913PuDwIw4GoEXLpmRXqMCS\nxUtTGWpS5ebmULlypU2Pjzm6I9OmzUpxVKkRxXORqDtKUy3U8ou732FmI4HDg6bz3H1imH1KYlx9\n5S08OfR+sitk8/3cn+jb52pWr17LI48O5MtxI9lQULDpQurOom7d2rz2auwmwKysTIYPf4P3P/g4\ntUGlSBTPRXFEyi+WrnWkrAoN0zOwFKhUoWKqQ0gbqwvWpToESUOFBfnbvf74RnvVPSjunDPt57Hl\n7i8smqcuIgJpP6slXkrqIiJEp/yipC4iQnRuPlJSFxFBI3URkUjRSF1EJEKKvCjVISSEkrqICNFZ\nJkBJXUQELb0rIhIpGqmLiESIZr+IiESIZr+IiESIlgkQEYkQ1dRFRCJENXURkQjRSF1EJEI0T11E\nJEI0UhcRiRDNfhERiZCoXCjNSHUAIiLpwN3j3rbFzLqa2SwzyzOz/kkIfxMldRERYneUxvtfacws\nE3gE6AbsCfQ0sz2T8BEAJXURESCh
I/UDgTx3/87dC4DhQI/QP0BANXURERJaU28I/FTi+TzgoEQd\nfFvSNqkXFuRbqmMAMLPe7j4k1XGkA52L3+hc/CYq56IsOcfMegO9SzQNSZdzoPLLtvXe9i47DZ2L\n3+hc/GanOxfuPsTd25XYSib0fKBxieeNgrakUFIXEUmsr4BWZtbMzCoAZwBvJavztC2/iIjsiNy9\n0MwuAd4HMoGn3H1asvpXUt+2tKiTpQmdi9/oXPxG52Iz7j4CGJGKvi0q6x2IiIhq6iIikaKkLiIS\nIUrqIiIRslMndTNramYzzOwJM5tmZh+YWY6ZtTCz98zsazMbbWa7B/u3MLMxZjbFzG43s1Wp/gyJ\nsh3n4hkzO7XE+yNzLmDT+ZhpZi8E5+U1M8s1s85mNjH4GXjKzHYJ9h9oZtPNbLKZ3Zvq+JPBzK43\ns9lm9pmZvWRm/VIdk+zkST3QCnjE3fcClgOnELuaf6m7HwD0AwYH+z4APODu+xC79TdqynIudga7\nAYPdfQ9gJfAP4Bng/4KfgSygj5nVBE4C9nL3NsDtKYo3aczsAGLzr9sC3YH2qY1INlJSh7nuPil4\n/DXQFDgUeNXMJgGPA/WD1w8BXg0ev5jMIJOkLOdiZ/CTu38ePH4e6EzsHM0O2oYBRwArgHXAUDM7\nGViT9EiT73DgdXdf4+4rSeLNNVI6zVOH9SUeFwF1geXu3jZF8aRSWc5FIcGgwMwygArhh5d0m8/3\nXQ7U/MNOsZtNDiSW9E8FLgGOCj88kT/SSP2PVgJzzew0AIvZN3htDLGSBMR+9Yy60s7F98ABweMT\ngezkhxe6P5nZIcHjvwDjgaZm1jJoOxv4xMwqA9WCG06uAPb946Ei51Pgz8F1lyrACakOSGKU1Lfs\nTOACM/sGmMZvayFfDvzDzCYDLYn92h11WzsXTwAdg/ZDgNUpii9Ms4C+ZjYDqA7cD5xHrBw1BSgG\nHgOqAO8EPxefEau9R5q7TwBeBr4BRhJb70TSgO4oLQMzywXWurub2RlAT3dP2uL3kjxm1hR4x933\nTnEoOwQzuxlY5e47xcyfdKaaetkcADxsZkasvnp+iuMREfkdjdRFRCJENXURkQhRUhcRiRAldRGR\nCFFSl4QzsyIzm2RmU83s1WDW0PYeq5OZvRM8PtHM+pey765mdvF29HGz1i2RqFBSlzCsdfe2wXTA\nAuCiki8GNzGV+WfP3d9y94Gl7LIrUOakLhIlSuoSttFAy2DVw1lm9iwwFWhsZl3M7EszmxCM6CsD\nmFnXYIXECcDJGw9kZr3M7OHgcV0ze93Mvgm2Q4GBQIvgt4R7gv2uMrOvgtUTbylxrE0rDBJbuEsk\nEjRPXUJjZllAN+C9oKkVcK67jzGzWsANwNHuvtrMriF2t+7dxO5WPQrII3bX4pY8CHzi7ieZWSZQ\nGegP7L1xrRoz6xL0eSBgwFtmdgSxu183rjCYBUwgtoCZyA5PSV3CkBOs6gixkfpQoAHwg7uPCdoP\nBvYEPo/dy0UF4Etgd2IrIc4BMLPngd5b6OMo4BwAdy8CVphZ9c326RJsE4PnlYkl+SoEKwwGfWiF\nQYkMJXUJw9rNV3YMEnfJ9WEM+NDde262XyJXxzRggLs/vlkflyewD5G0opq6pMoYoMPGFQ/NrJKZ\ntQZmElsJsUWwX8+tvP8joE/w3kwzqwb8SmwUvtH7wPklavUNzawOWmFQIkxJXVLC3X8BegEvBasb\nfgns7u7riJVb3g0ulC7ayiEuA44MVkv8GtjT3ZcQK+dMNbN73P0DYl9m8mWw32tAFa0wKFGmtV9E\nRCJEI3URkQhRUhcRiRAldRGRCFFSFxGJECV1EZEIUVIXEYkQJXURkQhRUhcRiZD/B16fsAOjNHqO\nAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KIfXHEVn6z58", + "colab_type": "text" + }, + "source": [ + "## [ULMFit](https://github.com/cstorm125/thai2fit) Model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xPQoEbfC6z58", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from fastai.text import *\n", + "from fastai.callbacks import CSVLogger, SaveModelCallback\n", + "from pythainlp.ulmfit import *\n", + "\n", + "model_path = \"wisesight_data/\"\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", + "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RIUQEyb96z5-", + "colab_type": "text" + }, + "source": [ + "### Finetune Language Model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XGpprjbp6z5_", + "colab_type": "code", + "colab": {} + }, + "source": [ + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", + "\n", + "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", + " .split_by_rand_pct(valid_pct = 0.01, seed = 1412)\n", + " .label_for_lm()\n", + " .databunch(bs=48))\n", + "data_lm.sanity_check()\n", + "data_lm.save('wisesight_lm.pkl')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8GiTvaHX6z6A", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "4ca15151-7db8-4bc8-d035-ddcbfc3383e1" + }, + "source": [ + "data_lm.sanity_check()\n", + "len(data_lm.train_ds), len(data_lm.valid_ds)" + ], + "execution_count": 32, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(23823, 240)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Bm7PYDIC6z6E", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "f768001e-09ee-4572-ea48-ac9e38e7296a" + }, + "source": [ + "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", + " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", + "trn_args = dict(drop_mult=1., clip=0.12, alpha=2, beta=1)\n", + "\n", + "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "\n", + "#load pretrained models\n", + "learn.load_pretrained(**_THWIKI_LSTM)" + ], + "execution_count": 33, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (23823 items)\n", + "x: LMTextList\n", + "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (240 items)\n", + "x: LMTextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ 
ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): 
Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", + "learn: LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (23823 items)\n", + "x: LMTextList\n", + "xxbos ประเทศ เรา ผลิต และ ส่งออก ยาสูบ เยอะ สุด ใน โลก จิง ป่าว คับ,xxbos คะ,xxbos อิ เหี้ย ออม ทำ กู อยาก กิน เอ็ม เค,xxbos xxwrep 2 😅,xxbos สวัสดี วัน พุธ แนน อะไร นะ\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (240 items)\n", + "x: LMTextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ 
คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=15000, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 33 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uJK68vJT6z6G", + 
"colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 97 + }, + "outputId": "9bde3724-568f-4630-afb7-df40b3aff0d4" + }, + "source": [ + "#train frozen\n", + "print(\"training frozen\")\n", + "learn.freeze_to(-1)\n", + "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" + ], + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "text": [ + "training frozen\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epoch  train_loss  valid_loss  accuracy  time\n",
+        "0      4.841187    4.462714    0.319742  02:47
" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "axooWmsg6z6I", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "outputId": "819107f9-5d37-49ef-dac6-7cde4204656b" + }, + "source": [ + "#train unfrozen\n", + "print(\"training unfrozen\")\n", + "learn.unfreeze()\n", + "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" + ], + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "text": [ + "training unfrozen\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epoch  train_loss  valid_loss  accuracy  time\n",
+        "0      4.411834    4.205552    0.341766  03:31\n",
+        "1      4.178030    4.037095    0.361508  03:31\n",
+        "2      3.970388    3.930919    0.370139  03:31\n",
+        "3      3.756190    3.890398    0.376191  03:31\n",
+        "4      3.671704    3.890232    0.375595  03:31
" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OZC4BGnB6z6L", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# learn.save('wisesight_lm')\n", + "learn.save_encoder(\"wisesight_enc\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hTTQ76Ls6z6N", + "colab_type": "text" + }, + "source": [ + "### Train Text Classifier" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "A2Z09Mf26z6N", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "27cfec82-23e3-4e0b-dd0d-7b4ba855e646" + }, + "source": [ + "#lm data\n", + "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", + "data_lm.sanity_check()\n", + "\n", + "#classification data\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", + "\n", + "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", + " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", + " .label_from_df(\"category\")\n", + " .databunch(bs=50)\n", + " )\n", + "data_cls.sanity_check()\n", + "print(len(data_cls.vocab.itos))" + ], + "execution_count": 39, + "outputs": [ + { + "output_type": "stream", + "text": [ + "15000\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RjRFWx8-6z6P", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "76a4bb55-a3cd-4d51-d5fc-6c2fec878573" + }, + "source": [ + "#model\n", + "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False,\n", + " output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5)\n", + "trn_args = dict(bptt=70, drop_mult=0.7, alpha=2, beta=1, max_len=500)\n", + "\n", + "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "#load pretrained finetuned model\n", + "learn.load_encoder(\"wisesight_enc\")" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RNNLearner(data=TextClasDataBunch;\n", + "\n", + "Train: LabelList (20453 items)\n", + "x: TextList\n", + "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . 
me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", + "y: CategoryList\n", + "neg,neu,neu,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (3610 items)\n", + "x: TextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: CategoryList\n", + "neu,neu,neg,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): MultiBatchEncoder(\n", + " (module): AWD_LSTM(\n", + " 
(encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " )\n", + " (1): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", + "learn: RNNLearner(data=TextClasDataBunch;\n", + "\n", + "Train: LabelList (20453 items)\n", + "x: TextList\n", + "xxbos กันแดด คิว เพลส ตัวใหม่ นี่ คุม มัน ดีจริง อ่ะ นี่ หน้า มัน ยิ่ง ที โซน ยิ่ง มัน เยอะ นีเวีย หลอด ยาว ๆ ฝา เขียว ก็ เอา ไม่อยู่ อ่ะ แล้ว xxunk,xxbos พบ กับ การ ร่วม ตัว ของ ศิลปิน soul pop สาม ยุค สาม สไตล์ ใน งาน jamnight อะไร ก็ช่าง xxunk ( ชุ่ย ) นำ ทีม โดย soul after six , the parkinson และ the xxup toys งาน นี้ นอกจาก จะ ได้ ดู โชว์ แบบ เต็ม รูปแบบ จาก ทั้ง สาม วง แล้ว ยัง มี โชว์ สุด พิเศษ ที่ ทั้ง สาม จะ ร่วม แจม กัน ด้วย ไม่ อยาก ให้ พลาด เจอกัน วันที่ 29 กันยายน นี้ ที่ ช่าง ชุ่ย ประตู เปิด 19.00 น. เป็นต้นไป สามารถ ซื้อ บัตร ได้ แล้ว ที่ event pop : http : / / go . eventpop . me / jamnight * จำกัด ผู้ ที่ มีอายุ 20 ปี ขึ้นไป # jamnightbyjameson # jamesonthailand # soulaftersix # theparkinson # thetoys,xxbos 👌 🏻 👌 🏻 👌 🏻 xxwrep 2 😆,xxbos จ - ศ แถม ถึง 29 ไม่ทัน มะ ว้า xxrep 4 ,xxbos ใช้ ดี ค่ะ บอกต่อ คือ เป็น คน แพ้ ง่าย มาก กก ใช้ กา นิ เย หรือ พอน ก็ แพ้ แต่ ใช้ ครีม แตงโม แล้ว คือ ดี สิว ลด กันน้ำ ค่ะ นี้ ใช้ ไป เล่น สงกรานต์ มา รอด ค่ะ 555 เล่อ ค่า xxrep 5 \n", + "y: CategoryList\n", + "neg,neu,neu,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Valid: LabelList (3610 items)\n", + "x: TextList\n", + "xxbos เห็น คน ลบ แอ พ viu ก็ เห็นใจ และ เข้าใจ เขา นะคะ แผล มัน ยัง ใหม่ แถม อารมณ์ ยิ่ง โดน xxunk ง่าย อยู่ นี่ เนอะ 5 xxrep 7 ส่วน ทาง นี้ ก็ กอด netflix แน่น มาก เธอ อย่า ทำร้าย เรา นะ เรา รู้ เธอ ไม่ ทำร้าย เรา แน่นอน,xxbos ไป ชม ไม้ คิว ของ แชมป์ และ รอง แชมป์ กัน จ้า ! . xxrep 32 เก็บตก จาก การแข่งขัน แสงโสม สนุกเกอร์ 6 แดง โอเพ่น ประจำปี 2560 สนาม ที่ 2 ณ มัง กี้ สนุกเกอร์ คลับ ซอย โชค ชัย 4 ลาดพร้าว เมื่อ วันที่ 12 ต.ค. 60,xxbos กลุ่ม รถ ซีวิค เป็น กลุ่ม ที่ น่า รำ คาน มาก xxrep 9 อวด รถ กัน ได้ ทุก วินาที อวด ทำไม มึง ก็ ใช้ รถ เหมือนกัน ทุกคน ละ ก็ พวก xxunk ที่ บอ กว่า อวด รถ แต่ ถ่าย นม ตัวเอง ชัด ละ รถ เบลอ นี่ คือ ? 
xxrep 5 ,xxbos อยาก สวย เหมือน เจ้าของ แบรนด์ สิ คะ เนย โชติ กา ใบหน้า สวย ใส xxunk แม้ แต่งหน้า นี่ ขนาด เป็น คุณแม่ แล้ว นะเนี่ย ก็ ยัง สวย ไม่ xxunk ผ่าน ไป กี่ ปี ๆ ก็ ไม่ เปลี่ยน ผิว ดี๊ ดี ความ สวย . เรา สร้าง เอง ได้ ด้วยตัวเอง ถ้า ได้ ใช้ มาส ์กโช ต้อง สวย เหมือน โชติ กา แน่นอน ค่ะ # มาส ์กโช สวย ข้ามคืน # cho _ cosmetics # daradaily # ดารา เดลี่,xxbos ข้าว โถ ละ ร้อย แพง เพราะ ตัก เป็น จาน ๆ ละ 15 เต็มที่ ก็ 5 จาน คนไทย ต้อง กินข้าว ประเทศ xxunk ข้าว กินข้าว ในประเทศ ตัวเอง หม้อ เป็น ร้อย เป็นลม ดีกว่า ค่า ฉะ xxunk ถุง 5 โล ไม่ เกิน 200 เป็น ข้าว มะลิ ไก่ นี่ ไม่รู้ ว่า เป็นตัว หรือเปล่า แต่ ถ้า ตัว ละ 250 บาท แพง ไก่ย่าง ขาย 140 - 160 มี เยอะแยะ ยัง ได้ กำไร แพง สุด ไม่ ควร xxunk 200 ข้าวผัด ปู จาน ใหญ่ 300 ร้อย แพง ถ้า ผัด เป็น จาน ๆ ละ 50 ผัด 4 จาน ก็ เต็ม ถาด ใหญ่ แล้ว ส่วน เครื่อง ดืม ด้านบน อะไร 80 ถ้า เป็น ชาเย็น แพง มาก น้ำ ดืม ขวด ใหญ่ ขวด ละ 50 แพง บ้าน เรา เมืองร้อน อย่า เห็นแก่ตัว 30 ก็ พอแล้ว คน ต้อง ซื้อ เยอะ เบียร์ ช้าง ขวด ละ 120 กำไร xxunk น่าเกลียด มา 3 ขวด 360 แพง xxunk ขวด ใหญ่ ก็ แพง แต่ น้ำแข็ง พอได้ เพราะ อากาศ บ้าน เรา ร้อน ละลาย ง่าย อันนี้ พอ เข้าใจ คน ขาย แต่ ทะเล เผา ทะเล ลวก ไม่เห็น หน้าตา ว่า มี อะไร บ้าง ก็ กุ้ง หมึก ปู xxunk 300 ตัว กลางๆ ใส่ มา อย่าง 5 ตัว ปู สัก ตัว กำไร xxunk ตำ ทะเล ตำ กุ้ง สด 150 ไม่ แพง กุ้ง ชุ ป แป้ง ทอด แพง มาก ๆ ต้มยำ หม้อ ละ 300 ร้อย ถือว่า แพง มาก เพราะ มัน ใส่ ได้ ไม่ เยอะ หรอก มันดี ตรง ที่ มี น้ำ กับ ไฟ อุ่น ร้อน ของ กินใน บ้าน ยัง แทบ แตะ ไม่ได้ ทั้งที่ ป ระ ก็ xxunk คน ในประเทศ ยัง กิน ไม่ อิม ส่งออก นอก พอ มี น้อย เหลือ น้อย ก็ ขาย ให้ กัน แพง ๆ ระบบ แย่ เอาเปรียบ กันเอง ที่ บ้าน ขาย อาหาร กับข้าว ตาม สั่ง ป รุ่ง สุก ใหม่ แค่ จาน ละ 100 - 150 คน ยัง ว่า แพง ทั้งที่ บอ กว่า อร่อย ถ้า ไป เจอ แบบนี้ สงสัย ช็อค ตาย คา ร้าน เลย มัง ถ้า ต้อง ไป เจอ ทั้ง แพง ทั้ง ไม่อร่อย สัก แต่ ทำ ขาย ใคร ว่า ไม่ แพง ยินดี ด้วย ที่ คุณ เป็น คน มี ตัง แต่ เรา มอง จาก ค่าแรง กลางๆ ของ คน ในประเทศ นะ ซึ่ง ส่วนมาก คนใน ประเทศไทย ได้ แต่ ค่าแรงขั้นต่ำ กับ เบี้ย ขยัน เล็กน้อย คนจน เยอะ กว่า คนรวย ด้วย ทำ อะไร ต้อง นึกถึง ความ สมควร นึกถึง กันและกัน แต่ เรา ไม่ใช่ คน xxunk ชอบ มอง และ คิด จาก ความเป็นจริง ถึง ขาย ให้ ชาวต่างชาติ ก็ เถอะ เหมือน คนไทย เอาเปรียบ ช่วย โอกาส และ ไม่ ค่อย xxunk\n", + "y: CategoryList\n", + "neu,neu,neg,neu,neg\n", + "Path: wisesight_data;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): MultiBatchEncoder(\n", + " (module): AWD_LSTM(\n", + " (encoder): Embedding(15000, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " )\n", + " (1): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): 
Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('wisesight_data'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): Embedding(15000, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(15000, 400, padding_idx=1)\n", + " )\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): PoolingLinearClassifier(\n", + " (layers): Sequential(\n", + " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Dropout(p=0.27999999999999997)\n", + " (2): Linear(in_features=1200, out_features=50, bias=True)\n", + " (3): ReLU(inplace)\n", + " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (5): Dropout(p=0.1)\n", + " (6): Linear(in_features=50, out_features=4, bias=True)\n", + " )\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pRgoPD766z6S", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# #train unfrozen\n", + "# learn.freeze_to(-1)\n", + "# learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))\n", + "# learn.freeze_to(-2)\n", + "# learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))\n", + "# learn.freeze_to(-3)\n", + "# learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), 
moms=(0.8, 0.7))\n", + "# learn.unfreeze()\n", + "# learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7),\n", + "# callbacks=[SaveModelCallback(learn, every='improvement', monitor='accuracy', name='bestmodel')])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w54ZOwk66z6U", + "colab_type": "text" + }, + "source": [ + "Training takes about 20 minutes so we use the script `train_model.py` to do it with the following results (validation run):\n", + "\n", + "```\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.812156 0.753478 0.687532\n", + "Total time: 00:56\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.740403 0.699093 0.714394\n", + "Total time: 00:57\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.727394 0.668807 0.723011\n", + "Total time: 01:34\n", + "epoch train_loss valid_loss accuracy\n", + "1 0.722163 0.675351 0.723517\n", + "2 0.675266 0.654477 0.738723\n", + "3 0.669178 0.641070 0.737962\n", + "4 0.612528 0.637456 0.744551\n", + "5 0.618259 0.635149 0.749366\n", + "6 0.572621 0.651169 0.749873\n", + "7 0.561985 0.661739 0.747593\n", + "8 0.534753 0.673563 0.738469\n", + "9 0.530844 0.688871 0.746072\n", + "10 0.522788 0.670024 0.743031\n", + "Total time: 23:42\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vM--oaCJ6z6V", + "colab_type": "text" + }, + "source": [ + "### See Results" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eOCe24KL6z6W", + "colab_type": "code", + "colab": {} + }, + "source": [ + "learn.load(\"bestmodel\")\n", + "\n", + "#get predictions\n", + "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", + "classes = learn.data.train_ds.classes\n", + "y_true = np.array([classes[i] for i in y_true.numpy()])\n", + "preds = np.array([classes[i] for i in probs.argmax(1).numpy()])\n", + "prob = probs.numpy()\n", + "loss = loss.numpy()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LaJYU8f56z6Z", + "colab_type": "code", + "colab": {}, + "outputId": "28603bc9-8cf5-4aba-cfee-836d4c6b5b91" + }, + "source": [ + "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", + "probs_df = pd.DataFrame(to_df)\n", + "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", + "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "(y_true==preds).mean()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8392661555312158" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "l_evHEMM6z6b", + "colab_type": "code", + "colab": {}, + "outputId": "732e91f4-a281-4a70-bf3b-8c6d43cad41a" + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "\n", + "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", + " xticklabels=classes, yticklabels=classes)\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
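The commented-out cell above encodes the ULMFiT gradual-unfreezing schedule that, per the note above, `train_model.py` runs to produce the reported results: unfreeze the classifier head first, then one layer group at a time, training each stage with `fit_one_cycle`. A minimal sketch of that schedule, assuming a fastai v1 text classifier learner named `learn` built as in the preceding cells:

```python
from fastai.text import *                      # fastai v1, as used in this notebook
from fastai.callbacks import SaveModelCallback

# assumes `learn` is the RNNLearner (text classifier) created above
learn.freeze_to(-1)                            # stage 1: train only the classifier head
learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7))

learn.freeze_to(-2)                            # stage 2: unfreeze one more layer group
learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7))

learn.freeze_to(-3)                            # stage 3
learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7))

learn.unfreeze()                               # stage 4: fine-tune the whole network
learn.fit_one_cycle(10, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7),
                    callbacks=[SaveModelCallback(learn, every='improvement',
                                                 monitor='accuracy', name='bestmodel')])
```

The `slice(lr / (2.6 ** 4), lr)` form applies discriminative learning rates, giving the lower layer groups progressively smaller rates than the head; `SaveModelCallback` keeps the checkpoint with the best validation accuracy, which is the `bestmodel` loaded in the "See Results" cell.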
"iVBORw0KGgoAAAANSUhEUgAAAXUAAAEKCAYAAADticXcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xd8FVX6x/HPE0JJsNCkSJBuwYYKKqh0qQqo2AuI+0MRLLjW1bV3cS2rroQFVCyoLCqCgojSlN6LIAiCBKQpIAlIyvn9cYcQkHJDMncuk+/b17z2zpm5c547e+/DyZkzZ8w5h4iIhENC0AGIiEjhUVIXEQkRJXURkRBRUhcRCREldRGREFFSFxEJESV1EZEQUVIXEQkRJXURkRBJDDqA/Xmr6nW61dXTZ8uUoEOIG9t2bg86hLihH8huWTvTrKDHyNy4POpTWrxCrQLX5xe11EVEQiRuW+oiIjGVkx10BIVCSV1EBCA7K+gICoWSuogI4FxO0CEUCiV1ERGAHCV1EZHwUEtdRCREdKFURCRE1FIXEQkPp9EvIiIhogulIiIhou4XEZEQ0YVSEZEQUUtdRCREdKFURCREdKFURCQ8nFOfuohIeKhPXUQkRNT9IiISImqpi4iESHZm0BEUCiV1ERFQ94uISKio++Xwd1TtKjT7T+/c9SOOq8icvkM55qy6HF27CgAljkpm59YMhrd+MHe/0seWp/O455jz4jAW9vsi5nHHQs9eN3J9tyvAORYtXEKvW+7jhm5Xcsut3ahVuzq1qzfkt02/Bx2m7/qnvkj79q1Yv2EjZ5zREoBHH72Hjhe3JifHsX79Rm76Wx/Wrl0XcKSxVbJkScZ98z9KlCxJYmIxhg0byWOPvxh0WAUTkpa6OeeCjmGf3qp6XUwDswTjipn/ZsRFj5Cetim3vMHD15C5NYO5L3+aW9Ys9XZwjg2zfopJUu+zZYrvdeRVpUolvhwzhHMbtGXHjj8Z+M6rjBk9jgXzF7N58xZGfPkezZtcEkhS37Zze0zrO//8c0jfls7AQa/kJvUjjzyCP/7YBkDvXt056aTj6dX7/pjGBRD0L7d06WTS0zNITExkwrhP6HPXI0ydNiuQWLJ2pllBj7Fj4uCoT2mpC64vcH1+KdIt9byqnH8yW1eu3yOhA9S8+BxGXfF07vpxbc5i26oNZGX8GesQYyoxMZFSSaXIzMwiOakUv65dz/x5i4IOK+YmTZpK9eope5TtSugAyaWTideGkd/S0zMAKF48kcTixQ/78+BCcqE0IegA4kXNTo1Y8enkPcoqnXMC2zds4Y8VkT+tE5NLckqvi5jzr2FBhBgza9eu49+v/pf5P0xg8U+T2br1D779ZlLQYcWVxx+/j+U/Tefqqy/h0cdeCDqcQCQkJDBj+lesTZvH2LETmDZ9dtAhFYzLiX6JY74mdTP7w8y27rX8YmafmFktP+vOj4TixajW+kx+HjF1j/KanRux4rPdib7+3y9lUf9RoW+lH13mKNp3aEX9U5pzUp3GJCcnc8WVnYIOK648/PBz1KrdkA8++IRbb70x6HACkZOTQ4OGraleswENG5zBySefEHRIBZOTE/0Sx/xuqb8M3ANUBVKAu4H3gSHAwL13NrMeZjbDzGaMS1/qc2i7VW1+Opvm/8yOjVt3x1IsgertGrJi+O5Ef8wZdWjw4FV0mfIS9f7WhtNu68iJ3S6MWZyx0qz5eaz8eTWbNv5GVlYWnw8fzdnnnhl0WHHpgw+Gcckl7YMOI1Bbtmxl3PjvaNO6WdChFExIWup+96l3dM6dnmc91czmOOfuM7N/7L2zcy4VSIXYXiit1fmvXS/HXnAKW5atIWPtb7llX176RO7r+nddSmb6Dha/NSZWYcbM6l/W0ODs+iQllWL79h00bdaY2bPnBx1W3KhTpybLlq0AoOPFbViy5KeAI4q9ChXKkZmZxZYtWylVqhStWjbhhb5vBB1WwcR5Czxafif1DDO7AhjqrXcBdniv4+KqSmJSSao0OYXv79vzD4eanc7do+ulKJk5Yy7DPx3FuO8+Izsrm3lzF/H2wA/p0fMGbr+zB5UqVWDSlBGMGT2eO3r/5d/mUBk8+HWaNmlEhQrlWLF8Bo8/3pe27Vpw/PG1cTk5rFyVRq9esR/5ErQqVSoxcMDLFCuWQEJCAkOHfs7IL74OOqyCifMWeLR8HdLo9Zu/AjQiksSnAH2ANOAs59x+r77FekhjPIv1kMZ4FushjfFMP5DdCmNI4/aRL0d9SpM63Fk0hzQ655YDF+9ns4ZTiEj8CElL3e/RL8eb2VgzW+Ctn2ZmD/lZp4jIIdHol6j0Bx4AMgGcc/OAq3yuU0Qk/0Iy+sXvpJ7snJu2V1k4nu4qIuFSiC11MxtoZut39VLkKb/NzBab2UIzez5P+QNmtszMlphZmzzlbb2yZWYW1RV5v0e/bDSz2njXdMysC7DW5zpFRPKvcFvgbwGvAe/sKjCz5kAn4HTn3J9mVtErr0ekB+Nk4FjgazM73nvb68CFwGpgupkNd84dcL4Ov5N6LyLjzk80szRgBXCtz3WKiORfVuF1IjjnJphZjb2KewLPOuf+9PZZ75V3AoZ45SvMbBlwtrdtmTfgBDMb4u17wKTud/dLGjAIeIrIXaRjgK4+1ykikn/ORb8cmuOBC8xsqpmNN7OGXnlV4Jc8+632yvZXfkB+t9Q/AzYDs4A1PtclInLo8jGqxcx6AD3yFKV6d8QfSCJQDjgXaAh85MccWH4n9RTnXFuf6xARKbh8JPW8U5rkw2pgmIvc8TnNzHKACkR6NKrl2S/FK+MA5fvld/fL92Z2qs91iIgUnP9DGj8FmkPkHh6gBLARGA5cZWYlzawmUBeYBkwH6ppZTTMrQeRi6vCDVeJ3S/18oJuZrQD+BAxwzrnTfK5XRCR/srML7VBm9gHQDKhgZquBR4jMTDvQG+a4E+jqtdoXmtlHRC6AZgG9nHPZ3nF6A6OBYsBA59zCg9Xtd1Jv5/PxRUQKRyHeKeqcu3o/m67bz/5PERlQsnf5F0C+npnp99wvK/08vohIoYnz2/+jpWeUiohA3N/+Hy0ldRERwOWEYzJjJXUREVD3i4hIqBTi6JcgKamLiIBa6iIioaKkLiISIj4+rzmWlNRFREAtdRGRUNGQRn/dsWVy0CHEjY0/jwk6hLhRpZYm/dxly470oEMIF41+EREJD6fuFxGREFH3i4hIiGjuFxGREFFLXUQkRLJ0oVREJDzU/SIiEiLqfhERCQ8NaRQRCRO11EVEQkRJXUQkRDRNgIhIeOgZpSIiYaKkLiISIhr9IiISImqpi4iEiJK6iEh4uGx1v4iIhIda6iIi4aEhjSIiYaKkLiISIuHoUldSFxEBcFnhyOoJQQcQL+rUrcnE7z/PXX5ZM4eet3bL3d77tpvYsu0nypUvG1yQheyhp/9Fkw5X0fm6W3LLFi9dzrU9+nDJ9T3pde8jbEtP3+M9a39dT8NWlzDo/aGR9XUbuLH3fXS8tgedrr2ZwR99GtPPEAs9et
7AxCkjmDR1JDff2jW3/G83X8/kGaOYNHUkjzx+T4ARxkb/1BdJWz2X2bPH/mXbnXfeTObONMofzr+PnHwscUwtdc+ypSu4oPHFACQkJLB46feM+PwrAKpWrUKLluezalVakCEWus7tL+Sayzryjyf65pY98uzL3N37bzQ84zSGjRjNoPf+x209bsjd/vy/U7ng3Aa564nFinHPbf9HvRPqkJ6ewRU33U7jhmdQu2b1mH4Wv5x4Ul2u73oFrZt3YefOTD4aNoCvRn1L1apVaNe+JU0bX8zOnZlUqFAu6FB99/Y7H/HGG4MYOOiVPcpTUo7lwlZNWLlydUCRFY6wXChVS30fmjVrzIrlq/jllzUAPPPcgzz80HM4F47/03dpUP9Ujj7qyD3KVv6SRoP6pwLQqOGZjBk/KXfb2AnfU7VK5T0S9jEVylHvhDoAlC6dTK3q1Vi3YVMMoo+N40+ozcwZc9m+fQfZ2dl8/900Lrq4Nd1uuppXXkpl585MADZu/C3gSP03adJUfvt981/K+/Z9lAf+8dTh//sISUtdSX0fLu1yEUOHfg5A+w6tWLNmHQsWLA44qtioXbM630ycDMBX307k13UbAcjI2M7Adz/m1u7X7ve9aWvX8cPSnzjt5BNiEmss/LBoKY0aN6BsuTIkJZWiVeumHJtShdp1atKocQNGf/Mxw794lzPOPDXoUANx8cWtWZO2lnnzFgUdSoG5HBf1cjBmNtDM1pvZgjxlL5jZYjObZ2afmFmZPNseMLNlZrbEzNrkKW/rlS0zs/uj+Ry+JnUz+8PMtnrLDjPLNrOtftZZUMWLF6d9h5Z8+skXJCWV4u939+TpJ18KOqyYeeIffRgybARXdL+N9IztFC8e6aF7feC7XH/lJSQnJ+3zfRkZ2+nz4JPcd/vNHFG6dCxD9tXSH3/i1Zf6M/STgXw0bAAL5v1AdnY2iYnFKFP2aNq0uJxH/vk8/33r5aBDjbmkpFLcf99tPPpY34PvfDgo3Jb6W0DbvcrGAKc4504DfgQeADCzesBVwMnee94ws2JmVgx4HWgH1AOu9vY9IF/71J1zuX/bm5kBnYBz97e/mfUAegCUKlGBEsWP8jO8fbqwdVPmzlnIhvWbqHfy8VSvUY1Jk0cCULVqZSZMGk6Lppewfv3GmMcWC7WqV6P/y08D8POq1Uz4fhoA8xcuYcy3k/jXGwP4Y1s6ZkbJEiW4pktHMrOyuPPBJ+nQujkXNjsvyPB98d7gobw3OHJh+MGH72LNml+pe3wtRg6PXHOZPXMeOc5RvnxZNm36PchQY6p27RrUqHEcM2eMASAlpQrTpo6m8XkdWLduQ8DR5Z/LKsRjOTfBzGrsVfZVntUpQBfvdSdgiHPuT2CFmS0Dzva2LXPOLQcwsyHevgf8syhmF0pdpMPtUzN7BNjnnxHOuVQgFeDoI2oH0kHX5fKLGfpxpOtl0cIfqVPz7Nxt8xaOp1mTzvwW4h/upt83U75sGXJycuj39hCu6NwegHf+s7s19vqAd0lOKsU1XTrinOPhZ16mVvVqdL3q0qDC9lWFCuXYuPE3qqZU4aKOrWnT8nJcTg7nNzmHSROnUrtODUoUL16kEjrAggWLqZpyeu760h+ncG6jdofteXD56CvP2wD1pHr5K1rdgQ+911WJJPldVntlAL/sVX7OwQ7sa1I3s7y/8gSgAbDDzzoLIjk5iebNz+PO2x8MOpSYuOeRZ5k+ex6bN2+lZefruPWm68nYvp0hw0YA0KppYy7p0PqAx5g9byGfjxpL3do1uKxrLwDuuLkrTRqffcD3HU4Gvfsa5cqVITMzi3v//hhbt/zBe4P/x6tvPM3EKSPI3JlJ71vuCzpM3w0e/DpNmzSiQoVyrFg+g8cf78ugt4YEHVbhyUdSz9sAzS8zexDIAt47lPcf9Ph+XrE2s0F5VrOAn4H+zrn1B3tvUC31eLTx5zFBhxA3qtTau5uy6NqyI/3gOxURmTvTrKDH2HBh06hzzjFjxh+0Pq/7ZYRz7pQ8Zd2Am4GWzrkMr+wBAOfcM976aOBR7y2POufa7Gu//fG7T/1GP48vIlJY8tP9cijMrC1wL9B0V0L3DAfeN7N/AccCdYFpgAF1zawmkEbkYuo1B6vH79Evx5vZ2F3DeszsNDN7yM86RUQOhcu2qJeDMbMPgMnACWa22sxuAl4DjgTGmNkcM3sTwDm3EPiIyAXQUUAv51y2cy4L6A2MBn4APvL2PXDdPne/jAfuAfo5587wyhbk/XNkf9T9spu6X3ZT98tu6n7ZrTC6X35t0izqnFN5wrgC1+cXv0e/JDvnpkVGM+YqxIFDIiKFw+XEbZ7OF7+T+kYzqw04ADPrAqz1uU4RkXzzu089VvxO6r2IDPs50czSgBXA/u8zFxEJiHNqqUcjDRgEfAuUA7YCXYHHfa5XRCRf1FKPzmfAZmAWsMbnukREDllOFKNaDgd+J/UU55yGK4hI3AvLhVK/p9793syK5pykInJYcTkW9RLP9ttSN7PP8Uat7ItzrmMUxz8f6GZmK4A/idwh5bypJ0VE4sbh/oyPXQ7U/VIYkyS3K4RjiIj4Lt5b4NHab1J3zo0v6MGdcysLegwRkVgoMkMazawu8AyRJ2+U2lXunKvlY1wiIjGVHZLRL9FcKB0E/IfI7f3NgXeAd/0MSkQk1pyzqJd4Fk1ST3LOjSUy+ddK59yjQAd/wxIRia3Qj37J408zSwCWmllvIneJHuFvWCIisRWW0S/RtNTvAJKB24GzgOuJ3OovIhIaRaal7pyb7r3cBuhJRiISStk5ft+LGRvRjH75ln3chOSca+FLRCIiAQhL90s0fep353ldCrgMPehCREImJ85HtUQrmu6XmXsVfWdm03yKR0QkEPE+VDFa0XS/lMuzmkDkYunRvkUkIhKAotT9MpNIn7oR6XZZAdzkZ1AAO7J2+l3FYaPRqRpstMsJR6YEHULcmLpjSdAhhEqR6X4BTnLO7chbYGYlfYpHRCQQYRn9Es2n+H4fZZMLOxARkSC5fCzx7EDzqVcGqgJJZnYGke4XgKOI3IwkIhIaRaH7pQ3QDUgBXmR3Ut8K/MPfsEREYiv0o1+cc28Db5vZZc65/8UwJhGRmMsJOoBCEk2f+llmVmbXipmVNbMnfYxJRCTmHBb1Es+iSertnHObd604534H2vsXkohI7GU5i3qJZ9EMaSxmZiWdc38CmFkSoCGNIhIq8d4Cj1Y0Sf09YKyZDSJysbQb8LafQYmIxFpY+tSjmfvlOTObC7QiMkRzNFDd78BERGKpKLXUAdYRSeiXE5kmQKNhRCRUQt9SN7Pjgau9ZSPwIZHnlDaPUWwiIjGTXQRa6ouBicBFzrllAGbWJyZRiYjEWJw/pS5qBxrSeCmwFvjWzPqbWUsIyT9lIiJ7ycGiXuLZfpO6c+5T59xVwInAt8CdQEUz+4+ZtY5VgCIisRCWCb0OevORcy7dOfe+c+5iIvPAzAbu8z0yEZEYysnHcjBm1sfMFprZAjP7wMxKmVlNM5tqZsvM7EMzK+HtW9JbX+Ztr1GQz
5GvCYSdc78751Kdcy0LUqmISLzJMYt6ORAzqwrcDjRwzp0CFAOuAp4DXnLO1QF+Z/fDhm4CfvfKX/L2O2ThmBVeRKSAsvOxRCGRyLTliUSmKl8LtACGetvfBjp7rzux+4bOoUBLs4P8y3EASuoiIkRGv0S7mFkPM5uRZ+mx6zjOuTSgL7CKSDLfQuSxoJudc1nebquJPK8C739/8d6b5e1f/lA/R7Q3H4mIhFp+RrU451KB1H1tM7OyRFrfNYHNwMdA20IIMSpqqYuIUKijX1oBK5xzG5xzmcAw4DygjNcdA5FBJ2ne6zSgGoC3/Whg06F+DiV1ERHy1/1yEKuAc80s2esbbwksIjI0vIu3T1fgM+/1cG8db/s3zrlDHjmp7hdPSkoVBgx4mUoVK+CcY8CA93nt9YE89FAfut94DRs3Rv7hfPjh5xg1+tuAoy18lY6tyGOvPki5Y8rhnOOTd4cz5L+RazpXdr+My2+8hOzsHL77ejKvPvmf3e+rWpGPxw8mte8g3n1zSFDhF6qKxx7DP1+5n7IVyoKDz94bwccDhlGnXi3uebYPSclJrF29jsd6P0XGtgxaX9KSa3pemfv+2ifVonvbm1m68KcAP4W/SpYsybhv/keJkiVJTCzGsGEjeezxF4MOq0AKa+4X59xUMxsKzAKyiAwDTwVGAkO8hwzNBgZ4bxkADDazZcBvREbKHDIrwD8IvipZqlpMA6tcuSKVK1dkzpwFHHFEaaZM/oIul/+NLl0uIn1bBi+93C+W4ezh1LI1fK+jfMXyVKhUniXzfyS5dBKDRw/g7u7/oFyFsnS/4wbuvP5eMndmUrZ8GX7flPvMFJ7r/wTOORbMWhSTpF4yobjvdZSvWI7yFcvz44KlJJdOYsCoN3mg+8M89PJ9vPbEm8yZMo8OV7bl2OOq0P+FQXu8t9aJNXl2wBNccd51vsc5dcMS3+s4kNKlk0lPzyAxMZEJ4z6hz12PMHXarEBiydqZVuDbPAekXBd1zrlp9btxe1upul88v/66njlzFgCwbVs6ixcvo2rVygFHFTub1m9iyfwfAchI387PS3+mYuUKdOnambdfe5fMnZkAeyT0pm0vIG3VWpYvWRFIzH7ZtP43flywFIici5VLV3FM5QpUq5XCnCnzAJg+cSZN21/wl/de2LkFXw//JqbxBiU9PQOA4sUTSSxenHhtIEarMG8+CpKvSd3Mnjezo8ysuJmNNbMNZuZ/E6aAqldP4fT6JzNt2mwAbunZlRnTv6Jfv76UKXN0wNH5r0pKZU449XgWzFrEcbWqUf+c03lrZD/6Dfs39U4/EYCk5CS69rqG/i8OOsjRDm+VUypR95Q6LJz9Ayt+XMkFbc4DoPlFTal0bMW/7N/y4uaM+bRoJPWEhARmTP+KtWnzGDt2AtOmzw46pAJRUo9Oa+fcVuAi4GegDnDP/nbOO/YzO3ubz6HtW+nSyQz5oB933/0of/yxjdTUwZx00vk0PLsNv/66nuee+2cgccVKUnISzw94khcffpX0bRkkJhbj6DJH0a3Dzbz6+Bs8k/oYAD3uvpH3Uz9ie8b2gCP2T1JyKZ7q/xivPvIGGdsyePqu57m0aycGfPkmyaWTyczM3GP/emecyI7tO1ix5OdgAo6xnJwcGjRsTfWaDWjY4AxOPvmEoEMqEGfRL/HM7wulu47fAfjYObflQDdK5R37Ges+dYDExEQ+HJLKkCGf8tlnowBYv35j7vaBA9/nk2FvxTqsmCmWWIznBzzJqGFj+PaLCQCsW7uBb74YD8DCOT/gchxlypfhlDPr0fKiZtz+z54cedQR5OQ4dv65k48GDQvyIxSaYonFeKr/Y3z1ydeM/3IiAKt++oU+19wLQLVaKTRuee4e72nVqQVff1Y0Wul5bdmylXHjv6NN62YsXBhsP39BxHsLPFp+J/URZrYY2A70NLNjgB0+13nI+vV7gcWLl/LKq/1zyypXrsivv64HoFPHtof1l/ZgHv7X/axY+jPv9fswt2z8qIk0OO9MZn4/m+NqVSOxeCKbN23m/zr3zt2nx99vJCN9e2gSOsADL97DymWr+DB1aG5ZmfJl2LxpM2ZG1zuu49PBw3O3mRktLmrGrZfeEUS4MVehQjkyM7PYsmUrpUqVolXLJrzQ942gwyqQKG//j3u+JnXn3P1m9jywxTmXbWbpRO60ijuNGzfkumu7MH/+D0ybGmmlP/zwc1xxZSdOP+1knHOsXLmaXr3vDzhSf5x+9ql0uLwtSxf9xHtjBgLwxjOpfPbBSB5+6QE+/PZtMjOzePSOpwOO1H+nNTyFdl1as2zRT7z1VeSmwX7PDiClZlUu7Rb5+o7/YhIjPxyV+576557G+rXrWbNqbSAxx1qVKpUYOOBlihVLICEhgaFDP2fkF18HHVaBhOUhGb4OaTSz4kBPoIlXNB5407vL6oCC6H6JV7EY0ni4iMWQxsNF0EMa40lhDGl86bjohzT2WRW/Qxr97n75D1Ac2PV32fVe2d98rldEJF/Upx6dhs650/Osf2Nmc32uU0Qk38LSNeD3kMZsM6u9a8XMahGe6xEiEiKFOPdLoPxuqd9D5MHVy731GsCNPtcpIpJvYWlt+t1S/w7oR6S76jfv9WSf6xQRybccXNRLPPO7pf4OsBV4wlu/BhgMXO5zvSIi+aILpdE5xTlXL8/6t2a2yOc6RUTyLb7b39Hzu/tllpnl3kttZucAM3yuU0Qk38IyoZffLfWzgO/NbJW3fhywxMzmA845d5rP9YuIRCXLwtFW9zupx+xhqyIiBRGOlO7/3C8r/Ty+iEhhifdulWjpGaUiIhD3QxWjpaQuIoK6X0REQkXdLyIiIZIdkra6krqICGqpi4iEilNLXUQkPNRSFxEJEQ1pFBEJkXCkdCV1EREAskKS1pXURUTQhVLfZeeE5bJFwc3dtPzgOxUR4fjZFY6SicWDDiFUwpJx4japi4jEklrqIiIhopa6iEiIZDu11EVEQkPj1EVEQiQsfep+P3haROSwUNgPnjazYmY228xGeOs1zWyqmS0zsw/NrIRXXtJbX+Ztr1GQz6GkLiJCpPsl2iVKdwA/5Fl/DnjJOVcH+B24ySu/CfjdK3/J2++QKamLiBDpfon2v4MxsxSgA/Bfb92AFsBQb5e3gc7e607eOt72lt7+h0R96iIiFProl5eBe4EjvfXywGbnXJa3vhqo6r2uCvwC4JzLMrMt3v4bD6VitdRFRMhf94uZ9TCzGXmWHruOY2YXAeudczOD+BxqqYuIkL+bj5xzqUDqfjafB3Q0s/ZAKeAo4BWgjJkleq31FCDN2z8NqAasNrNE4Ghg0yF8BEAtdRERoPD61J1zDzjnUpxzNYCrgG+cc9cC3wJdvN26Ap95r4d763jbv3Hu0PuClNRFRPBl9Mve7gPuMrNlRPrMB3jlA4DyXvldwP0F+RzqfhERAQrQOD7QMccB47zXy4Gz97HPDuDywqpTSV1EBMgOyR2lSuoiImjuFxGRUPGj+yUISuoiIqilLiISKmGZpVFJXUQEPSRDRCRU1P0iIhIi
YUnquqN0P/qnvsia1XOZM3ts0KHEXP/UF0lbPZfZeT77s888xPz545k1cwwff/xfjj76qAAjDE5R/l4ALPphEtOmjWLylC+YOGk4AE899QCzZo9l6tQv+WBIv8P2u+Gci3qJZ0rq+/HOOx/R4aJrgw4jEG+/8xEX7fXZvx47gfr1W3DmWReydOly7ruvd0DRBasofy92adfuahqd254Lzu8IwDffTKJhg9acc047li1dwd133xpwhIcmBtMExISv3S9mdteBtjvn/uVn/QUxcdJUqldPCTqMQEzax2f/+usJua+nTp3FZZd2iHVYcaEofy/2Z+zYibmvp02fzSWd2wUYzaELy+gXv1vqDYCeRCaBrwrcApxJZOL4Iw/wPolj3bpdxajR3wYdhgTAOcfwzwcz6bvPubH71X/ZfsMNl/PVV+NiH1ghyHY5US/xzO8LpSnAmc6VeIlVAAAH60lEQVS5PwDM7FFgpHPuOp/rFZ/cf//tZGVl8f77w4IORQLQqlUX1q5ZxzHHlOfzz9/lxyU/8d130wC4595eZGVlM2TIpwFHeWjiva88Wn631CsBO/Os7/TK9inv00RyctJ9Dk3y64brr6BD+1bccEPR7E8XWLtmHQAbNmxi+OejadDgdACuu64L7dq1pPuNdwQZXoGoTz067wDTzOwTb70z8Nb+ds77NJHEElXj+8wVMa1bN+Pvd/ekZcvL2L59R9DhSACSk5NISEhg27Z0kpOTaNnyAp595lUuvLApd/a5mbZtrjysvxth6VM3v//kMLMzgQu81QnOudnRvC/opP7u4Ndp2qQRFSqUY926jTz2eF8GvTUkkFgO+bHih2jwXp/98cf7cu+9vSlZsiS//fY7ELlY2qt3gebyPyRB/+zi6XtRMrF4TOurUaMaQ4ZEnuBWLLEYH330GS88/zrz5o+jZMkS/PbbZgCmTZvNHbc/GNPY0jN+LvDP5JRK50b99Vqwbkqsf5ZR8z2pH6qgk3o8idtvTwD0pdgt1kk9nhVGUj+50jlRf70Wrpsatz9L3VEqIgJxP6olWkrqIiJATpz2WuSXkrqICOG5UKqkLiKCWuoiIqGilrqISIhku+ygQygUSuoiIoRnmgAldRERwvOQDCV1ERHUUhcRCRWNfhERCRGNfhERCRFNEyAiEiLqUxcRCRH1qYuIhIha6iIiIaJx6iIiIaKWuohIiGj0i4hIiITlQmlC0AGIiMQD51zUy8GYWVszW2Jmy8wspk9oV1IXESFyR2m0/x2ImRUDXgfaAfWAq82sXgw+AqCkLiICFGpL/WxgmXNuuXNuJzAE6OT7B/CoT11EhELtU68K/JJnfTVwTmEd/GDiNqln7UyzoGMAMLMezrnUoOOIBzoXu+lc7BaWc5GfnGNmPYAeeYpS4+UcqPvl4HocfJciQ+diN52L3YrcuXDOpTrnGuRZ8ib0NKBanvUUrywmlNRFRArXdKCumdU0sxLAVcDwWFUet90vIiKHI+dclpn1BkYDxYCBzrmFsapfSf3g4qKfLE7oXOymc7GbzsVenHNfAF8EUbeFZb4DERFRn7qISKgoqYuIhIiSuohIiBTppG5mNczsBzPrb2YLzewrM0sys9pmNsrMZprZRDM70du/tplNMbP5ZvakmW0L+jMUlkM4F2+ZWZc87w/NuYDc87HYzN7zzstQM0s2s5ZmNtv7Dgw0s5Le/s+a2SIzm2dmfYOOPxbM7EEz+9HMJpnZB2Z2d9AxSRFP6p66wOvOuZOBzcBlRK7m3+acOwu4G3jD2/cV4BXn3KlEbv0Nm/yci6LgBOAN59xJwFbgLuAt4ErvO5AI9DSz8sAlwMnOudOAJwOKN2bM7Cwi46/rA+2BhsFGJLsoqcMK59wc7/VMoAbQGPjYzOYA/YAq3vZGwMfe6/djGWSM5OdcFAW/OOe+816/C7Qkco5+9MreBpoAW4AdwAAzuxTIiHmksXcB8IlzLsM5t5UY3lwjB6Zx6vBnntfZQCVgs3OufkDxBCk/5yILr1FgZglACf/Di7m9x/tuBsr/ZafIzSZnE0n6XYDeQAv/wxP5K7XU/2orsMLMLgewiNO9bVOIdElA5E/PsDvQufgZOMt73REoHvvwfHecmTXyXl8DzABqmFkdr+x6YLyZHQEc7d1w0gc4/a+HCp0JQGfvusuRwMVBByQRSur7di1wk5nNBRayey7kO4G7zGweUIfIn91ht79z0R9o6pU3AtIDis9PS4BeZvYDUBZ4CbiRSHfUfCAHeBM4EhjhfS8mEel7DzXn3CzgQ2Au8CWR+U4kDuiO0nwws2Rgu3POmdlVwNXOuZhNfi+xY2Y1gBHOuVMCDuWwYGaPAtucc0Vi5E88U596/pwFvGZmRqR/tXvA8YiI7EEtdRGREFGfuohIiCipi4iEiJK6iEiIKKlLoTOzbDObY2YLzOxjb9TQoR6rmZmN8F53NLP7D7BvGTO79RDqeFTzlkhYKKmLH7Y75+p7wwF3Arfk3ejdxJTv755zbrhz7tkD7FIGyHdSFwkTJXXx20Sgjjfr4RIzewdYAFQzs9ZmNtnMZnkt+iMAzKytN0PiLODSXQcys25m9pr3upKZfWJmc72lMfAsUNv7K+EFb797zGy6N3viY3mOlTvDIJGJu0RCQePUxTdmlgi0A0Z5RXWBrs65KWZWAXgIaOWcSzez+4jcrfs8kbtVWwDLiNy1uC+vAuOdc5eYWTHgCOB+4JRdc9WYWWuvzrMBA4abWRMid7/ummEwEZhFZAIzkcOekrr4Icmb1REiLfUBwLHASufcFK/8XKAe8F3kXi5KAJOBE4nMhLgUwMzeBXrso44WwA0AzrlsYIuZld1rn9beMttbP4JIkj8Sb4ZBrw7NMCihoaQufti+98yOXuLOOz+MAWOcc1fvtV9hzo5pwDPOuX571XFnIdYhElfUpy5BmQKct2vGQzMrbWbHA4uJzIRY29vv6v28fyzQ03tvMTM7GviDSCt8l9FA9zx99VXNrCKaYVBCTEldAuGc2wB0Az7wZjecDJzonNtBpLtlpHehdP1+DnEH0NybLXEmUM85t4lId84CM3vBOfcVkYeZTPb2GwocqRkGJcw094uISIiopS4iEiJK6iIiIaKkLiISIkrqIiIhoqQuIhIiSuoiIiGipC4iEiJK6iIiIfL/x6gv8DoA708AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] } - ], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns\n", - "\n", - "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", - " xticklabels=classes, yticklabels=classes)\n", - "plt.ylabel(\"Actual\")\n", - "plt.xlabel(\"Predicted\")\n", - "plt.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + ] +} \ No newline at end of file diff --git a/notebooks/text_generation.ipynb b/notebooks/text_generation.ipynb index 4806c8bb0..f39407fcc 100644 --- a/notebooks/text_generation.ipynb +++ b/notebooks/text_generation.ipynb @@ -1,203 +1,638 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Thai Wiki Language Model for Text Generation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook details how you can use pretrained language model on [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) to generate texts." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# #uncomment if you are running from google colab\n", - "# !pip install sklearn_crfsuite\n", - "# !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", - "# !pip install fastai==1.0.45" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "colab": { + "name": "text_generation.ipynb", + "version": "0.3.2", + "provenance": [] + }, + "accelerator": "GPU" }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Download : wiki_lm_lstm\n", - "re-download\n", - "from wiki_lm_lstm 0.31 update to wiki_lm_lstm 0.31\n", - "yes or no (y / n) : y\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "vfD07MBXKROC", + "colab_type": "text" + }, + "source": [ + "# Thai Wiki Language Model for Text Generation" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "thwiki_lm.pth?dl=1: 1.05GB [01:28, 11.8MB/s] \n" - ] + "cell_type": "markdown", + "metadata": { + "id": "BunBriX0KROF", + "colab_type": "text" + }, + "source": [ + "This notebook details how you can use pretrained language model on [Thai Wikipedia Dump](https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2) to generate texts." 
+ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Download : wiki_itos_lstm\n", - "re-download\n", - "from wiki_itos_lstm 0.31 update to wiki_itos_lstm 0.31\n", - "yes or no (y / n) : y\n" - ] + "cell_type": "code", + "metadata": { + "id": "O8IVDoE9KROG", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "935e7e3e-6f0d-4880-86b8-30df8e2eb853" + }, + "source": [ + "#uncomment if you are running from google colab\n", + "!pip install sklearn_crfsuite\n", + "!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "!pip install fastai\n", + "!pip install emoji" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: sklearn_crfsuite in /usr/local/lib/python3.6/dist-packages (0.3.6)\n", + "Requirement already satisfied: python-crfsuite>=0.8.3 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.9.6)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (1.12.0)\n", + "Requirement already satisfied: tqdm>=2.0 in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (4.28.1)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (from sklearn_crfsuite) (0.8.3)\n", + "Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "\u001b[?25l Downloading https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", + "\u001b[K - 11.2MB 218kB/s\n", + "\u001b[?25hRequirement already satisfied (use --upgrade to upgrade): pythainlp==2.1.dev2 from https://github.com/PyThaiNLP/pythainlp/archive/dev.zip in /usr/local/lib/python3.6/dist-packages\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.3.0)\n", + "Requirement already satisfied: marisa-trie==0.7.4 in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (0.7.4)\n", + "Requirement already satisfied: nltk>=3.2.2 in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.2.5)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2018.9)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (2.21.0)\n", + "Requirement already satisfied: tinydb in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (3.13.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pythainlp==2.1.dev2) (4.28.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk>=3.2.2->pythainlp==2.1.dev2) (1.12.0)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (2019.6.16)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (1.24.3)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pythainlp==2.1.dev2) (3.0.4)\n", + "Building wheels for collected packages: pythainlp\n", + " Building wheel for pythainlp (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for pythainlp: filename=pythainlp-2.1.dev2-cp36-none-any.whl size=11014043 sha256=3dfa6501ae5079e51204d5ab850ab32965c85f27bb642a67712b39b106feb3fc\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-5gfc5rda/wheels/79/4e/1e/26f3198c6712ecfbee92928ed1dde923a078da3d222401cc78\n", + "Successfully built pythainlp\n", + "Requirement already satisfied: fastai in /usr/local/lib/python3.6/dist-packages (1.0.57)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from fastai) (3.13)\n", + "Requirement already satisfied: spacy>=2.0.18 in /usr/local/lib/python3.6/dist-packages (from fastai) (2.1.8)\n", + "Requirement already satisfied: typing; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from fastai) (3.7.4)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from fastai) (1.3.1)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.6/dist-packages (from fastai) (4.6.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from fastai) (19.1)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from fastai) (3.0.3)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.6/dist-packages (from fastai) (4.3.0)\n", + "Requirement already satisfied: torch>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from fastai) (1.1.0)\n", + "Requirement already satisfied: fastprogress>=0.1.19 in /usr/local/lib/python3.6/dist-packages (from fastai) (0.1.21)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages (from fastai) (0.3.0)\n", + "Requirement already satisfied: nvidia-ml-py3 in /usr/local/lib/python3.6/dist-packages (from fastai) (7.352.0)\n", + "Requirement already satisfied: bottleneck in /usr/local/lib/python3.6/dist-packages (from fastai) (1.2.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from fastai) (2.21.0)\n", + "Requirement already satisfied: numexpr in /usr/local/lib/python3.6/dist-packages (from fastai) (2.6.9)\n", + "Requirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from fastai) (1.16.4)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from fastai) (0.24.2)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from fastai) (0.6)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.2.2)\n", + "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.9.6)\n", + "Requirement already satisfied: blis<0.3.0,>=0.2.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.2.4)\n", + "Requirement already satisfied: srsly<1.1.0,>=0.0.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (0.0.7)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (1.0.2)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (2.0.2)\n", + "Requirement already satisfied: thinc<7.1.0,>=7.0.8 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (7.0.8)\n", + "Requirement already satisfied: 
preshed<2.1.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.0.18->fastai) (2.0.1)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->fastai) (2.4.2)\n", + "Requirement already satisfied: attrs in /usr/local/lib/python3.6/dist-packages (from packaging->fastai) (19.1.0)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from packaging->fastai) (1.12.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (1.1.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->fastai) (2.5.3)\n", + "Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from Pillow->fastai) (0.46)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (2019.6.16)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->fastai) (3.0.4)\n", + "Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas->fastai) (2018.9)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /usr/local/lib/python3.6/dist-packages (from thinc<7.1.0,>=7.0.8->spacy>=2.0.18->fastai) (4.28.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->fastai) (41.0.1)\n", + "Collecting emoji\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/1b/d7/2746b4dd67375ce253e777ba54869545d24d2b0249ebcf83735c99df68d5/emoji-0.5.3.tar.gz (43kB)\n", + "\u001b[K |████████████████████████████████| 51kB 4.4MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: emoji\n", + " Building wheel for emoji (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for emoji: filename=emoji-0.5.3-cp36-none-any.whl size=42175 sha256=c3f1611ca03c91684bc818c0ad78dcb8d0542c7eab7fc3dfe3a6640090c8f196\n", + " Stored in directory: /root/.cache/pip/wheels/86/09/26/f944015841423cd516e8a97f30e29be59e53461aea8b7d3458\n", + "Successfully built emoji\n", + "Installing collected packages: emoji\n", + "Successfully installed emoji-0.5.3\n" + ], + "name": "stdout" + } + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "thwiki_itos.pkl?dl=1: 1.53MB [00:02, 743kB/s] \n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from ast import literal_eval\n", - "from tqdm import tqdm_notebook\n", - "from collections import Counter\n", - "import re\n", - "\n", - "#viz\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "#fastai\n", - "import fastai\n", - "from fastai.text import *\n", - "from fastai.callbacks import CSVLogger\n", - "\n", - "#pythainlp\n", - "from pythainlp.ulmfit import *" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "#get dummy data\n", - "imdb = untar_data(URLs.IMDB_SAMPLE)\n", - "dummy_df = pd.read_csv(imdb/'texts.csv')\n", - "\n", - "#get vocab\n", - "thwiki_itos = pickle.load(open(_THWIKI_LSTM['itos_fname'],'rb'))\n", - "thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)\n", - "\n", - "#dummy databunch\n", - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", - "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", - " NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]\n", - "data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)\n", - " .random_split_by_pct(0.2)\n", - " .label_for_lm()\n", - " .databunch(bs=64))\n", - "\n", - "\n", - "data_lm.sanity_check()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "metadata": { + "id": "DvwUYZGmKROK", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 119 + }, + "outputId": "03569098-5d70-4756-f8b4-c77de3cd4b5c" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from ast import literal_eval\n", + "from tqdm import tqdm_notebook\n", + "from collections import Counter\n", + "import re\n", + "\n", + "#viz\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "#fastai\n", + "import fastai\n", + "from fastai.text import *\n", + "from fastai.callbacks import CSVLogger\n", + "\n", + "#pythainlp\n", + "from pythainlp.ulmfit import *" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Download: wiki_lm_lstm\n", + "wiki_lm_lstm 0.32\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "100%|██████████| 1050919089/1050919089 [00:25<00:00, 41157162.35it/s]\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Download: wiki_itos_lstm\n", + "wiki_itos_lstm 0.32\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "100%|██████████| 1530484/1530484 [00:00<00:00, 19090275.60it/s]\n" + ], + "name": "stderr" + } + ] + }, { - "data": { - "text/plain": [ - "60004" + "cell_type": "code", + "metadata": { + "id": "PnQcr3gWKROS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#get dummy data\n", + "imdb = 
untar_data(URLs.IMDB_SAMPLE)\n", + "dummy_df = pd.read_csv(imdb/'texts.csv')\n", + "\n", + "#get vocab\n", + "thwiki_itos = pickle.load(open(_THWIKI_LSTM['itos_fname'],'rb'))\n", + "thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)\n", + "\n", + "#dummy databunch\n", + "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", + " NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3)]\n", + "data_lm = (TextList.from_df(dummy_df, imdb, cols=['text'], processor=processor)\n", + " .random_split_by_pct(0.2)\n", + " .label_for_lm()\n", + " .databunch(bs=64))\n", + "\n", + "\n", + "data_lm.sanity_check()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VJI1MZzvKROW", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "f8db8372-fc6a-44ff-f7cd-9e4e8d99684b" + }, + "source": [ + "#check vocab size\n", + "len(data_lm.vocab.itos)" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "60005" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#check vocab size\n", - "len(data_lm.vocab.itos)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", - " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", - "trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)\n", - "\n", - "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", - "\n", - "#load pretrained models\n", - "learn.load_pretrained(**_THWIKI_LSTM)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + }, { - "data": { - "text/plain": [ - "'กาลครั้งหนึ่งนานมาแล้ว เกิดอาการปวดศีรษะ ความขัดแย้งในครอบครัว ความถึงแก่อสัญกรรมของบิดามารดา ได้สร้างความตื่นตระหนกให้แก่ลูกหลานมาก โดยเด็กทั้งสองลงท้ายด้วยคำว่า \"แม่\" เข้าด้วยกัน และและคำว่า \"ลูก\" เป็นคำที่มาจากคำว่า \"ลูก\" ในภาษาสันสกฤต หมายถึงแม่ ซึ่งหมายถึงครอบครัว และ \"ความอุดมสมบูรณ์\" \\n \\n \\n \\n =2444 (วงดนตรี)= \\n 2537 (วงดนตรี) \\n \\n สถาบันดนตรี (Music Academy) เป็นคณะแรกในจุฬาลงกรณ์มหาวิทยาลัย ก่อตั้งเมื่อปี พ.ศ. 
2514 ในชื่อ \"คณะนิเทศศาสตร์และการบัญชี\" โดยมีผลงานจากริมดวงจันทร์ที่ประสบความสำเร็จอย่างมากทั้งในและนอกประเทศ การเผยแพร่เพลงและวัฒนธรรมของคนไทย และในสื่อต่างๆ ที่เปิดให้ทุกคนในประเทศไทยเข้าใจในดนตรีแนวนี้ เป็นที่รู้จักของผู้ชมทั่วไป และความนิยมในแนวดนตรีที่ไม่เหมือนใคร \\n \\n ประวัติ'" + "cell_type": "code", + "metadata": { + "id": "B9DJVRZ-KROb", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "54031979-d708-4550-d0b7-8cd42b07cf96" + }, + "source": [ + "config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1, qrnn=False, tie_weights=True, out_bias=True,\n", + " output_p=0.25, hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)\n", + "trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)\n", + "\n", + "learn = language_model_learner(data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", + "\n", + "#load pretrained models\n", + "learn.load_pretrained(**_THWIKI_LSTM)" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (800 items)\n", + "x: LMTextList\n", + "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", + " \n", + " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", + " \n", + " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . \n", + " \n", + " xxunk peck being the liberal that he was xxunk a better understanding of the man . he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . 
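As an aside on the generation step: the Thai passage above (beginning "กาลครั้งหนึ่งนานมาแล้ว", "once upon a time") is sample text produced by the pretrained Thai Wikipedia language model in the earlier revision of this notebook. A minimal sketch of how such text can be generated with the fastai v1 API once `learn` has been built with `language_model_learner` and loaded via `learn.load_pretrained(**_THWIKI_LSTM)` as in the cells above; the seed string, word count, and temperature here are illustrative choices, not taken from the notebook:

```python
from fastai.text import *
from pythainlp.ulmfit import *   # ThaiTokenizer, pre_rules_th, post_rules_th, _THWIKI_LSTM

# assumes `learn` is the language_model_learner built above and loaded with
# learn.load_pretrained(**_THWIKI_LSTM)
seed_text = "กาลครั้งหนึ่งนานมาแล้ว"   # illustrative seed ("once upon a time")
generated = learn.predict(seed_text, n_words=60, temperature=0.8)
print(generated)
```

`LanguageLearner.predict` samples `n_words` tokens after the seed and returns the concatenated string; lowering `temperature` makes the output more conservative, while raising it makes it more varied.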
if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", + " \n", + " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", + " \n", + " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", + " \n", + " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . \n", + " \n", + " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", + " \n", + " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . \n", + " \n", + " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . 
\n", + " \n", + " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", + " \n", + " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . \n", + " \n", + " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Valid: LabelList (200 items)\n", + "x: LMTextList\n", + "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . was xxunk ' s part xxunk a leading role ? old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", + " \n", + " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . 
such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! because xxunk what really makes it xxunk . \n", + " \n", + " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", + " \n", + " one last note this movie also had a kick ass xxunk as well , marty xxunk . a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", + " \n", + " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", + " \n", + " xxunk to rob a bank ! \n", + " \n", + " yes . the plot xxup is as xxunk as that ! \n", + " \n", + " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . the rest of the score was xxunk xxunk . in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", + " \n", + " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . the xxup cgi work was what i would have called leading edge - 5 years ago . \n", + " \n", + " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", + " \n", + " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", + " \n", + " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . 
\n", + " \n", + " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", + " \n", + " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . \n", + " \n", + " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", + " \n", + " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", + " \n", + " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", + " \n", + " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! 
it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", + " \n", + " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(60005, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[RNNTrainer\n", + "learn: LanguageLearner(data=TextLMDataBunch;\n", + "\n", + "Train: LabelList (800 items)\n", + "x: LMTextList\n", + "xxbos every once in a long while a movie will come along that will be so xxunk that i feel xxunk to xxunk people . if i labor all my days and i can save but one soul from xxunk this movie , how great will be my joy . \n", + " \n", + " where to begin my xxunk of pain . for xxunk , there was a musical xxunk every five minutes . there was no character development . every character was a stereotype . we had xxunk guy , fat guy who xxunk xxunk , xxunk foreign guy , etc . the script xxunk as if it were being written as the movie was being shot . the production value was so xxunk low that it xxunk like i was xxunk a junior high video presentation . have the directors , producers , etc . ever even seen a movie before ? xxunk is getting worse and worse with every new entry . the concept for this movie xxunk so funny . how could you go wrong with gary coleman and a xxunk of xxunk xxunk actors . but trust me when i say this , things went wrong , xxup very xxup wrong .,xxbos name just says it all . i xxunk this movie with my dad when it came out and having xxunk in korea he had great xxunk for the man . the xxunk thing about this film is that it only xxunk on a short period of the man ' s life - xxunk enough the man ' s entire life would have made such an epic xxunk that it is xxunk to imagine the cost for production . \n", + " \n", + " some xxunk xxunk to the xxunk xxunk about the man , which are cheap xxunk . 
the theme of the movie \" duty , honor , country \" are not just xxunk words xxunk from the lips of a xxunk officer - it is the deep declaration of one man ' s total xxunk to his country . \n", + " \n", + " xxunk peck being the liberal that he was xxunk a better understanding of the man . he does a great job xxunk the fearless general xxunk with the humane side of the man .,xxbos this movie xxunk at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost xxunk as a series of challenges set up to xxunk xxunk or not you are xxunk to walk out of the movie and give up the money you just paid . if you don ' t want to feel xxunk you ' ll sit through this xxunk film and xxunk a real sense of xxunk for the actors xxunk , they ' ve all seen better days , but then you xxunk they xxunk got paid xxunk a bit of money to do this and you ' ll lose xxunk for them just like you ' ve xxunk done for the film . i can ' t go on enough about this xxunk movie , its almost something that ed wood would have made and in that case it xxunk would have been his xxunk . \n", + " \n", + " to start you are forced to sit through an opening dialogue the xxunk of which you ' ve never seen / heard , this thing has got to be five minutes long . on top of that it is xxunk , as to xxunk that you the viewer cannot read . then we meet mr . xxunk and the xxunk of xxunk lines gets xxunk , it is as if he is operating xxunk to get lines on to the movie xxunk tag line . soon we meet stephen xxunk , who i xxunk xxunk ) and he does his best not to xxunk in this but xxunk he does . then comes the ultimate xxunk , tara reid playing an intelligent role , oh help us ! tara reid is not a very talented actress and xxunk she xxunk gets xxunk in movies , in my xxunk though she should stick to movies of the american pie type . \n", + " \n", + " all in all you just may want to see this for yourself when it comes out on video , i know that i got a kick out of it , i mean xxunk all be xxunk here , xxunk its xxunk to xxunk in the xxunk of others .,xxbos from the start , you know how this movie will end . it ' s so full of clich é s your typical xxup xxunk member will not even like this movie . i give it 2 out of 10, only because of the acting of william benton . i can ' t believe people xxunk 6 + for this movie . it ' s so biased towards a ' certain point of view ' ( once a thief xxunk people xxunk ' t born bad . neither are they born good . they are born with a clean slate . it ' s society , parents and education what makes them who they are . and if they take the wrong turn , somewhere down the line , it xxunk isn ' t going to be the american xxunk system that gets them back on track ! xxunk , xxunk this movie like the xxunk . i bet you have better things to do with your time than waste it on this piece of xxunk . \n", + " \n", + " ,xxbos i was xxunk enough to meet george pal ( and still have my xxup ds : xxup xxunk xxunk xxunk by him ) at a convention xxunk after the release , and xxunk him why he xxunk to do the film \" camp \". before he could answer , two studio xxunk xxunk and xxunk me on how the studio \" knew best \" and how \" no one will take such a film xxunk \". i had been reading the xxunk xxunk for a couple of years thanks to a friend ( xxunk xxunk of the 1970 s will recall xxunk and his band ? i was in a couple of years of that with him ), and had higher hopes than what we got . 
\n", + " \n", + " the xxunk xxunk that no high adventure would ever be done xxunk , and so doing ' camp ' was the only way . several other xxunk xxunk in on my side , with pal listening as best he could . at the end of the little event , pal came up to us and xxunk , xxunk he could have done more and better . \n", + " \n", + " xxup star xxup wars put the lie to the xxunk , and a year after pal ' s death , spielberg and lucas xxunk that doc savage could have xxunk been the next major movie franchise xxunk if it xxunk ' t been for the xxunk . \n", + " \n", + " tear out the memory or history of doc , and the film would have been worth a 6 / 10 rating as nothing more than a xxunk xxunk seller . \n", + " \n", + " but xxunk the legacy like that was no less an xxunk than killing a baby in the xxunk . \n", + " \n", + " doc savage can still come to the screen , and survive the xxunk xxunk by the xxunk to indiana jones , but it would have to be done in all xxunk and xxunk to xxunk the glory that we should expect from the first american xxunk . \n", + " \n", + " xxup xxunk : yes , there was a second script for xxup xxunk xxup of xxup evil , and it ' s a lot more serious . yes , there was xxunk xxunk shot , but mostly xxunk xxunk and very little with actors . and , yes , there _ is _ a xxunk of ron xxunk xxunk over a brick wall and xxunk at something over his shoulder with a xxunk built bronze xxunk . xxunk ' s xxunk a xxunk over a button down white shirt with a bronze tie , and the words \" xxup doc xxup savage : xxup xxunk xxup of xxup evil xxunk coming next summer !\" xxup xxunk : if anyone knows who the studio xxunk were that xxunk george pal in 1975 to san diego for the convention , xxunk the xxunk up the side of the head and call them the xxunk that they are . at the time , they were doing xxunk and fu xxunk in stripes and xxunk canvas xxunk , and carrying paramount xxunk .\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Valid: LabelList (200 items)\n", + "x: LMTextList\n", + "xxbos does any one know what the 2 sports cars were ? i think robert stack ' s might have been a xxunk . rock hudson ' s character told his father he was taking a job in iraq , isn ' t that xxunk ? i have had xxunk malone in my xxunk bank most of my life , maybe this was the film that xxunk me . xxunk xxunk sure did have some xxunk in this film and xxunk xxunk malone but xxunk ' s part made a more xxunk impact so she got the oscar for best supporting role . was xxunk ' s part xxunk a leading role ? old man xxunk character was was xxunk a pretty common picture of xxunk of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had xxunk his children to have his wife bring them up . in time , being xxunk he xxunk that they were all he really had and they were xxunk rotten , looking for attention , so rather than try to xxunk to his children he xxunk his head off . an ancient morality tale . but xxunk , what were those sports cars ?,xxbos god bless 80 ' s xxunk films . this is a fun , fun movie . this is what xxunk films are all about . now i ' m not xxunk horror movies , just xxunk films . it goes like this : a high school nerd is xxunk on by all these stupid xxunk and xxunk , and then one of their xxunk goes xxunk wrong . 
xxunk and back for revenge , xxunk a joker / xxunk mask ( pretty xxunk looking , might i add ), marty begins to kill off those xxunk one by one many years later , after he xxunk to make them believe that their old xxunk high school is having a reunion . that is xxunk the plot ? what ' s wrong with that ? that ' s the beauty of 80 ' s xxunk films , most of them i would say . a lot of things could be so xxunk , but they keep drawing you more in an ' in as they go by . xxunk this film . \n", + " \n", + " it features some xxunk xxunk , and some are xxunk creative as well . ( poisoning of a xxunk can , acid bath , i can ' t remember a xxunk ever being used before in any other xxunk film either ) it really is a fun , fun movie . that ' s all it is . nevermind the fact that the characters are complete xxunk , never mind their xxunk , and never mind the xxunk , random things that xxunk in this film . such as lights being able to be controlled by the killer ( when he ' s not even switching any xxunk , you ' ll see ) and xxunk being able to xxunk up blood , xxunk being able to have acid come out of them , just use that as part of your entertainment ! because xxunk what really makes it xxunk . \n", + " \n", + " movies like this represent 80 ' s xxunk . never again could movies like this get made , know why ? it isn ' t the 80 ' s xxunk . that is why you should just xxunk them for what they are , good fun ! i highly xxunk this film if you ' re a hardcore fan of xxunk such as friday the 13 th . \n", + " \n", + " one last note this movie also had a kick ass xxunk as well , marty xxunk . a xxunk , nerd , who kills all his old xxunk in a xxunk xxunk mask . a good xxunk makes a good xxunk . simon xxunk , who played marty xxunk xxunk suicide xxunk after xxunk high was released . that alone xxunk something xxunk to the film , and sticks with it and it even makes you feel more sorry for the marty character , i guess . all in all , great 80 ' s xxunk fun ! it ' s a shame it will never be the same again xxunk,xxbos the basic formula for the original series was ; take someone , get the audience to like them , then put them into xxunk danger . this formula xxunk for the 32 xxunk made between 1964 - 68. \n", + " \n", + " now , we jump forward 40 years to xxunk we are xxunk to alan tracy , a xxunk xxunk college school kid , with his friend , fermat , a young xxunk . they are xxunk off by lady xxunk in her pink ford xxunk to the island paradise where the tracy family live , for the school xxunk . almost xxunk , they are left in the care of xxunk and his daughter , xxunk xxunk the xxunk go to rescue john from xxunk 5 which has been xxunk by a xxunk xxunk . this is all part of the hood ' s scheme to take over tracy island so that he can steal the xxunk machines xxunk \n", + " \n", + " xxunk to rob a bank ! \n", + " \n", + " yes . the plot xxup is as xxunk as that ! \n", + " \n", + " the dialogue is xxunk , the acting more wooden than that of the ( xxunk ) puppets , the effects , anything but special and hans xxunk ' s score xxunk what little there was of barry gray ' s glorious theme xxunk through xxunk ' s xxunk xxunk . the rest of the score was xxunk xxunk . in fact , part of the score was broadcast the following week on the radio and didn ' t xxunk it ! i didn ' t even xxunk to stay to witness xxunk ' s xxunk xxunk with the end titles \n", + " \n", + " to be fair , ron cook xxunk xxunk well as parker , he and sophia myles as xxunk xxunk xxunk . with the right material , they could have been show xxunk . 
the xxup cgi work was what i would have called leading edge - 5 years ago . \n", + " \n", + " the dynamics of the main craft were just wrong ; the original series models at least xxunk as if they had mass \n", + " \n", + " another xxunk point is that the whole production xxunk to be one long set of product xxunk , from every vehicle being built by ford to the entire content of the tracy xxunk being produced by ben & jerry ' s . \n", + " \n", + " my son ( 9 ) xxunk the film but this cross between spy kids and ' xxunk ', xxunk xxunk at his age group , added nothing to the xxunk legend . when star trek hit the big screen in 1979 with ' the motion picture ', a whole new xxunk of life was xxunk into the franchise which then continued for another 20 years or so . with this film , xxunk has xxunk a golden opportunity to do the same with the xxunk franchise . \n", + " \n", + " i xxunk that this film , like ' the avengers ' and ' the saint ' before it , will sink into xxunk within 6 months , leaving the original series to its ' classic ' status .,xxbos the views of earth that are xxunk in this film to have been xxunk by xxup nasa have xxunk been xxunk with the historical weather data for the time of apollo 11, and show a good match between the cloud patterns in the video sequence and the xxunk xxunk records on the day . \n", + " \n", + " this would xxunk to xxunk the entire argument put forward in the film that the \" whole earth \" picture is xxunk a small part of the planet framed by the spacecraft window . \n", + " \n", + " i am waiting for bart xxunk to now xxunk that the historical weather data has been xxunk by xxup nasa , though that would no doubt xxunk them in also xxunk every xxunk newspaper copy with a weather map , and the ones in private hands would still be a problem . \n", + " \n", + " ah , a response : \" xxunk to xxunk this movie by xxunk to xxup nasa weather data i ' d say is a xxunk , but weak and xxunk argument . what about the rest of the xxunk and xxunk in the movie ? a certain wise man once said something about xxunk xxunk and xxunk xxunk . do you in any way feel that maybe this could xxunk to what you are xxunk to do here ? : - ) this movie is just packed with xxunk evidence against the xxunk once made by u . s . government that the xxunk were a success , and that man now are true masters of the universe . things are xxunk never xxunk what they xxunk .. just watch the movie , and i dear say you ' ll see things a bit different than before .\" \n", + " \n", + " first off , weather data doesn ' t come from xxup nasa , it comes for met xxunk around the world . second , the weather data xxunk a major xxunk in the film . third , far from being \" packed with xxunk evidence \", the remaining xxunk in the film have been xxunk xxunk . xxunk thought he had a xxunk secret piece of film , so he edited it and added his own interpretation . xxunk for him , his source film is public domain , and the xxunk xxunk edited out xxunk his xxunk .,xxbos xxup swing ! is an important film because it ' s one of the remaining xxunk and xxunk films from the 1930 s . many of these films have simply xxunk so xxunk that they are xxunk , but this one is in xxunk good shape . it ' s also a nice chance to see many of the talented black xxunk of the period just after the xxunk of the old cotton xxunk time all but xxunk today . \n", + " \n", + " xxunk , while the film is xxunk important and has some lovely performances , it ' s also a mess . 
the main plot is very similar to the hollywood xxunk of the xxunk a prima donna who is going to xxunk the show and the surprise unknown who xxunk from no where to save the day . however , the writing is just xxunk and a bit xxunk at xxunk projects images of black america that some might find a bit xxunk . this is because before the plot really gets going , you are xxunk to a xxunk xxunk who lives off his hard working wife ( a popular stereotype of the time ) and when he is xxunk with a xxunk ( who , by the way , xxunk xxunk this role ), they have a fight which xxunk like a scene from xxup wwe smackdown ! and , the one lady wants to cut the other lady with a straight xxunk xxunk scene xxunk ! later in the film , when the prima donna is xxunk xxunk , her husband xxunk her in the face and everyone xxunk him ! it xxunk like the film , at times , wants to appeal to the xxunk common xxunk in the audience xxup plus they can ' t even do this xxunk some of the worst acting i ' ve seen in a very long time . \n", + " \n", + " still , if you can look past a xxunk production in just about every way ( with xxunk characters , bad acting and direction and poor writing ), this one might be worth a xxunk so you can see excellent singing and tap xxunk well as to catch a xxunk of xxunk black culture . just don ' t say i didn ' t xxunk you about the xxunk ' s really , really bad !\n", + "y: LMLabelList\n", + ",,,,\n", + "Path: /root/.fastai/data/imdb_sample;\n", + "\n", + "Test: None, model=SequentialRNN(\n", + " (0): AWD_LSTM(\n", + " (encoder): Embedding(60005, 400, padding_idx=1)\n", + " (encoder_dp): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (rnns): ModuleList(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (2): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (3): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " )\n", + " (input_dp): RNNDropout()\n", + " (hidden_dps): ModuleList(\n", + " (0): RNNDropout()\n", + " (1): RNNDropout()\n", + " (2): RNNDropout()\n", + " (3): RNNDropout()\n", + " )\n", + " )\n", + " (1): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('/root/.fastai/data/imdb_sample'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False), functools.partial(, clip=0.12)], callbacks=[...], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(60005, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): 
Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)\n", + "alpha: 2\n", + "beta: 1], layer_groups=[Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(400, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 1550, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): WeightDropout(\n", + " (module): LSTM(1550, 400, batch_first=True)\n", + " )\n", + " (1): RNNDropout()\n", + "), Sequential(\n", + " (0): Embedding(60005, 400, padding_idx=1)\n", + " (1): EmbeddingDropout(\n", + " (emb): Embedding(60005, 400, padding_idx=1)\n", + " )\n", + " (2): LinearDecoder(\n", + " (decoder): Linear(in_features=400, out_features=60005, bias=True)\n", + " (output_dp): RNNDropout()\n", + " )\n", + ")], add_time=True, silent=False, cb_fns_registered=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "metadata": { + "id": "SwJK_G80KROl", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 122 + }, + "outputId": "2d1fdff0-66e7-4f96-93fc-7e112c4306c8" + }, + "source": [ + "print(learn.predict('กาลครั้งหนึ่งนานมาแล้ว ', 200, temperature=0.8, min_p=0.005, sep = ''))" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "text": [ + "กาลครั้งหนึ่งนานมาแล้ว คุณวันจันทร์ได้รับการอุปการะจากแม่เธอ \n", + " \n", + " วันต่อมา เธอได้พบกับ \"อาเธอร์ โอลด์เดน\" เด็กหนุ่มที่มีบุคลิกคล้ายกับ \"เอมิลี\" ซึ่งเป็นน้องสาวของนาง เขาจึงได้รับการเลี้ยงดูจาก \"อาเธอร์ ดีอา\" และเป็นผู้ที่คอยดูแลเธออยู่เสมอ เธอได้แนะนำให้เธอเป็นผู้หญิง \n", + " \n", + " โดยมี \"เอลซ่า\" ซึ่งเป็นทายาทของ \"อาเธอร์ ยูจีน\" ผู้เป็นสามีของเขา และเคยช่วยยูนิตที่ถูกส่งตัวไปประจำอยู่ที่ดินแดนแห่งนี้ เธอได้พบกับ \"ยูลิสซิส เกรย์ เอลิซาเบธ เอลลิส \" (เอมิลี่ ไอเซนฮา) ซึ่งเป็นชาว \"เผ่าเอลฟ์ \" และเธอก็ไม่ค่อยมีบุตร แต่เธอก็ได้รับความช่วยเหลือจาก \"\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "MyKkpWZbMOzt", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] } - ], - "source": [ - "learn.predict('กาลครั้งหนึ่งนานมาแล้ว ', 200, temperature=0.8, sep = '')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + ] +} \ No newline at end of file From 65e3d6e66ea8784631812d612624fd4b293ac4b5 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:37:27 +0700 Subject: [PATCH 38/73] resolve conflict setup.py --- setup.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 450fcfd7a..26c042622 100644 --- a/setup.py +++ b/setup.py @@ -18,10 +18,12 @@ 
"ner": ["sklearn-crfsuite"], "thai2fit": ["emoji", "gensim", "numpy"], "thai2rom": ["torch", "numpy"], + "benchmarks": ["numpy", "pandas"], "full": [ "artagger", "deepcut", "epitran", + "fastai>=1.0.38", "gensim", "keras", "numpy", @@ -31,6 +33,7 @@ "torch", "ssg", "emoji", + "pandas", ], } @@ -90,7 +93,10 @@ "Topic :: Text Processing :: General", "Topic :: Text Processing :: Linguistic", ], - scripts=['bin/pythainlp'] + scripts=[ + 'bin/pythainlp', + 'bin/word-tokenization-benchmark', + ] ) # TODO: Check extras and decide to download additional data, like model files From 36f8fe49447dc1324f290541f1c8d1d4b7df77ab Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:38:13 +0700 Subject: [PATCH 39/73] resolve conflict CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f43233847..5180d7b04 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -36,7 +36,7 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod # Discussion -- Facebook group (for Thai NLP Discussion only): https://www.facebook.com/groups/thainlp +- Facebook group (for Thai NLP Discussion only): https://www.facebook.com/groups/thainlp - GitHub issues (Problems and suggestions): https://github.com/PyThaiNLP/pythainlp/issues Happy hacking! (; From ec4493402e76a24852d53b9eea6661e9cdc18828 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:39:06 +0700 Subject: [PATCH 40/73] remove rules.py --- pythainlp/ulmfit/rules.py | 123 -------------------------------------- 1 file changed, 123 deletions(-) delete mode 100644 pythainlp/ulmfit/rules.py diff --git a/pythainlp/ulmfit/rules.py b/pythainlp/ulmfit/rules.py deleted file mode 100644 index e5aea85a2..000000000 --- a/pythainlp/ulmfit/rules.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -import html -import emoji -import re -from typing import List, Collection - -TK_MAJ, TK_UP, TK_REP, TK_WREP = 'xxmaj', 'xxup', 'xxrep', 'xxwrep' -BOS, EOS, FLD, UNK, PAD = 'xxbos', 'xxeos', 'xxfld', 'xxunk', 'xxpad' - - -def fix_html(x: str) -> str: - """List of replacements from html strings in `x`. (code from `fastai`)""" - re1 = re.compile(r' +') - x = x.replace('#39;', "'").replace('amp;', '&').replace( - '#146;', "'").replace('nbsp;', ' ').replace( - '#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( - '
', "\n").replace('\\"', '"').replace('', UNK).replace( - ' @.@ ', '.').replace(' @-@ ', '-').replace(' @,@ ', ',').replace( - '\\', ' \\ ') - return re1.sub(' ', html.unescape(x)) - - -def replace_all_caps(x: Collection[str]) -> Collection[str]: - """ - Replace tokens in ALL CAPS in `x` by their lower version \ - and add `TK_UP` before." (code from `fastai`) - """ - res = [] - for t in x: - if t.isupper() and len(t) > 1: - res.append(TK_UP) - res.append(t.lower()) - else: - res.append(t) - return res - - -def rm_useless_spaces(t: str) -> str: - """Remove multiple spaces in `t`. (code from `fastai`)""" - return re.sub(' {2,}', ' ', t) - - -def spec_add_spaces(t: str) -> str: - """Add spaces around / and # in `t`. \n (code from `fastai`)""" - return re.sub(r'([/#\n])', r' \1 ', t) - - -def replace_rep_after(text: str) -> str: - """ - Replace repetitions at the character level in `text` after the repetition. - This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xrep 8 ย'; - instead it will retain the word as 'น้อย xrep 8' - """ - - def _replace_rep(m): - c, cc = m.groups() - return f"{c} {TK_REP} {len(cc)+1} " - - re_rep = re.compile(r"(\S)(\1{3,})") - - return re_rep.sub(_replace_rep, text) - - -def replace_wrep_post(toks: Collection): - """ - Replace reptitive words post tokenization; - fastai `replace_wrep` does not work well with Thai. - """ - previous_word = None - rep_count = 0 - res = [] - for current_word in toks+['xxend']: - if current_word == previous_word: - rep_count += 1 - elif (current_word != previous_word) & (rep_count > 0): - res += [TK_WREP, str(rep_count), previous_word] - rep_count = 0 - else: - res.append(previous_word) - previous_word = current_word - return res[1:] - - -def rm_useless_newlines(text: str) -> str: - """Remove multiple newlines in `text`.""" - - return re.sub(r"[\n]{2,}", " ", text) - - -def rm_brackets(text: str) -> str: - """Remove all empty brackets from `t`.""" - new_line = re.sub(r"\(\)", "", text) - new_line = re.sub(r"\{\}", "", new_line) - new_line = re.sub(r"\[\]", "", new_line) - - return new_line - - -def ungroup_emoji(toks: Collection): - """Ungroup emojis""" - - res = [] - for tok in toks: - if emoji.emoji_count(tok) == len(tok): - res.extend([char for char in tok]) - else: - res.append(tok) - - return res - - -def lowercase_all(toks: Collection): - """lowercase all English words""" - return [tok.lower() for tok in toks] - - -class BaseTokenizer(): - """Basic class for a tokenizer function. 
(code from `fastai`)""" - def __init__(self, lang: str): self.lang = lang - - def tokenizer(self, t: str) -> List[str]: return t.split(' ') - - def add_special_cases(self, toks: Collection[str]): pass From 72c394bb02fb564d5c90a1e0d76e9152808199be Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 11:41:20 +0700 Subject: [PATCH 41/73] fix PEP8 issues --- pythainlp/ulmfit/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index 5db96824e..478053732 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -18,7 +18,8 @@ ''' # Fastai dependencies -The following codes are copied from https://github.com/fastai/fastai/blob/master/fastai/text/transform.py +The following codes are copied from +https://github.com/fastai/fastai/blob/master/fastai/text/transform.py in order to avoid importing the entire fastai library ''' @@ -199,7 +200,7 @@ def replace_wrep_post(toks: Collection): is added in front of repetitive words. :rtype: list[str] - :Example: + :Example: >>> from pythainlp.ulmfit import replace_wrep_post_nonum >>> @@ -368,7 +369,7 @@ def process_thai(text: str, pre_rules: Collection = pre_rules_th_sparse, Process Thai texts for models (with sparse features as default) :param str text: text to be cleaned - :param list[func] pre_rules: rules to apply before tokenization. + :param list[func] pre_rules: rules to apply before tokenization. :param func tok_func: tokenization function (by default, **tok_func** is :func:`pythainlp.tokenize.word_tokenize`) From 1986bbd20cf292699f5d0e0c840d2d1200ea72b5 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Mon, 2 Sep 2019 19:20:33 +0700 Subject: [PATCH 42/73] Remove fast AI from setup[full] --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 26c042622..84c1b991c 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,6 @@ "artagger", "deepcut", "epitran", - "fastai>=1.0.38", "gensim", "keras", "numpy", From a02a9d66e31bb2bdf693cee483b4315d576f4815 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 3 Sep 2019 00:30:53 +0700 Subject: [PATCH 43/73] Update README-pypi.md --- README-pypi.md | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/README-pypi.md b/README-pypi.md index 51ad0707b..29214fe3b 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -8,20 +8,12 @@ PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, pa 📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/) -## What's new in 2.0 ? +## What's new in 2.1 ? -- Terminate Python 2 support. Remove all Python 2 compatibility code. - Improved `word_tokenize` ("newmm" and "mm" engine), a `custom_dict` dictionary can be provided -- Improved `pos_tag` Part-Of-Speech tagging -- New `NorvigSpellChecker` spell checker class, which can be initialized with custom dictionary. -- New `thai2fit` (replacing `thai2vec`, upgrade ULMFiT-related code to fastai 1.0) -- Updated ThaiNER to 1.0 - - You may need to [update your existing ThaiNER models from PyThaiNLP 1.7](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) -- Remove old, obsolated, deprecated, duplicated, and experimental code. - - Sentiment analysis is no longer part of the library, but rather [a text classification example](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/sentiment_analysis.ipynb). 
+- Add AttaCut to be options for engine. - See more examples in [Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) -- [Full change log](https://github.com/PyThaiNLP/pythainlp/issues/118) -- [Upgrading from 1.7](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) +- [Full change log](https://github.com/PyThaiNLP/pythainlp/issues/181) ## Install @@ -40,6 +32,7 @@ pip install pythainlp[extra1,extra2,...] where extras can be - `artagger` (to support artagger part-of-speech tagger)* +- `attacut` - Wrapper for AttaCut (https://github.com/PyThaiNLP/attacut) - `deepcut` (to support deepcut machine-learnt tokenizer) - `icu` (for ICU support in transliteration and tokenization) - `ipa` (for International Phonetic Alphabet support in transliteration) @@ -54,8 +47,10 @@ Install it with pip, for example: `pip install marisa_trie‑0.7.5‑cp36‑cp36 ## Links -- User guide: [English](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb), [ภาษาไทย](https://colab.research.google.com/drive/1rEkB2Dcr1UAKPqz4bCghZV7pXx2qxf89) -- Docs: https://thainlp.org/pythainlp/docs/2.0/ +- User guide: [English](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) +- Docs: https://thainlp.org/pythainlp/docs/2.1/ - GitHub: https://github.com/PyThaiNLP/pythainlp - Issues: https://github.com/PyThaiNLP/pythainlp/issues - Facebook: [PyThaiNLP](https://www.facebook.com/pythainlp/) + +PyThaiNLP Team From e852567f9f4e00e93eb0fc850e4ed513e2af5fd9 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 3 Sep 2019 11:36:05 +0700 Subject: [PATCH 44/73] PyThaiNLP 2.1.dev3 --- README-pypi.md | 12 ++++++++++-- README.md | 2 +- setup.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README-pypi.md b/README-pypi.md index 29214fe3b..b39e99b59 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -11,7 +11,10 @@ PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, pa ## What's new in 2.1 ? - Improved `word_tokenize` ("newmm" and "mm" engine), a `custom_dict` dictionary can be provided -- Add AttaCut to be options for engine. +- Add AttaCut to be options for `word_tokenize` engine. +- New Thai2rom (PyTorch) +- New Command Line +- Add word tokenization benchmark to PyThaiNLP - See more examples in [Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) - [Full change log](https://github.com/PyThaiNLP/pythainlp/issues/181) @@ -53,4 +56,9 @@ Install it with pip, for example: `pip install marisa_trie‑0.7.5‑cp36‑cp36 - Issues: https://github.com/PyThaiNLP/pythainlp/issues - Facebook: [PyThaiNLP](https://www.facebook.com/pythainlp/) -PyThaiNLP Team + +Made with ❤️ + +We build Thai NLP. + +PyThaiNLP Team. \ No newline at end of file diff --git a/README.md b/README.md index 9a814a5ea..fab714986 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Made with ❤️ We build Thai NLP. -PyThaiNLP team. +PyThaiNLP Team. 
# ภาษาไทย diff --git a/setup.py b/setup.py index da13a2200..e55c68c3e 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ setup( name="pythainlp", - version="2.1.dev2", + version="2.1.dev3", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From 7745a6fc923722442186f68d2a72ab933ead78ad Mon Sep 17 00:00:00 2001 From: heytitle Date: Wed, 4 Sep 2019 08:12:45 +0200 Subject: [PATCH 45/73] add image for #248 --- docs/images/evaluation.png | Bin 0 -> 87242 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/images/evaluation.png diff --git a/docs/images/evaluation.png b/docs/images/evaluation.png new file mode 100644 index 0000000000000000000000000000000000000000..170197d773dad7742cc6e6c441bcc9c8fae6d3d2 GIT binary patch literal 87242 zcma&ORajhMvo(mjySuvu4>S(J6Wk$aaF^ijF2UVh0*y;UaDqDt?hrJ=na%f|^ZYZ< z%*EV5_HKG>y;ZB$T9qg@6*+Vi5)>#XD0BsR>5ouQa05_K&^1U1z)$&Bkxt+PI4cQd z2`H$>1k@)}cqk|kl!CN`h8OgyA>tFR3;~6VdLvo%@>DQmkrNF|YjZ)x@>fE9)YfKH znqV^-!PDsIZUJd{)V~})1X!p;%~Uk}HB8Iz{`D|A=4_lYG>bf^V>>dq*$NttC+?Dp7rVr^*sti+d2;2HD;`HC|0L;Mk?yvtq2DE#$T-vlh zNB6UAIb^4E(|c|g^~*jF$M4Zrld-{Q5|2)2&K9=r3kAVqigM))UNSv^TbzWq#cWhX zFGHOvxin72OdikL@{d-Utpb-kfBOwzz5loFdK%GNbc`{_XFx`)`)-{!L3Ch2?Vv&Y zPuF#mYM`7JWg10V>$=PM)|p@*Et#C9hF(D;Zu{MKj6GY``u#z^<23|&dgpool{2)N zgrDLx7iAzY!fr4UyA|O`^wR!EPM}+Qc03;m^Loih1Y@+71_=@Q*VD%I-_1Y8Y#}&E z8MeQB{LT}^UlQ&94ku7L?nu^SW`@x%O}p~=P|a&X2USgsR!w8n(m;U6lD;)nNjP&u zU7e9P;UscHJ??ZcA7lNHs0n6#^gPR|2)}qR(%Aab`zppF)NFH|O0%LW{yaFepC9lN zJS3rwGP!13Z+~AW>`{ZRh&Vmos2o$op5}1gey~5JyG1!9qJ$eS)ll(J+m{Eug)P%F zO!1XS55r8i_^LTwYgRrSF=8=616K3u~#OBK-RiM zDTi}!`-;9>zs&)s#eEfy5o<`5qOL-enWPMP7&Q%_BFuLS(O9e?gX^jDyrbHZf@{QR zRw_Jrd)itMZR@?y4VS+C_q2YGxDyP)O{nVyZDVJg!x`+8y3V?z+0q!r^bdEJ!<22l z{A+f+q;-3NCI-#+1qda_Dnfgy!B1kZPp(s#^;v{$8j#Fq{*9LKQdPN|Cb_1IdfUyu zpsf;+Zxns{rdN9aD|8vjC?8CvxOt|7JuZ^K;ZZs6LUacDJs2Jc4NupT+O`e_vYw}_ z@$1=SxC5lI7F23yh-k_j_teE1`XR>fAjgf4cl8Mh zh-3jzmjqKr-9DMG&wm2;r$qkj@=cFCnTFwOjeNDX>sN!i4clO;t}#nl%xhM}O0$g0 z0pW==cJ7*x_!1gsAD%UWXZU<#;Is!m4srJRADsHtd&j%Egj#0DHfVczhYwZyie&1pm-x%gN~H zOH+rN89B;Kg%pNM(c^*;3?9ZDCzyzSgliVT>7jdvJW^S4twC(2W1dhbcb;&&m(ZbA z`?d|iD2iAz3YD+b^iMGPw_v=F5lASmWP#nx^#m%$$VGSx1EwXyN2jI zZZ;9ds3=~Lpl6*ul&+HExz?u(rR=}q-c-}USqE^-op{iv)yft&y>i5KJA4)ZO(o7i zhGJspxtd=dPO|A@8kwp&NT72@blhfSig}w8vldf)hw!zfn>v8cg>tnT?OzlabIlBEaQ&BUJhh?VddLF~Q7j>;N z#rz(;@H5(^In8K3eE2gi0MdR9aer<})QH>v5lakl^j3e24MD`*@8axz;*pEMx+3xa zyJDsPUE8a<$;;a%W2*-sSqiCPIF>niHNQRom2#=Lc0yxcZjd$;j!<%YZ&4qv zZjd}l?*mHYLJ1q>ZinnyQ`^wu5Soiz*o}#W)Vh$K=TznWt;Y2jYyGmhkbd|~&8Kpe zobx<0iGic?d^U zfvJR)-fltz^o?8e83sIZRbuuAFhzyx+c)WB?!o1PTYt*7mI#M_mL^@f@hb;=i`}jJ zKb{Hs6z_DQ3#@agy^bW&dcSXAUV^hQWL3Rm_;6QkMFu_4=h*xFH-|NMsngRjLYN3n zIkjUPD2j*_YSX6bT<6rNE(<#Bv;~eCH!aoGY%oc#O=DV>>!j$=ej_Bc!`TJh?zf3E zWIs12HhBDdX6rNx zmDwTl2c z%tPWk^mBC0YB6*4g$?C79lwz*gn!{xnv5^N_JU(gyxGEVSK*g=#G1m6&WE@K#EE|2 zo5L9ur4Spc#3RI-~KUq1XbL2g(Tcd#B)LI+NA41PKTH`ARc2W8^Q z4;LAFUznxjDuh&fKvEf0bYI577L7k6G2MKkaP`%25a06=jp=%p(Auxrx8dWA>*FN; zS)@HeKfxWr_k`nEf~L$LBG8Y0Mx^xRl0%WYI_|yvLCWqY_{odfaA2D@PQ|vb*_$jA z*Yk*u+=qO7{ri$pw(Vk#Zs5|*(H!iqX`ZballSVo0uvAqM2 zqpQ1B5F!l33m-GNu2^I^uP}8m`8PdLxl5;^&V}x%`@*NtiQPq$F+~XCRhSUNS;8tf z!k4%6Dcott@LwXyq9)jJVq<0mvO-aD*wm@8l_cR{Po?-?`jN7L2v`{x52AHT(XUy5 zXs;fQk!mtbES0L_^Z)?~y9=J7x3%*Eo6IX6qhMLGC0A`p>JF>?h3>YJkxlV8g$S~1`f2UKAP5U6)EavXD63>T4uq&axVfS*7A24Ogq{g~C zyEy-erT61F{uh1PJ^GiJ*oRWQ&gJ9PT0Hc{9zvJmx z=Hs(g0MXgSX`=+@cX?6U_XR?S)ZVCWaKjQZJeJ4m( zi!sMWwN7zQ`%8Pxww`@E4rj~uQL3^0`Z`rkz<#DxQ`t^miXqiU>mb~r5~Ip;oh(do 
z8(}Xwn14tff7+2l=Oaeyn6|M^ef@~t;4{K)&(Kwb=7HeINM!g*V8Y+G60{O^w)F8T3igLnb43^$jg}!}xm2zD<*OO@ew(X!mW(NLQmk(7B z=I*`JspM`x#1mej55h0&d`7O<1mo9Y_QH>FS@qaeP|F`A@SSs<{&G)EiDyCkZjsI_ z5K+bK^E-S8en&~K`&u}1-7{~$CO=0OPS1f)wzvLq%tP^~TjI<(J3>nOHWW06%5B>` zh|xb`EloJoE{@BYq(50;5XV`OSy{OrXqhEVS4a+AF9^k1ClB4D*K*p;3rTtQzu=j1FB#QX_H3sLu~doXQxvza6coN zuGqvKGTd7m064^JXjMPZis?FFk#dybZoPWnDLFQNt$aP0v0X%=d2Rf zY@ttWUhM|%X*P_@+fG8XKptZFjENXT>9{n%Lt%94ZsLCHB^l*)^HAM3`Cfk6< zDyiLL?>#l&%kYk?T$Klu?`_}Rl>)EaKv9rS^4gubOeip{?MCj74}#4b8y``8~>UamC0Jg(_)!~$x8>y`Wj2SFKb ziLGpQKWr{kiXv(~-nTS-F6_;6*|^USnf)#$z^30ga+a``;yj>+_|)|}-F-nxd2`3c z1WEd`kUYpab0oFQG*;8yML!ArHXE1WhG0f*&$j7M+;6A~uV$ZJPvE!$*kq9BjHIE@ z*%-mQjjq5~)a^!xjU5COjh_ztJR4-!Y7@D?U*i5V3_3Bk1LFmpY56K)?dhK7MXp_ zu@9KkTiqn7DbtWdP_Fb9AndkEyq-Ug$uLge!FA7`&3X#nU%!G4G*f?n6BBAtm#8Q!Fk&Ll~gQ^wZU3Zac zgXMNgzQ_LhrF*jc<7vEB&zHOF-LnGq_`Ut&`@W+GNG@;o5*@Rg7Kbi>?ea9Sq6qy- z>z{{fWBxKL@Mq7R=jQH%ifCoIYIVPyRO=sGMoFMxYJNN`czQ3bod!#yOd$L^TANu7 zEe;2cYH#KkCYiioFk0`=czj`qMR=cN#vMBM*bnbf!h7#V9a9wE<2xAbUc#J8tM1o{ zv!74;C!)zw9s7srC)?E|VqI}Ozw@DbaDNK(xO)*@*{3CnWfONCEfP*rjht`Kl(E}S zOft^}-!O48NfznX>}WCze!yQzHt!dJa@9u7^a7Kmf)rP9@;u@0cl7|-_IZ@6w}wAn z4?>e7jG{NCW{wEXapFW7Q=3&CJzl&lJJ#&A*ZFUI?mWMGN~id<*J-rRd7?-3xQ~~+ zI6gA$mz&GF%>sz`6z}8T$*>pQwcH^j*}O1h!^rIS#8vdm%tytzA-!>CaDiGri{~^* zA~}3`QG>}}M;H2khD|-0Bs=oysqtxc7jp`VHdhcD_HBu`bD6bwV(Vv4Wr_uq zyd6hL--!KYfciYyfjsBhlou@`x&LHU;uw3QeqV$cl#XJaDimQ$zn1JJNiM&@ebj$C z`b~uFRs2>hbTEA*XFbVy3@r&6^c+hu-OK6Im0d&T)Myv^%@hFi{dC>YoFd8jOU1%a zx6|tRgT%5j!sc5|~KuGB1wbp3$_r)`bL2I-t--6c>1TVg@&GhsG zJ^)Y-%UX&VDIhz#vYa>Q3EIeiex$KrLDsCYk{FK)i-&E=ek0j4y0@<|-(B96$9HCo?q~SG) z6SS`Blu2CEtzfEZgAwOyt+h@|&;s8@1pXc{kI{A?W)E15z>W-qpyCd}2B<;W$9AD;#^%&H^Z4!$_F7% z^GqidPhB!WG_i(^)7AwA{qtkcSK0E2#7t1rpq_7l4zT7oybM_8cpXSONgJ~WQrc}e zQntJ}!J8W z9Q~5^<@C=nT~V2sIPv<6puD~zoy5I3r%No^PZ%B#lKI8)gE|AsGqbh!`0Oz$!|374CIQCvXZoM%ieUKN2bVEgK36ig_v<2 z=mDM!Iux7;sd1r-A{CUO*S#f-=*RV)BZTheic>!?QdWKDjcg`<2}Jo?uU~FVLI#Z0 z2NPm$FJp5_0y?)T>%2t1MwC}msEj2bopLG!@s$1B-5gPD?hdT;I3xA0 zHqIS%i8}NbD7NcezXeN27d4+Av)~(EdN)(&{9a8&_A65-ojcYe2w}2$@`^G%0a2-a z$OOd!c5pF;FJ`9W5 z57a8`?i?VChRqCHQQ)-x=$Esv+z&SiG02!TJ2bX5tMRWb1KId6f51PSM-tE%?>~ba z(N>rZW$?unh)OJ%Bih!$jWJbl4#4GiNG=4?lKa4H48$PzQE&r@DmYdZuTLDuA2m~K*JE8_)$t5>(qoB6uGZbv9_MUV@|1s`fw>I{AnVsmwN#?Q$ z*rwjAn+7TcKn{Unw4(gu=WI_f)`2JykmD9?33vl^Q4+sNZS6tu+gShDQ3odd!BsT- zJ31q3VfqgNfT?LN${eA|#@Mt!vYLi{AfS*t64@v@V$YezLRcsj!YC91pwHp{`iJf_ zE}QY!u>~3tlsB$xbu=*Ty_tG&C8lG@ghKL%STLKcF$KTfl;&*>pSxc!8a9fkPu@6X zQhkn0Rg1V0AdkVLX@X`8YL%}A^$zn=gF-m(0SdouQj6REEYytHG2|BWpy<>1LD&<6 z{l;n?&0Haaoki=#D|ug(M)QRmvDXDIsBZs~KiZT!<9zK9n+w1o#+}H`` zb_O@t(;!MDcm7Igqv><*CqPuk#G!mOydSfPbE~|A))q!%_P6P`_CMyB zl^mB2aH+4BeYlA}PB1JP>gO#}cUm1kq>(^;OvhI4vG>55$X&nczM7P3+-*eOTGmLA zcudE4y;^O}=_YCQC$_y#HSL+>*oVH`c3j_cKq)9jTgjRsPedfFApR=8{@vSh`&aSJ z+#68Wz8tzBcaJ8qH>dph_vmb)1!o=4j()2*dGsAUU<*p3;I&ki!Vd_}PxT}frQ&@h zYU+pWBH_CL$CTcK5ylc|91n?g3DX*UH7w=vmQuTvGjg4M ziumqueC2>nN^WGT2+1_#$DFN(fE)r|8S{4^#v)8^g)Ul*MA!+A|9>n8C%wcfMJXf-dIS zDSdwM-q4Ta6)J9HiK#j7f{-jhc6d$7UcA3|5=frzZX_`Z09-V6?}Ww4y((8L`0ow zhKVj_!>y~YqpLcZ3&2;dpE)z9Tw{AA zmg95gb((V;_~A2M@H=BK6L-A!XM(0`Lm4Jg_oajuk~qFpV6G~bRWV@2rn2HVY^eqr z3F<`Q@!sR^)HX-5M-+AUbjW^YcAp8bGzAS|F*-Q~-%G^w1*n8aGy1~pA z{0!UW7Uy$kO>6NSbrNWZ;$(8B_k3i>1lBJ9smb#xP%VV0v8pHv+r@O@3@vBg@CY+M zKWU4TnjzuyNbf@39nSikVo&88%8CMYx7fX=_5y<+b|&RWJsE7)Ok?^UfX0FE7rh&p zgWdl~)HoC)zR$~_f`(H`?-eo;!rQX>6{>?4NmLts6xNZP-OpzM>wxeH;(dK zy#%-ZQ|}MKcr=CF4MfooKJ#k4`@PtH+aqL|??i(rHnYN*daWV($$~zARwti6#ls3& z!d^skPaTc+47K-EPp1pDlDPaZ!>DaV45rfAR=VhsoIs(S>-%GZ#%;pH?OQnfBZ*5b 
zIJ9QWz>Cs_W?`w>djo`FN2}YP6zJ2?lpY9PPktpWf|saEv3tKv^^4xarX1jBX)1*s zhdekX$J2q>VIGlHAREi#aX;m@;%^$7nPc`<2(?_^VE)(abKHmAGxFdWVF z7I+$L>2Pn#{k_&+u$shee^_d=kcB=*cb=oJ*Xxw|K;f}86M-kH*6uyEu!)0cnxZPS zUq-gcwiRK*KU&T^*Qc|&*3pFCdl-AXfGE~_--sX8#2=_@ddL`Ai1S;I$h=P%H)?{e z;Q>^`gYGwx%z0sug1@g1QLlqq5c`>Nx7r1K+9I;6IvE#mDDDhxBJy5KU$d*!v^mDO zO6$}bp>~@smVipG!N{j)6@9wjr050!+VGz-`(!n`*V8$m!{E{PJrW~5u| z@an0Q>f4hRsRd~E`n7SVmk}>W+PO&)QIzqOZC}Nlhl%=>Ih@*_qaInKa~M6;cY-Lvz8@M+-`^hsm6p z3($aa#ID79Tv>R#xPWZy^&q8R5hBEl7H9KCzNI0jDX`2Y?@JHU$-^c+22xhUCp_@Dnp?Qnfrz|R z6EsfXG$6D*QM}=B`i?PjVq@DO0(ztZJ^MOzSvdq;t%iItDrwzdMo?(zxx(~tB~s5u zC>Tf@lO~>i2klnb(ZtS@m} zZoWUWjlZg!S^wy}{ZBP^w-=Ji6o3_}%t2%OY@O);c-X(~b@heyG)pV& za$oA<&dc(*pQOh}KNu3~y@59kBV?3IaNbbiT{^xs`%;?A5RoPJeE7MZv;!UOr0~>) zq6n=2ufO1?2`o*mlr7nRp5ZHb1X8bL51>dSq^Qtq-uK2`iJ(oMXF{3_)kJ;KZ=({(C$)Y;<7vY0Hc-cd`1_4dBh#D)8|s^ zWJE@B>kr*cyEZlF1->E$O2QK-+HS@W#^U$=GMBMr=FwwLWeaY(EKvM!mC;*4y7~DM zXoC^|p|Su^Hl{T|(n7&U=fBHS_-R`Mx1$)7COU^;dw2w50oVAG>nQ+3jX1RY^poOX9Iw_ke?av$sFvtQr$6uV4-X5u-JK z@LN(^2C@D?RFtY>N7>dGT$eh$jRIms1l&nmsZWCGKNi7!J(kop{Nrm*EhrfIaQ}K! z{Lyu!+8i9yH-Jr4X@HeBNxhYK9p}yI!R^Y~0dW-fDF$ue-BN`YLK*v$KaD*VAVK3) z0qHNe{Unx{p3H0G9PI*gi$g31ps!o)wRM_m``IZkqwK;n$Q3Mvtw^=COmHnfOJvM= zy@y;78@%DpszSH*%~k^N#qLa*#a!nWTcMb$&F#&F@y7*#N-E)14%dfa4PG^ zWyBXaUr?G%G23(%o(SMxJOVm{w>ja?-8c3IH(54HsY0BUUw`B@;;pVGIB8AZfLMd= zUrrnyu5BwFr4u(dYo&iXWL!V~KJoDV5UhiOJ?xUd2JW7Lz$peVmTSedgUdK+W0uW zrDvIM2hwH!Nl?9K`{%ungNqwfgwNq&Wo?7U$AC2Lx2J2sbtc2xja!1JRSM^^CT4r57S5^9 z3q`8aUxKOfU$J7!kKw!0;p}^e5zs*Nt>6xEYvIV>%8fTt?7mQ+$M#tn%* zp}N_%0y7xFgL7!-y6IJ?kihRoJAi24ZV~uXz1(t#)GAHL{TKsLi0x*pKK(OsqMPXL zm|QQw0M?0vVn#txc^#@2`D)Wn>^+DFow$^lzXPWpRc_h&+}o*zNNk~o{O(~ScNw=Z zi@(kdoGT3%Ecx{_Fb*OiCog13)t#7-cOOK=y%r0PM_>AU@}I;y-XxkjcQNOdYNX0a z`AneOR+8KOXYsTjsyC{~HrQi?Yy5B1f3I6ktK7?0rt;xeoih3O?8-4+zlMB+rb;>9 z$nN4d(ibXXmSgv|3-Xj3dWw}!GE@?Mh!e9;t=?FiqFJV87$wS3g`|D6e$!-^88InA zGxYk*#J0&r_zu?Iot7zYeu0jOZqg?a*Ww~;;D9#JGL^atg35~ zr$6`}CmfxBv7_waddDf6Q^fa8^N`;T5(HIT4;8?q+X!tp+fk3Vg=f0GFH^CZv90e? 
z*PWMZVmYavF0Kl1Pf>|n)66);CO^gKcZ*uKPuQ_qABbgtV7t{Rwf5tX&JW-c<}H|J zz`9#OdS-E~F8q-V+GVVhFEiRirzD>-hIp(7Qd;B^X7kCzRPIY(F8Bph(5>DWoLw4T ze%>%z8YM57ikJ4_@U-2NZ6zt~DGKY<4@FQLvLgAAnpol~^l#SB_&pk@ zzRv4+^F7X8S=+lK;&JCOBA>Vr%NNUKtb7#I(&Eo)x0>uMdWBgLqt|38bfw%%L;hip zNks>yv}mIJ71xt)Zn*@*o{5$C)d|zsH3`iuyspb7L8~nc9ttgsaO+0=+=d9N30Y43 z>=QrXlSh+%;t&Q1n{D@^kH%8O-pGDyv71 zVpXk~Zi&GX2D1etIxPh4W!Gt_P@d^HsOt3L5RH@KKjM+uyb|+VbDYyfEVr5wQ`1v= zr=G-e+x%widNaMH_B^vREt~w+U~$g2&z$RZ{3{X324{>RcG2aAn!gr2(l?`IGZ~VQIIbrHh7x}`=D_L5Vsp2rgU+$1fjJNg)*@ych$BVlI)q!l37Du zTvtfJsjLYqOL-~9$7@(bx*9UQ1dU=ddN=I7>w=hbKEv+~t!_T4h+12_R zE85PjJ-ykE{vno!|C+z=h)X%DI{p=5>0bxI8Mm|vsQkvVsvmWlo82%Cv%`GdKBi4* zP0>hD*4{gC^bW~4UARernf!Vk+J@W$s@I{HZm;CgQub&S=h1u|Z7nKdV^Z>QY*3}( za+fSCbmNSEFFK$2Zl@c2fkBP?`F7&1X7_L|l&i1&$b---m*c$qzWYASzcd{wv9c3UpOQR?ZAvle3p<%|9YkV&DHh*ljcxk zuX*$@ZD;7otLKU67Qhwpy8G}&Q_m19O{qzJ@GJR8lsw@g?gK6@xznz|>@boQwR;Gb z7_(ppUsK28SCBQTAnm{Emr-ovV#*(er(Cnv@h!Niy{slt*-S2|Ot`2*De2sKWtER8 z#VOIkf4+s~ws>nueu6r*rZ;fjQ4)&)K629y* z3?4RU#LBNlm1(`F+3H>C#ZD?P!#1X1b zKETkQ(JwK*!x&42bys!ZthE`Arq zNdEVyv3)M)Jl9`DCMFBk-(p%c4eYt`OMX|nyJb>rE$)fk)J9fzPE$%BPdHvx4k1ei zVzET{V85q~chKWUj5=QQLIzPkj;!rr0d84}s~M>wPV?##8i}Q()b25zr4EL)cL7?t zC=oeBCAr1joQAq9{^NT@#4K~4#5M4xeLV!+2IZ=BcqkloCa_U1ZX8W#uR#@}gL|nx zdt#B~;@%R2+RD47dRyr~$VOC>gk*TxXuKKLe6hdJK87Xz2ld;pkh7gaIU1<4E;)%D=?*U^K8jxYpZ zij>bg-RK05Y=NQAk%cR0yb94TlI24*QAeTgqzlVHN@ZQ7rfU{of`&oB5%MUa0M6uY zE2;ZR*=`J-<;%`U@zeVwkea3jyl2^w!VHK4(wv0=miHZncIHYRdt_fcm#XgFQ2)3q z+Fy(Lv1uMZVhHpHV(qp!^zhghwjrXOG*hG=fBL!56J^yk0rT-z~#n(r0`hVQBZgwiY@qS6)9D zD|~>!)vJ)?JL$y4ZTrS{^1Aa3e0A+6At&Et`_A)tPZ~Y;ZW&+!!v`RGA;AoUlEp0U zJPxAG^rfJi;J=xQr^hZA2IFm$KjBdEUzKY*F1-S7%`#RPCRnVZT_ZCiB;Q)8iSw0B z=MDz6sUldK&#c~5dIpo?y7!9d+x7o+rtXAPCOd|$) z+`R%YeX7$qw4l@&B2+(U$4o*%1CE3d@)MlFy*!|hs4>CA(mw;tNgWCNXmfRtS*L20 zh9N7@Jpg_fgMKIrpeccBQ0*sNLDI^<3VQ6`cka(-46#S&4||}n2~Xb`3?&YZ{r4@Yh7#|qdD9a_Wf;O0+5p_;$^pe)NMr)$=sB{10``R(i`((s+s-_*vlB z-7FKXP(h~oPV$%URa*`39t=t2-*}8fN93K=n978C9Y5tB06?E)-%A0OQBVGAI_^;dfIi*Zv zRQT1j^&NvekZ?+G5r8w4f?qrvO09|ywAuqTg4&d^CIvP)($iH=fTYf`BM)io?GWNH zL@)@o99+vFhafCs7MYwR4`j!rzmCexOU*80-+ba`ri09|jm~j{1}nx8Kv^DhJn>%+ zdNm*!T*Bi~M0rW8zzR)*g>g9!ZYDvwF2!|{Nf>F$l(-+tg%`RzT^vrq^#VvnO;J#4 zbhQL-VG)|~72yqQ6`8?oXVTpDbcZ&q+>1Vpbvh^8M%tzqO{%21`9!P+sK=_THGIc( z(zMA2)LJfHPXO(M3VJ{tfTC*p{=&a6lnBB2Z#)=PK!gSlU=8-tO+?gk=oYp`U`Y3m zgLf|4*mPe{SO{P+)ba6XF6%1X?cAIX>=rQ6woDNc35(O_JUe_Lw@an)lJq%NMDC(H zg2H3NVB)H9VoFPhapB=Bs&v??j& z0za;F2DRW9(PG@M+=4nG$Qk!kDFeHtM7&B#&V2I1nO)Di)J%Re61%ApDleti5n?j^ z3fy~D$C8w%Si)mK}a$aXj~U`tNK1-@ZwOOjWU~ zP5*V>|6a(SDyld1oIGRSipkF1zn|=X{0~HhNO*TZHT~f4t^el%{Q28L=E?K2>mC!0 zT7RD3-!ks^4{TPbE`f17AC{Z`e!jm;1o=VY5@7~oQ8+FC!?^z}l>fej-o$e>CQH9| zAAkRQH~z;DcjVAkFt22Vyx03H!bGa$zkmFkSR~|igv7l+-Td=h{;ZgPE+m!^MYyf^ z|HWGn%j}0l<7gGAckUocN=aQDhzfOX!WEb)%NWmc`mc+JOg#p;dIHj~0N?$-UCfMS zP`ZsZJbI|r;AR)u-s9>HXtOlDp!FJHX+`Fbx;Ov329J{9Lo1Zc3ciQjQ~I}PNaPgGw$J_2wsBte^pfb&cbfnivgxd#i!FD7L0rr4vo;Q(l0?Cy1fXpsL zey7RsqU^uE>A$Z_2ziVtY5d>g(0XHA+JY`>K@yedBP2vNA8HuS-)|Dp63ebJ3XF<< zLlcC#6#xNH0Nbffv0>A_RxBJ=B=xlbiR2-^<_5|nO8E&SD`ymbq!FG*!Qr6v7Z z=1?bM_lXrNmfa7k*H~DE-upQ|(>mX5z<$mC&^hA9$2%Y`)PMkN`Cei5GM80#Er891 zmR|gig{BmJjzMpH(fdFoZ;LPK(1fmsi5ckf0oJk#T%Ldmt)rW?nnc21^dE|62 zgo%73l@Eq9gTUkM6ZB~gcq(|Z8_~!92guCxG32w=h{MU|pgs-obdFt&>OANSULgne z;0+${O}J+~Hj1yjKy|Ck@g2hb*aw%VOr}fjRV%r!j>wuw2PmJ}xYHr^$r_XmSg*xi zdrTOVTlBs3I}zNh{eqhQJrIj{6q4&=Mj=GK$4SJ%*~DtU_<}Kbt8hTA3Vh zfeLeHSTkNb?{#>&qT>zUu)fqW@>tEu{`nJNNV7?8-@M&T_b`YFojw(~KSaL;>RKkt z!L-x~$nmp11ySCzb)oP5sq2s5dX#&qrt06`oie-2wdSJ5mr6fKLSva3?d^f@`k 
zYFhC$7CQc1W+L4#{v}B2?$AJWpc_%rw^bTBV#x85GlfP5tBEsy<9r&|CnE+#kzrAg zfSj|1@*et-X>epjkOOrgw|DDuk^&U$%4wfW)GM&vu3GwXv{&AQh}R8I*@sS1LwMSr zKJPAvFp0*4qG`&h@eL>BTxAQ_nS*Iwl(uPTBlH?mUcctX3m|?{{mjNAR#0PnbhXkb`jR7sxBakO6uz_*S%Q^4Ek5xyr$d&x}y2Lxo$PTD-}mo=&FMiZ_5n zX7Zo|n7jL1;t(#ye+GuZ@*~z12mkloo_J^RYKpQ#Z9hJ_oT)-Xz1#u)X7Uz441L(J z#-KTz$^PdA_`QLZz%kMnM?D=!;|EDSf;|+2EKV4=`G0XEG-WicXw#M9BHhXksOaJr zC8RZj(#WDmFAVegz=S0$bX>=&J9OrXh4D6Byn`Y!*`GIvN8I0rV!G^b)h!Zv z_CTKHv_`OpJxOA5g0z&Xg$lI3H-)Nxov|Rf$l1f8WXW)}P5kuey8|KTL;` z(u_DX&}HWZ-f?)}5pD+6-B?Ct7#j^E0}Q?Q#yUV6OMEyW#h2oF1~jI3=EKi#!(dFS z?+6pbyy-DHwpbDWttFqJIbDY13%mijSFsk?Fr_tB0O*JgcR!1yI?yjojqW%Eg68&! z9~~jX;qc;`1m!R~gNJBVwv(^7@{RCHG@-WV!fW{*utF1Srh{Ap>&UxYiZTuq%V|tJ zii@+YBJW=j=tcWJ{JNx`qa5zvRQ=%=76vDE^`D~&7M zoPe?i)vRt#-UW~C&BmSE2-w4R!YH}Af}xk|E0PbNb>OPp)x<3MNdEB;8jQ?+w0l!` zbCuNF!8}*ic#u+|2vB-Cd2UHwCyX<; z*E1LkMiF&;)t8<{o9@^VasHSoG7f1~S1pfw>GDm6HmXf95*YC>>@AXikx`}6ftW~=>- zby{WkMRk=;r82*%>UEL))#y@2b(Q%iZS&HE%I}S ziDq%-hHC7m**hCx8BydC^&Ze5XO$$p7i#d;ILfc0WH1d?Qa<`E7$+tLQTBrQSdKMHTt^gH9}h66|FvkL5WR~>76y^I zCy|1*L@DkfduJrgS!}$^vV-yA5^RDR9yB=;%XmKhCN8t$P5q@=u>&xZAb0mzZU1G< zOsdpVOyOTouIAEX6Nu|0f+1cSLN&sYx$ZK|O!O=_iTNo9vSaiQxP&>F!3+)^tFlE} zL!PQ@KN6HB7(AkHVW*#G6=t9h$Q2E#t2Z7s;-2TjOLP9sNFaqUT0+{K&Jf$=g!DBxASW?8`?Ayq!;w zO)AY4Km6??%W5m%DIkg}5H;-GB@{Y%CDAb4dP*|gKv5~-zY1hss5`G-)G1Ozog^B% zIN_N@zkvEJ>kxgQ>^z=e%k$kYA?H#I4pA1LezH*BDDS3CdzS~(1M?A@@va^~L1EY3 z!FFeBn2RR2cuVp}`vz#G?%0xnpz;nzR#eMfI!Yax+{< zwfQ2$ekl2kT=4cD!#92^4Q3t-ZLX4sI|-Qbh*@WzgDiuLx;S*b-NTQdNJS7#N`uRR zXB7@E3(gn5`{gs+pY*u+cDlRi+$w`n+_^SjZc0B{A8&Ya>LGf{qIv%5O;PslQ#VM-3^&8Csd8Q6UtgB9KigHO}Ztt%+ox^pM8$Y=RSeOds{L z>vj{USxUXQ`u6{B$xL%_JSAXYs3NH|0HmmSJQ5p3oksLZC&K}BLhke35jSP=KF|^7 z_}!<$WN7?xUP`VfC;ooGssb{v!!cV?%%@{z_G}cKY;3ywn6o~^(__g#d_x-999d83 z$Xp-!4j~CN!QP+?F?GC!P-?mjxKvcs(3Rw>hHkdkE3f>7QH*h)P=*)3gp~g@<9SNe z;>i=oXFGAnq?nJ0~(V2BUxpCBVMur6cXY6jFRemjs7nZM(b#yxiwEuuO?u@$U7i>(oYgSJ4!A|B{a19Z2Kyq}d0zM?8Rz^JxWi&QJq7L; z<`?{z#F#1SJao@aPr6u`qHs~xFNDdq%=i0$I4}OdAI+v&AL3?Nq%b}lbrAX+Ksxq0 z!rKgoYW~iG$}(lNml(qIFF_INAJd-75k^M>Mjyu>K~Vx-zs^sD?91dbR9=TL8gMve%uvqN z-Ghnb>q5lCggWJ>K~P!7Y1M*zv>@q*&6M-+vI4L@O~w1{R_7WY0ba3!wLA65C~`!P zMfpccWs28~0NQ7|UuO=V@qc;EEeHJc$zK`7IG9qFSIvNi=)rtj(A_N<2(Agc$nSSq z0py_`gmuMjkOcP{MoYg>KRk zn018d5K-29&_7{D6e@}$;{w8V)6dp!QNKTGhArj~&q62h2g;BG-Ft;J_wHRl(BATY zayR>Jp#xBaX9M?kil7>(nC%N;x2w69izfz}`rKK_F3eqYh3y1m4e^kU(R!}W533d5 zyD`@I*x40s+iRd6S*oS^^g&I&{tZUItAkzgwJ`MUW6*Wm{q*8-ON2yxB# z>}Pt)1zOPS=l|9~>muk9n0YsP`dr!yUL<0crvT}Zg2~PBFOJ6%dXC!I#b1lS*6<%| zH@yw&ue4rD-K&%>?2E#!1V=$j&jA~JN?|rZOh>!NFkVR~>0f6EfdnkEe{MLvpFXB? zx|qQRFz0Sh{cjpDvPIP2m);_xm*HEBi@dM}QdcMC-zC7N- z0&3NL%XFGh#yAy_vO-9~+SkgvO%pUrWKahYi2bX;)P`p%^p-h@VeIg`M$1R|1;$Y3i7+0I?8HH``fJ;?0s!y#<&FG`}JAXCevU*O)ea3KBz4<)W#jO01qwZw1|tS6HWw2nQ6*V z=f(+n-f72H9ci*^dFl5vj4PJDna63{pNR)A%FhATx6g)*9T}ySoeG2>GE)@W~*hyxlv;wfx!@*g!kAhkM zU@>UKKk(?##!Wc?fxFN70Rnu-WLJosHNl9+MS|SrWpb36y{d@)4D?qs`D7(p_2^j5pasg+XZ^rpYH{elV-*?Y8hciUwdoQbJuS z6Qjtz!H{|7L2Es2y4Q9XrMQpRXrWm_B(<~W)!)l=QLfsd_@Ct&6Cv%}JMY;$DEfRE zkk-5I7DO`2bk6-e4=9Em5)#yd*a7d3un4XB`8Q z_FmXT=hj6haI`09{lrkaTY#o^Ib_eATDR^TEJCzcdNF0gam(JU_scv^uUL9Q2n zp;~Dd9&hJNml}x=Cdm@Xi9sA&kgUAL21DZ85H9zg7&8EpHLV zJdR%-;adplae!JTAAXSeGy4;S8X1xYiU@Hdt8=CvX>Yv>5ir}JEa_y%X@8^K6w?ql zxeNAwmH+}p{-!3scQLilS^=HL|GrlEZn9GC0BvzDa0pEndf%=?yn7!=fODi(q*osz z0V!Z88D4_IJtXz?S5Erh(G;5io-`I_AI}cpCj+BECME{IsMNdIAHURfUWqj>0R5D! 
zA~f$H+X8Y|f!3+=8OOSBs$T*p+B9B!n8ik@x91I^k>qB2B7E-}F?b825^gg7O>jeH*HOedlH@f7$vN%N;NA z;8m-zdh27Z8I$3a`!>oGI<(2PHKicTe0j1;ag+J;_OHUoLe>{^EpaAxp%%aW>KhvM z65hp}9mk~0>GW|{Gd>|RwribFV|g#ugt(t|>sC>~Q1g5#i$?UbY}IFJLKBMm7h*;C zgHq`^VIfaDKjI3?LUVnrxci?)Tiqng^p|*)iwEKf)ZW*g-rk6pwMj8P<>8tA;le!M1|d)%(wgsH!D%ryf5B1swuc2;mqaKu~vfn)MD z`h(UB@UKHt+JWIgaa<6G3K3Fmfb6+k)GIfQ#9WIL952ZMzEWGQ7(?(6g!+mAtm1qY za&?|lWI*?5#!8|ZL9C;}a|kq;N9!M7;Lt`Dk^{fwkC}{h5NrMdN+Z#hY6`H`q}hB{ zbO~0&OK>s_gDlB;vySL#%-1}%13uSX&xve%HZeOwwEa#>T>ru zeC>B8zkCB8_zpyDefvBiNIK)mw0Iu$HHG*~vY`DpC{n?_iKAgLh$SmY;AneJ z9}*NqzlvlC=&_|xeE+`5;Mj9B^)_m{^{4~G(b3W-sP0Ky#XZkzi4uQTo8!IW#hs1? zgSMFhG4Ic9TGRC>SNo|Qm}NCxvxmJi=TFl6XM5?R6T2Lv>XY^R1U{Zf6;^`7Ymz$d z;a%(ez7U3hE=>dPuDdz|;MNm**hWdbtx5QTmo-5VuH_J1s0O`eVzFv9o7e zu?~HWrLFi1!$G+}FA?lhGp!}Hh(59^)M1X`x6rA)5jU+Fk0=Yhv!7enIZ@Mxr=IAt z5&bP1LmxpTMThI&zE{+o5lDs=~ z63?LI*S5~hHYKz-aTD`RA1`SzHEd?^?uLu#t!QvJzwZ7e^CX5kqh6_FHg_o_)Pc_? z-wL_k&Sri2HOzeJ$oz4-dR01Ia@7^xc~1YPveyCr^5!?kt->RQ9NK%}b9x_9KL{^A zBIyna%Nh>h?@GaX35^2H9#ys-%GyQmWldR3wCC>QH;jx|Tepd=L;+ARn4zM(5)$5;8gZsDgVip2Bes=PFyh$CMk@&uP{7vwwi0_nab$UCSB?ULzaY^orq!ax3~Vh_d-m)F#Yp?+h`C6Cm8cvq`=#0O^$WSJGgdU3 z2Y-Kd@NQxnTo(@v&7sShhrhm0iYb>ECs+==nT3o*YSq1Q(Otp-i%%o{?En5J;Ir9T zAE58hbUmSbTS4kv2lxKr9QhLmy9IF?!YD27CR8boy)QfN680qq+!RL2jmB&5O&@x9+N8Uu1V#W}?HoI{O&CG_Do$a#3FA=wQkn`IFfHQmn)y!qQ`SSodR zMpuzMkL(x9rd!+dTjpeSy;g6|$R_BP-pJKqk!KAc z*bDOGA0{fV{(Q5l5S4FBjx>*w@$tYT=J+IWEc;CV2fsS1IZ6mK9!HApw~2Zt*yY&^ z!-R|$#vj~!RN6^H;Y5>*)H8wR6m5Xfj5^AhM70WA_xr8^B|H%M@vdqVXWsC*zx&sZ zwSslo%ms}sF+?4|rJ1a!_eRry=ki(=sqdTFpUagNY$-dQipDM%_W$PwJphBqVLvq^ z zdk}VK9ji@>glh4DHp$hR$Jne@qR*l=0HK%iX0!u1z(=?zfjrfs%#-2QHF#k3${$^L zd7Xf&u@+#M^|@9o+yGNNV?rTH<8*B0v?&^CM>%-|-U_MpbG3@vhN=>7m9``1{kFhX z9%v%r4$_!r&Y>1HD^W;bvucahH{16UFvBV9-JxQ4%2Z&!GD1DWo2`5j{Fs( zUDn@-I%KdOQ_DLouf1G$TK*rm_Z&g|uol5W(J|uDkxz>cwg8TswneXd(*&a6dN}Og zg*5ffW7`i$jzDuJn1!B ze;-sGg3jcKgm)UhQ()yJH+8Qqc|Qq~LmLhBhqM>_d>`D43VhQ_291Ktk*j}h z%uKM?m9l-&?&x69f7YR_m_V~Qsn$Pl^d2d#=@j!GtkDnbXth^$V@?~UQexsnA6wgB?~h(hKlu}px2v=_czUeTwL5rX zZz*`` z8o=~GD`YkB>FE}Dh#m9tOM{YuVsK+tnww=fKe9Kgpb~sQQu*)Ar+aaK1d1-r*I)A# z@1gcyHvS;{V8h(YROBXqTx+ARRQ1pxjhEOhY0PgCdv@0P`=Y!loAg4A;N23e>~CUQ)!)z zK7f>)+IT&eE-)d{)(R+3SSNac`e-^8OwDe<@!19!X$$mS3sL7;mVZFjv$pxyFVjF* zyu>^4mcS9ni1=D>u@!8(6mBH#=?Y`!KKl2$_|MZ}C38c$0JJevRL8(QOObZpjMrkk zSOY!zO%Rh?WH)=GP(iN#@!a(l->hGblp`OiA5y1ZH@_&ne+0Uqb8tRp_452w+ycGJ zL6HMtQ&dMme;f4K+aB)kngPrEKFv7TRwll}q{9(Ke}o!A{WA?(2kk>|MebgnEl}p~ z;~^jl<``6nYPzrT9AacN@;dW$p-(W$ zB-}xuL6R#7teOT|n{C;*rd)*@Whd8y`FTAvp!MDY zEr%TlyQ@&ZvOBN@`TDciK?9ATW14ckW-b1XMsfC{3+s1tGLCW-WGw~p_{w3)izLPg zO>97Taz&iq0LC5-=>le7L~(eu=>$9|ht45DAz_(!dG`f9TQ&)hYPXpP;+@0uw4v;2d#) z{CJm6@C&e&c0hu2sDqi20sxB(PWG}Mu6qiPoBOwF)+ihCd|0S0z~hs?bU|zw!g;}8 zN}HaCnQ4?y1wsovQ;@>8ieT{a~p&jSpjdd4R})W zjtRBi1U}?CrbTT*pvlz)oQLX}TkRS@a+asBv3wD^x+Tzf@ zICc<{=l{M#Ab`kFAa1nC_Nbe=#SzPWEvY^E2t0=h_0nZW)`u>b1_hKiE7^vL6C+sJz*s@4~<<6()o*+@t|owhb?8pfoy4T8bF%H zBPWh~za#W93`HfA!0RUvf&nmZQ)JE^Jrg7c(7$OkuzGeH`FSC!IjFM{vq(MyzA@tf zqv>%&0uxM*uU- zI)FO>pSVN8GbhQM`8rV)_5gg)x@#23(m1RAN;C*FKn!~CW{8Bb_biN%5KW2`)6l^U zZX46z(Rx-5$rX;%aeKg4TLSl+Hvmyu<5b&ZXVuMEEKbzuoKajMM9xX@_MXnRO3*@| z2q~tRz*3=^Ey)Nb#3Ec3PQWSTz>H|-e+4QE-bQvB9D5K5_NO5|(PyEjeP^Y`q z?DT`!>P24;bUk!(q+IKL4ccOwPpNKuU038~mvCJG%2+p2tufaNH{%!tjV5S5aS z4d9bc7rRg|QbW~*CqISB*TmR`sj!bttIANUNZFxVITeAGt0yb9?DNIdHshQ5sKoM!&<#xUm4oqsEE! 
zpOz|@yLyPD%t-M0qz8E$skG=TwN(iAbpMfAxYqhHK%jrY$KZGgBzR4Xy`|MEM_}OB zft~?@5>Mpzi^;1mqtxJh%b+%<@v{IRhzGrJD}y8rj$XiPCsNx-7NA34D4cBxC0Bsb zLD$sFOG5DI<)0q_=^GpVi=Mg&yM1upIzk!g$2cu^>^$fX(31z|m8*Av$z#2Kulf;C zm=smR$ai~MmO=(l18^e*W*0`s;VzIOyrqA=ZIS=YGL!UCfU3q>REF%%$i+D8Jb#IH6+h+`krzB zj-l(i_s1%9z9Hhz+KfJ@u}xG$Lhxp|3IrEdfj;ZD_H|Xj>$rPO@EAB12ReJV(NUxD z)oJiCE~QxP>cP`L;yd05N*&bBfH?nCS+RS~EP<~c-eIAq{J!&Dd>umd>J$7DkkzKm zq(&C>&*Gxv+8=6WPxfc*5RG~l2NDs}JV-M#R?PoSyeJ%huPGmA`+#IH>p8dbW63tw za2j+i0<)jL*L1qMkc`bQcFnQxMureAdF6g2BK}Nh*y^;<)=@K_7)zR`HJOVMU*14) zCx}{0yP>Y!7M4v+T}@YlC#65SU;jK9CQX5~NitFzH<(J^4GRc`8yi}u#r2xj#qrW1 zN5qxSp%PY%41c~OP!SZIuK1w35$@2HxfI2Fgk)_cp4}D05~hn@Kn5s5`RjPWG{b=x z&z8-a;Z4nZ$l*(Hn0t^p$H;3EuOd66j16WgQ{T%G1n#&n8zFzG87|*ilzIYtfP8|_ z1!LGx$<#ygqEh8ovP+LJv9nl1IFNT*Z=r&BN*5@U>!C0aYj!mgQliEe6j#Wl=g;gz zF#KNWFtNz+$Zv(uzg>IrrP)y971L8*R-#8s)P6TYsGSl{x)eV--9Zor5G6U|`JMZ9 ze`)_V$@{zQ;h!8f+Aza_Ml-!6b=;yS=!UFHA&=~*)n!b!1NsTyD0M!Z);2r1*`iRG z7x4Pvl{oRexBp{J&BWb6!CFZSJS zQf#Vb?);c|+`w~As{~9w)-jlxgk@C3LC&i~ly&0*U6-lSAp7U;$@pmi&GzLZkFANXEm zkLL@?T9WTRf;kx{RB_&37ra1K3t))R<4XDHV`y6N*6x@ey6+6BDvoVL60+g#l8$ z!5I{^X{JunHUzy&N1j*v=|S}@vL0b7Qq6W}p=0W%0rdG*%YPOs`{FRaO1_3tb=?_h zeY8w_NIQ9P!V@{N2&e~yYQ$B8OX>UI>aq8HcXk8-zG%qlzqg9nX8TZ@ilaLDSo7ie zE3LRCK0Ix4sxJ0+oX~LyWQZOl;OcUEWk{E`rel=9tM`^roV!nk-k1Xs;z4Y}@}k#O z1H-xFk5g{&u8h;tg~@cWIcmn-GL!Jpmp;T^J@vu+%?NA@F!1q;coT1I8%*Z=4eIi3 zyPGsAMuA3^$`#vgjpH?Kj}tE*eXX1|KQcta#p&;0eOj61l3Wrl)SUEG(=tGekM_MR zkA$jehm8JEhJDqhos6i8c=!1(UTB)1q5s8YTGej1>-8RQgY%PX9*-}LRN8-0jiPkp z^u`6pinuz4o3`GGUSi{8U_7=KKDwSk+!Ce%O*eMXdr2!&#G@TIEP_4lTgyc9eL|*F zKb#qC4j;zrEDA?dGl9O~W3U6QfM8c^jdS6FHxbRvb9klSywQTb!L6RY09mmFQ_}!D zIvts5!#lEiJ;oD&0jL7eLp5MN+9ZP;LsnQkN!f4ey$61g*CBL&LhgRX7gv5L?P(AW zH#3MUH(Hc=XdkN`CwYc~pTII#be?9>^xzxpLS;A_{vsyfv%&#QGUTVGkL2bl%7A6I z9(s;d3&6g4u4kQJd0J4{!< z=sW-Nw=p}cb1HX1A8r6!;A8Y19f_>w&ikB0TFJaDMvphm)3>C)?|og6^M5eO z%85mN&wy+<-GQYyBi>&%N=Abc=ttq+ytJ>C52Di~Egcg610Z-mk_Vl9?fUnxQ#1oo zum`dc%I`qwub1*2+Y%hI`vzspX|4bzn*qwAOGQ_@f{tiedbC*l@7xx?_tkgwzB-jH zI@k1Yb-&b7npnYf6b5NZ(;(LjCG%j@Q7sxiswVU75<(li?;s*Dq(6TDriEBXL;rB; z=>7CL5da%%(Qk_Xl8qAutfM1n2HL5s0^WilJUi4MmVFJgr4nH;6}$!v=_SOnXdLGN ze0upOZe%pRg36$N*b}JVPjrx)E44pBHLPotHLQ$S7e`;l9QD1UpnD&BQ!82M+bw+B zh>-ep8-3@9#z=0);0p*i7g z>_YHzGOF30^#!lMh9WmEO2;f40R9s7C?-Z}plcBGStJd3cPLPDf4+L`xr2OXd(hsZrhw_<;$D;PGWT1&y%cxV04*6Ax4i{a$F3z_|M$9mT7%O2P3-;Z$* z3mN_bk1qR#6(YM#^a&W0GYi)*Iqm1_ zP4F|+;}(?uV3yjLTxkLLjBe{Zs5{M0GgeBJ+5Cs3DfofsbOung8jO(zga-234SAB&x6n zqL4Z&j}vM=zCQk?d(M0_p!3}Vpo^_jIGCJg>M;e(yp-`(zR;vgPKq6%+0-r)K=>(x;#fRV>l^0jV<_K_GffAbnD9E@k#TJ^K+gT`qL4yh8bAxIhDNX>}=aG@(mb zYB~L4E$dkXp^G}AoB0)gVHn`UJR{PPr~^grk|iwKIQAgW%oNH2=&;cijoOAXb!jri&6sZ&@Y?i~N)+U#wo4uwjljftY++km%c$t~B*)omGRW0WPr;142t)U<8jx;A zA|fLVC^;%+SxW^GRq)^=(0H+D8?9`JuZ#0M`IQwt4Ng_-y7Oc3d0QYoD=xwcG@VS7 z>YM%$9D!{G6T5|h+jGwgn!h{-vB_p&F&_3(U%2lyVO$-8WpUsBCUsXOb*b7Lm^+=c zOauyO)G0brjUc}}1UI2R*-zVa7~KyZYl-=lbbjz_3GnyzPW5a9W8kh$V*9L2bBueq zTvK_2t5&_h2(u~t|GAQ8wCHg5z*+LhWg!EiHEIDAGbIds+ESJU;AZKiFl&*x5%ZWv z5NW~FA=@?k4hoEEYG?%@$tr!a1(*K=!%DCTJ2kQ5^h@4L0sv@1K&B{kx?Lpt#;R*| zVe2mZ0K*O{7*RGr?k=MIJn4aqYwS4KsmTdoNS`SH=4=Dc{KjWqM5j2&F6o{IsOt7= zn(J2eG)W5B+*p^WX9^)`C(MF<1C(Bvg}{tquaNP*V#x<38@*?*?!A#n7yYPUFk zF7S(zGtq;GUZq+rvu+v36|&<^*>5h3xDQofM*i$OKO z0tGM9SaQ5tOk|QUT&P{PW}=5v2LULHj$7iC4m^Z5aC4oUl2{UH^U4?U}6TXGxQE zr$3SmgnD5G+}r8po5QN+qd)ROtc{)?~zh^x?#+}{_alh{4FQh zN>Cpi?-}_c2ncorC0JcS<>U|F4#Aa}7lMV3DY%`Hy~ncLThYx|?`rJ9iBu8h-pjV6 z)=E&}GNEA@8ml=1R^XbhYQcBlw^)~8*tDzJ?wVko*ak>lNMmu-;W*I;Tla_QYsuCk z#{k`Kt((8=x51}pB1anokL(X&zzqmGtOVo9>p(cGNFm`uX3Q7M99+!}82pEPAOGIW 
z7fjU1K}gONEamV(vd2|)f40%Qy@cd^8LAR~hH&O)-Q8u%ehALa+HM9E7b)iwm!2OG z$kevbW@-9$3`6DgZ4Pnz-QRr56~%bVQ7d;+=Hiz#C;*sEPN2QJ=xy}Xf8>Hv%9NJe z;$&w*okrY92u)ks-AOU8{p!9-Z0s3R>cAHmIeuou0kCv4H#h(`} zl(XaK2Iz=O=R^n3r~F*(eG4EtwRtUMm3-Bk)d!-;px~0bMI5$9 zv;hEra1M$J{8fJJgDgIxoU5gv@F8nXCb^C{eM5_(Vd@rG1XVqv(ki2;Lv7ljmbnKP z`#hOewyA72%Tbj-8kZVUF2q@wRaP;k-6aMCQHG$7$-ZWwyt>h}!uyNY6{Pv_DhNQmOdjRiBP$mBK&UVdCAlaKj9bgEo#+_YYRL=t+!v`F^$K)_o1##s7~SD6Y5cqW z(GhBe-tstj#lV-{3(+|C_rEubEdQ+{JGqp*otNUOu9nU6*Mm>)h0EqSe~7<+(Kf7o z95`ne3UB%okekdaOE!qpa^8yMqr_#9Dh5`SUJCnR)hkCz0$OT$>hk$|p9jb>tgjN) zm?nEP1f}awP7P}~Wya?-HX(-N*n|wv?o>DEld&@@L9y%T8fXeppPp(`78+q1^howu zYwTUeu^_4SA|!|pI34YmH=Lbze=v23 zKXOnm0ZfULbh)xT9Gws4(^>RRF(2X_=W@ghxS00nt$_kEtqBTIlafM`A}OAukfR|> zgRQ10(`&oz0A0pW*bj5y=`$NZ0ModUZx*37+b+ z6j6?+bc)%3y1u>IsN?naY{;lRvdKLL+-<@`eIBO|yLfm@*dJdp(SC?#S84dbxu;Sh zs{H9s@ZBQPtH2x*_2Av*UjQF{52m9!=m1r0>PqWkcPhD2#zP5Ji8~iR=KxdC zyqMT6By|d4!^;<#PZ~gBTiYVKS&r3tKb!xEhpRxlBV2g;D;v*8s<#)P7BNWHo+yiO zHpiVdg*@oNTa;cSQNaI|pU#_1rhKe5*l&B@af~L~U}Ix_yMt~_-E-?dRC~_IQnZD> zTTozt3}?8*+zl zN=8~+VIYG2+PEnf>l0B0B)Jn?M)m$W>KX>OWdbZsolXZtsBN8Ccaz?j*Q{3cbn4OQmM~ z7}4rJ0!q%!qML=nl*cs#gPs3az?F-F;#eU!#7aep$vGjq}T6vhnpV8Gl(q^nP?CxGeId^ z+l7>cRYp4gyJrg>*w=36oDC5MUhsBNt1v#xIK|2NK*}wY9AQQIT&Md+tgBI92qt>z znH2#`j0=xQoz^$UJ0 z*DM*6Yqx#XbYosI&`G+SWu-hk6CJS-(X9}69T@7oqJ2HwvO~FGpfGe2>PJM#cP)f! ztwU!o2PAgCE54>PzKfyL*aON$nNZ6r$9Eq+XalJE0f$bYhfaII$>cNPf!cqcqW?Np6N7(WMS!_)12wI$v3Y|@HHOdm`Ew3dQ&Ess+XdNQ`S(IQ)I!bj^J>S@M z!ZE1h4qldVEeIQ!SU%u?qlp$DDqvJSx8fO@UymsZ+n`wjef7DZ-XTxg_)Ej;O8iQV zjUE;ufd@jx+}&p;=I6T~vvjlp!CSBGP_q)f0zY+n-jNrhHpGC^em}ONReV$2YI4efc+Z~0?Li#j@l;`k|L}qD^fEv0&KOPvtZcQs9C9-NAltX zlCkyH^KO4hSNaF=yWSm!aW+eqkr;7fK^WhOGs3_0Sp8OrWvU4EpVTmEEGfW67ZVx; zCmKs5mI2yAmT*^{%#@__?&J#(;DgPKF5(dmK~h zI2{x4n(+-SW7B%#Q$BYRoj7|t2PRkiKF)d69W9kg*V(}m$b+rfQ!3duF!c~;31^Ca zJIeW$?}h?fI*`eOb;1)hZEPlu=2B*Ro0ny7;z>q4GNqZsny2+g7In(bCsKqz#$4Ax z=i5_T5x%ABcV?NYI{f}!(IJs`p_q4vLTu?)$dzZ&{w3JTZX1tiPF&dx#30?I z9f)$>0=uKKr5{}7Og>fe&KFI_D}c?Kotx>7J>)PS+pKkY^p~|AfT2~FR0*QW5glIy z0Td?4@M)vbPMTR&z#A$^`-kIKm|i-EUjKQT8o*ILs7Q<1km#JeL&ACIDYh~d>M+rg zod?w^%ii27w~R0KfujV+yW{E_|I7(k?M`Co{s1Im?))AVQW0M~(HMpSxbfgQ$-X>} zaAHbs!ut^`2GQya8;Z*h8lOK$LKj3kjwo~wrhdpnA49`OBF)b%@U{5^4RRn6)?Q7= z(D<#Ay_bIr&^W%>Wv-Kl#6bzg;t9ea#6gG`nH~L1km;z8D-IUrBo*}{Sf!cAX6T3d z`Ln`DA)2(k!JPTQSKLguXX6#=N&x3x1CBD6tEDXk4!4f zkvy!7^0O>a4mrXpzNP1)IQ8f}wn98qd)-f}%jhweMF5!Y`N4(c7=o#u-Dtx_?BYKy?E20b#*{%j( zhO`kVF*sfyh=z-Uy$(fwWQbjHr?J+E`|kA}$stB#ht7})D`&YW`MS_?Z*5_wVn7^w zvv zGiqm^=JW#UXY9x#L(<&j0H$*gB74=xy6V*0DfwvzbKKtB-)KElv)g7V1EiYQx!`pJ zGxA$MrO_e3qh9u#zS66X7Q@^SSZRuRb07Vk+>-fAfaQGwSaj7r&nioD?+ve8xI0Bw z*bCdY81kduo&p2c9<_d)E4D#Cu*-;~M3In#a_4L-Oul0-6!u;hzNhf~0&3lqz&PN? 
zRKhi@gdhup3fKH}$h`37{SPDno~-Rr`I`AZ*t*SUH>}wu%`ks%iVO)h$ioi6knP=X z@e6n9HyY4+-n~7haHXh$gmzi@4|skwXFkU^{Q_|RDTwi`l*qgFkH{E&k&|REBu6UN z|M>O){m&U6FLBzsHja%4kf`sI8yZ#{V|uK7kXRi8{3a{fZ%c@O!CP7U2pNs20FnP< z*8kD1{ujc22*6!SfqX$m=mO@ZdD)_?^*?hB5_<*hS$W}~&i*Sz86W{KO+^ej{EJWj z`yT(m#BlG*2cO6dEIC62VSj%Y)y);yH!6h@IyBu-ufpp>-Z#IC{wtgP0`>xRCeBz$ z{rA89tVcoO$1Z~xIe)*R4bVj7+E}QS!Io3O??Qz~oH6_>kWpa)YhYAf<@|qqM;_nr zKw!F(4YVl7ARna$0ze;GUIEl&3xtL2+J^#+!m5X}qbL6u^scthjGQ?jW6G=pOP{Zd zl?<9Y^osGGTOuYyO7YtXhw;u5+a6+z*Tb=!K%eRWY5znS`1xvmz%0-qh=sN)AC$IR z3gy##wO+bZdj^m_v^ej_3a*_+by!hxRF+DUM)IHKQFp!qn34Ct7qLF~->Dv)QNjL&+UTULQF zqAoW>Vx8DW6<`wRQvk58fBkqVgu51a_JpnYYTg4{pau$s*MnGt8cJc8*7O$NrjHnP zkUX(S%ckwV?q5BiWz7X>S!*F5Jk=C{J~le?2QgfLe?Sm@=wIodz2z1_xE&ylK^r(( zmX^M~yLC7B4~R`Y8bgd>TyA3>0o!98m=W~T@etrfHlVfrjWuIsN_83rWup97YdBC@ zYq)|H`X?;2(D1*)SZEb^ry%&jr*{kAVb*%>eVbeye^v{GgXJd!J#js(5&)y;_Q1C4 zJmV6cGYVuLn;T?cAQ;iY5Hl68g$ApS5@*}JhEAyWRv#`p2Xi{ftn*Afm*gwvF;PIV zxCQ=?Lr8c}4FvMdfB;urY4o27fnqGMPTKtY!z0Re8^q5O%%*>Bk?V)!w_6b@9m zchU09yJ@^SE5PrlgJuuJ>%WbZTD+77(s4&febb-fWJE#qSGNdsZnXd+{W$}+G%Iji z|4}&yG|lSRLSp5Jy0$YOHDK_^qE_bI_LU%L=49_(eu-Z)Aciiu3_b$2h+OQ_Y!2Fir4nWcW@I7D1{P@y*mGNg@z7Q!uTi3F(V z+!~pB)}9-rBOL*gBh@ggX`I*wY07NKx_lwATb2K-`MxLHI$V-}HMsz)*wx8h16fKo zP7(%y#w|z&g1(E=OJ}fZrok1Shj960(7v4yD4|a0z2H9t)W0^x2yzMyfvG4Tan&BI z?|s&7H-?xiyWYp~+TdL&3=_+>I<~>HRyu87@=+r$@|N*<7kMfmzoS<>GYUcM@9Ug=N69H=~~psr0EZu8L@e3IRzGW}mS;!}+!o z-jCc-0Yh`!Ki<$Zgw8=${gHnvy??}c;7=9$o8mRqiEbdac2T(csgdquaVZ@+2JGd9 zr>)f>ix~As_n(YQiblSwnI zW`aJRuvwyCcAD!PT#y;aj|}p%7}9A?!5(vTIH24b0_poV0qopxg8Sspyt=--{nW@B zM;R~NyfcRvM4HwDG15puaWj@)tC(A657+^JweYnVN)>;yL~=|@Fp7U@sWy{&D}E!x zdi=Qpn;MV~p1^*7B(Q{rQGMk*Lq14)T?wARn0rm8M0x?bw9+9v9kNg8A0VA);kT`U zY}sLV30>h#;u-TK3RGA#_h0EO?CgE)05IiBcih=6f3VecxiUN-L&%UXw1fDD&$?kE zXQta<+kXHnRv)({hoqzni19}h9dF5p~>vYc-obWmVrf}OGE2^ddz;oS9DG2s`4 zLl&7J-?pQYbZk@N9U30M{5$05z;3DoVAr|RmSTryLl+=CuTM}&ug>l@7tn{F5zIn&THVkQ&{3~~n6g*f_x&7` zNN8~V;zI(_b=aN0+Uy4rg&pEq(;{IZ$#&>TJOH`qcPlK*+VK&q1P}E&-6o7~nH}5G zkHmP#av}2LcQ4}acSZFqWRaHNSbZ-v0BCUN1lh_AJ8&fOfDb z)};~S_AKn>3`25d5SRpUA+E}zLQbVf63J}9J&oAW>Kp?Z#s)_NIreEGd=kFONI-g8 zH19p`0;o7wLYRx$taEIb{`fOn5?(oxm?k?!WH9Goqmty$7l;ho5 z1z?HJmx|v_oqan#Bgmb;F1vD93H@)W_0L~s5^nHKO%vNn1XE)n`3is+L7w%|V=RJA zyB8q~H;v225nOWBq&{j18`wp(`$sgbSU1tRtvM>S-9020@z4~Xqi4b>ANITqaBAuO z5VruvE^sffY4Wn6Lb--5^w0vlg! 
zp%f}N?VSKu+h2dH??kP#P?KOZlA=7^=HbM)m`cpmN``INlfMP1gV~}byfI=l-Zi9d z=An>2{N7JV4)IjPGDkC=1fr+t;v8@6G4(j`=K#YAf8{q#sK%+KyO|a-@uKvR^Nru- zp!OXBRr;&YXHQ1OUIPQfLB`YR5JM*;c(&9tTH^@rrS^!{^-vzvCqOrL zLrZLRBILin4CUvWL-H)v``LUVwvl@E)QphIa%;$u*@`i!&RI`Fe zRea;10bdo_1jC^n5v~?8xcB1?-y8lWdR(Hhn^Z^Lqa#bHJED^-E;OuokpU_#Jlv|` z13&VYUS9%2VQC{vkHBe%qcucA>oPxc%RI!P>!ZKs3CkLxb0Z<3=Eu0P(%TF}9h*gz zkZ;IK-p38Pq_Jy`@s1Jskf}2(#%}7c|9j;gk6ff*Uv#jn<*to11Zel??MlT8J&F>n zjNZ*;at>Lc&$VE{6(syWg|sZKm&_f$kE=G~PU9dubsdODa;o`fnPCrMXh; zr|U*zgCN}%I=E9N zC{e`G%7M;=e?c3SII{0nA{f1`81`Dv3nWRX*xFr?Sl2&CE&6EXCV@@++n`y;mb{k` zmYjDml`R*-lnCuYqnHY3(^gRs7nGrUu=O+c@M!rF{~M=4)y{1R6`^k8baBr}7hyi$ zD-`lpMaL%i{Yy`gYktm-u~@3h#-xZFsvTBVn2CRSl3DWRoEjF?K1SAImNcVeEvT$k zyCi%E=gKc$t5-%H1In27>aJKJ18P6v5Mh|VGu%S(!+0q#%ZdeC0sSUYq`}UKVJ^-$ zQLDyw4#B~($F(p*K$;N}pt{^N30jjS68Zw1z+YX??Yj?r7DC|klU24|j_v*zdc?C* z29z9pH;6x;=M~%J%o`yoo{2_MPO$Rw$3q(oUsBbxU%1Aay4@2K%1sL;YeN zS`%_KrZOktIwLpoXFZa{I~DLZzvrg~xv)4ZG@rG& zuxM5axge+agvGZw37&q%xEZ&NK-X)v4B4kP3l-FFc1XBSWUY+a<;Uo}PedxPtzjLl zikvFf5k<=!D`FlQHbqlRwb?vRkTO{z^zKbxB2kjwr)Tec5yy;?En^V z_3@2vZMs`}+;I=c`y-`@bjz1N&ENNXkEH(WTgD$Z&1pzW$&1vbfx|zW>Dj5L_?MPH ztwZ{c{FKe=$1W-Yp01WEZzu0plr_O{cNmG($@fzFTZ|(N8Vi9)F$oxP*Kyus>|KdJ zlK>2Le+-(thGVDCR!B)n4CBW*3}VGcotC~g-U$4(a5vdT%F}#{z)7n_z~sPSF+RY} zpXg}8^g!LO=kNCJ0DGGm)%tl)^GMpDuE18~tzU#pa@XO3?mC-W_;a+CR zYx1C@GCWoBHr#Wm>rcR%8Ey;vz({cUla1Ms>N6F#lu>3*Q03S4`Zy1ntrK6+RNniX zBP3;MgBtu$<)M9WKyAhDf;OC?bD+#Mo2C<>=PP|Lfg;pjV#FQ0{1O3j$^8z#&SG7e<8CR`tn{+=NQx z`*5NwlxfGtCao7|42b?e3t%C=MVWl#Ly?xZ%|wQRhR&|lRGha zO$BRA=v+`wnk*x=K=w&!mvygg=)}D)RbT&P=V(MB9}suNYWx0`T#4zzWMO&O?W-?K zLAOy0dByUHx)}Z{#h&yiM)E%9$rvJ2h!uA#hM8!B-a zStA$>u%!2q>w=PP>_qsPES>Y1y0+zGPi2F^@$LeN73hL|qD22qrlJ>4`87-ttw+Q# z5Q7{0vfjITYv82kCsB6C<0WooXNbpZ{0&@r`!-XRsYgze1f*r_|T{N|NY{ za(E?B!JlGzw9R|pZtw@aN2g)qZUe1|5D$KoRmF(<0&`(`kya)2p412&8e0m0xg#QV zm1Igwd-)Eg*K-g5ilCY04^TJnsnORr?Z9c6jE55BT|jDY;O)d^2a)IxTmtVDv}%kC zyR{n=HyO-8AMa$Vc9Hltif+XXWU(ySo7uOLW!{dG&er6Pvdyl@v)BFT^>B3=qxi2= z0z&)h+qGp_{yc${fJ`9m$?=M+wpZ3E3sFE}bo&+2t(v0$usJr0xN z5Q+gY-OKyRz6*}LD`jf3xcTyIz2~7qW@hantQmvx14$Fx#WT;l-z#eE@i%WSo5((U zm7*?F9sVD*>i@nH;O|#Z=76Whvd6EbaM4FeN0lkK{on$MAY3&F7a)IX2!Vc-M9_|O z8JIDATJ3pmu}#-W_shi;gi$4H0|1~!O~lNNYnh!%@&0DSF100=Vble8eIdyq{t+yh z^__Gh6akyR%01SSXy%hhIK`Ue+`&KznYt`uJv-`f!$q)z-#!%2P`at}QiDqJlQh~M zdLH7F1CjS5MPR=qq7<^%{rC2LVny47v0%Rt!Gb)k9uA`DpKbLA?{!iLiq3radAb}| z={#MjKOQto}1>y&_;lvF!0pZN5tAsL*wOK zfea4{PbVTe1vg+C;4;4;h;TdAx0ujR^Eq5O+uYdK8H{A4keiOzGG~fUx#9Kpiq0+P za+x&%CYv!*Hn;riO?D5I(_``Z_X^b4_5YS_C^E}QYy{)on;>iG7KCC37=o$^ z{iZFtX4)s|;Ub9S+EV~m*`q^R_`fK2E*=ET`Ww!?`!!aSzBD4+68()-B8H|)gQ-O# z?K&b|9}6C zndbjR%>M@*-6;T2Zvm(vJbs|1RvTmta+Ivswz^|Q}WX;XsY;+r5lWyiul6uhni_U<*h(ZAR_5hGQZ%9+szw5jS z-1Y_MKx{e#neFL8$98eUQdOC_15y%Jfkhu(2Pph9DdAtWyOE0 za___Q1}x5GLHUrp6_COA{1!C3~o1d1vK)+oL*_(qg^xbxS z&jFiyNiP`=R{j`djZlBN?MVRTeb4-*4}hy4>=9Gvo=<3r)T-#vQQrXO; zZ|~|IW_zA&qaY!+eL)?I`q5wX9WYLgm;WN?LH>h7B4gLPate4dptRf$3_F zVj)5rmMio%Q}$-(P)Ev_KLj&_#)4mPhA-#*jqz|^iu@BUvi9*ii-UA#_P!EXUzB~bQ&1V2Ly$Oj$8lf-^8?cEq)B!lT{+NgF zOQ|PpUhWtqC)9z_Z6{2ZRLRr|10Zq4s zs5CWTLj$bU-}2wB7NgP(3h$IOc|;fpZxpD+nZ+hN2xaA zXSF5fAseRny#f#y7_tRI@ofNMVK5`O4`93hr@iwIr~3c@e@R){dmfvzC3|G=M9GY- zL)kNA6vq}FBwG$rA$w*N#~xAkJ{%(?Gb{X_uioEJ@Av2P`+ola{qgJS>Uvc!uj4hI z<8i;=Z;v|2e#Z8M{L~kO!RyP~847Jrpi)xn1qaZ=U=w2s+A5`jLFz<$J+xy0szkT) zfoIc3o!1n!s+fX|WFw%IG_=_b)yW%yH#%-pTee>wa1GQuMNwZsw}~I4Y%AuV_e4{p zuEg}vV|KP~s2T;;JbZw3o)}!%+%te&j@aX$Uah56Xcs zRL~jZ3?sj!VY~{gpc??GkSfXP26z@tmo3!CfG{+FQ6-t2VO*&1%`woY_R=>ey0?p3 zZvTkpd%w8|DULP3uKn6@d6exYgwHO%;dmIx!VKJr^{6kyv3(GMqQkvkYS|0m@ZXV6FUmd! 
zc5gXPS-PzNGsog8X#}}PkXy#s)xshQm{Di-n^1a~5)6oc00nhlI5BB8@RCui*^)vW!KP+wu`Nm4w*Q0pMU9 z7-pzv=xHkBLfM!pph+(@>bHI$&JcQGQ4>orD%K$qaNu?}eV)>|`ytP(fzAo~cbC9G z$wY?V`q)$;BusL9qNFq)<1urbK6?7r5@V9o$bWO(Y3$2ifFT~&6V0D`*^GK~Nw;sh zLk4TtNXAo$f3@+9{VJ=i6}aCe`?5Q*hX-xf0AWQG@2hy#1LWwu57}jScAyWWM@p#(sSr7V z4sPxstB>TUOmb#|SyQ0ICrg2icM4X(46}X~ruL;7+_J$yFdjAq$!>m$6;_EO$XN$t zaQwyE%5v<6()x{Bjysgf#wuedOF#~ZuvAaSPfOxC8v^RmdTXz-8kKUMa^y#V9K#bj zJtQ539VO5(bd6fQNYfu7#i7WMA{Bs5oGp0G~_U@-JhKb2akT}VJ zx#BvaM?ph%bT(Sm7GRAxp6y${h5MaD`(x z?G6wS)kFqYD|VrF1r^iMcWO+p>~X%l54~^N|I(wx?5KaA#YXcuL&kiWzcs}(VakQ( zb@4zaDu?el6j55P|MnG_Y-b2I^%xfLZf5b9oOD|!?Vm1H1+N;7&>J>bWJLRMDpaNE z6;6I;jJFNmt!u)qJzRRttmpv5R=#&kCDgJG!szUREdO^7j~nHg9`5=__KLPv>P#x)Kv2W9v)P<2R(dm5 zB}uhbd0j2CmF!6uqq@Tz-3G`8&dU1-fw&a%7Ch5Rk3=`E`YvaFl2 z2(P_toV?y{Phh2%U;Zde?QXkvK3GC}=&-utq~vrvHWvkV?KkX8l$_jdPOvvPvUk^7 zws!oTRb|9`{MjtH$kM-r^gQAj5Qcf)$EM$Onc4Cf2f4Rj{l0emhUP;+mF(hMT+*%0(QBLw?) z<1J)c^-n{?a@cNJ2i&CmG}iviUljd(Z4`3 ze$@KYbwuvdpr}zc$r#lRBE?I3{FE&%+aMH|L26my?OU54b>FGP+o^~ct->f|2C7NA zX_!M>rIcd6<;=Byw7zV@X3(`yBR_dE2H`?jbEPLeqW!8Y)sRasix?9YX+SsQ|A3ut z^1fc&N3Axu_rxj>8crO(zUS!|iQT-ZluUjXSQ(Ct@fR9y;3}h+GQhqy6je>b$@x z3!MKXB1s`OrbrwozvN(I_5B0are9>mw|uUGFfD_+mWTEQ3nk0pMZ>N%^Yh=F9>Oh(5B7hyDF zpdz0__o4{jcP1NV9PUw~ouh$``vs{FLN89g<&+@z_5@$f^-jUiIlDb z$F2b;N5aAxG-Yn0^-O65c`tr$+)^f6e1c?HtSEj6T}Rv_eXj6FG=UxX5uC1!YFR=u z62GEF&jx7QskOrs$Xz0GFSh65RNAp<1=;Q5Ss|E==_+hrhp8St`vg;3l@v-5Ay<5m z?pz&}IC{Hiq>Lc#-Q+xk=x#knpumYnPPu0&AVOa^@(;?QuAqd&cL_rK?`t<1@F z^}fzfx1tlKv=&~d7w~GQ(+i+fF#l#C~m|NH3yX91-ikJUfL|~%^9~Krboe`8P2$>7(_h~jb z=#hL`y&r8BQ&__z@MZ{tI|UOe84ToYiG|=IbRv2bt~w?zwB={eOVmnAUu(+nMd3J} zZkfux*$JrI!Q~c{U0-ornbE$9x2ABSRRM(LsU6kvYM%pZJM+WxzVe4-w!_1ndv4g6 z>;k9StyZM_Q$D@hU)n=F`)e^jvjdcp)`cvOGHlkBDfF@;Ig8VRIM+Il?dc}{Ojk44 z!VAP6*wu<}F)SY4JmoF^dBFFL)_Jpa{HR?jL2PoSvtYK?&oe98A0mpMek+6{+O@4A`+r35lfjzBbh7%ELWpk!j?%W}7 z*U8#(SF~RS56z1JiMLLk1iP_HNn|$<6e0({=q#|Db0u#8>7h?Q^jkjh|3W@e5r$J> zgF?%vjCoqW;$xI{mZ;B%aSLZ+fr{qBt!4yztZ`4jSK4)S$=Jl6Yj@2wkV-F- zf4nSlVgJ^zrcY$1bw<~~*ijVc-^5{%ir;y?Q;uieW`u87gZ(p0-6!6dbFpujMX^lt zL6=JOHZt$qsNu8juNN)XNRtR)ntE-cc%3cGsx7&GNOmIjM;=(ej?x?Iku&8@3Uj`` z8bC04KY^@Yf+=J(X~F**cK|8*2!U(=0~2Cg6s5cb4c8=x##l2-XjX|+e7MSW@MUd@ z3`HEF{^bVCIKe5U-8PFz>sLfbU&F|oN42DblnA9fpGPYf7K^fzqE0nwP_+w6oVL^` zqB^p=AQ}|W)Ju$<^i!E3@{_NTYmxA`kuQN$Cc-$fVR}B0-LBz&puNnVdc^<`MI@T0 zd086CbVV3P$^C}fj>CzN%jRqpALcy<$O;F>P63B+nKi{FkEoT)YEw5DOFZX%NtxvHW^LkPur*p6CaYAS*~ zGWe`+54rKc9D3TF0lRK3&+F%sMlHXJg8zIys(>MEB|8k3=m3vb+Eu|-#Axx+3DT=XCh7`f(W{@o;`OV(=Xk7YUAjym7xiqkOu&D_ zBr<5ItygjuRdDO{V)Kgn%?5!qWd4$JQJ$q34OjoONqciCOQnkM0YcZi+1Z3Ql*JNn zk}E9805qS{Zk~*wx@9ixXV{9G^v2ymL)p!$Kwy&9HfgV z=RwkrsjOf>_{f{4MXU9z-<5NX-R`N8o}2f4c_!!6$TOSUpiV3mzk^0i{J8QRm_b2a z0y!FUHcds~+vj*k0L%hjj5xf_b!8d^alv`TpdWZm2H>gM?$X zfJY*+1hs(r)@OGq+`POn9Naxsmz*y4Eqlsj_7@E|8co`tHSQ53ImOT^oeO=gAeBgn zq&2FQvWS~9B&t-FL&kVEWQDCn8JGv}B61xVVW}tE5yY%&b)$)rF%~DwZwx~$MM|VI6)!7yB9Hm$k8jl#mt;*}J-yz<_GY4rQ7%Wv z=Yz(L&aa0r?csf^W-RURE9AuzRCy6xX9b*kSa?8ub%N@)?;fa5zS_gBm54$a5Dxaf z2;=o=JA+3zd^4RD*@J490tDErVqf49Q7)nktK2##;*7XGnnaWfs#)Z!>6#&QnFOa` z# z5#nBwiCw4(^yMi^^l=d4!;0|31Q)g)-KR6TB%aB7GYRo;YwTzmwAMS-Q88M7AP(GwV2cbVulO@Aj7(+1Ps=CeMeY8aAAzE_;`))lHj>xvv5R;#C!H zCv%$|VY45>VYF;Z^q2C=>PHWu|Cq*)%__KHtz}*T`TA)1{MBpA$1PGWhSOh72#>VV zWJ;Vom>S=$LmH(hkFloSPup75<@c*=*LAzHa49uF319btU%}YRDRnG~c8_!WZhw2C z4R9LvX^y?CHyf9(coF{BSw}=HEq6>RWNa zBU6$|j!c#gSa0PyPSwK&lW{$f(#Yjd1uXUHU@Jc*$^Cr?8*6J6bQ>?fNyLLXL93)OPQ=I3h6ROZL_IRu!o?w-5PX2-KgYhHN ziB>ZNB9Jcek_uax3)txDP_Ly7q7kK~x)h`dn?i&!n-*4)4}9B)CR5-vu9BC_UQ>va 
[GIT binary patch data omitted -- base85-encoded binary image content, not readable as text]

Date: Thu, 5 Sep 2019 11:14:03 +0700
Subject: [PATCH 46/73] Automatically build/deploy documentation (#267) (build
 and deploy docs)

* add a script to clean up docs directory.
  For example, delete all files and subfolders in thainlp.org/pythainlp/docs/2.1

* add appveyor.docs.yml (for building docs)

* show branch name and full version at the document page
---
 appveyor.docs.yml       | 58 +++++++++++++++++++++++++++++++++++++++++
 docs/clean_directory.sh | 54 ++++++++++++++++++++++++++++++++++++++
 docs/conf.py            | 23 +++++++++++++---
 3 files changed, 132 insertions(+), 3 deletions(-)
 create mode 100644 appveyor.docs.yml
 create mode 100644 docs/clean_directory.sh

diff --git a/appveyor.docs.yml b/appveyor.docs.yml
new file mode 100644
index 000000000..595f3d06c
--- /dev/null
+++ b/appveyor.docs.yml
@@ -0,0 +1,58 @@
+image: ubuntu1604
+
+branches:
+  only:
+    - /2.*/
+    - dev
+
+skip_commits:
+  message: /(skip ci docs)/  # Skip a new build if message contains '(skip ci docs)'
+
+install:
+  - sudo add-apt-repository ppa:jonathonf/python-3.6 -y
+  - sudo apt-get update
+  - sudo apt install -y python3.6
+  - sudo apt install -y python3.6-dev
+  - sudo apt install -y python3.6-venv
+  - wget https://bootstrap.pypa.io/get-pip.py
+  - sudo python3.6 get-pip.py
+  - sudo ln -s /usr/bin/python3.6 /usr/local/bin/python
+  - sudo apt-get install -y pandoc libicu-dev
+  - python -V
+  - python3 -V
+  - pip -V
+  - sudo pip install -r requirements.txt
+  - export LD_LIBRARY_PATH=/usr/local/lib
+  - sudo pip install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+  - sudo pip install sphinx sphinx-rtd-theme typing artagger deepcut epitran keras numpy pyicu sklearn-crfsuite tensorflow ssg emoji pandas
+  - sudo pip install --upgrade gensim smart_open boto
+
+# configuration for deploy mode, commit message with /(build and deploy docs)/
+# 1. build documents and upload HTML files to Appveyor's storage
+# 2. upload to thainlp.org/pythainlp/docs/
+
+only_commits:
+  message: /(build and deploy docs)/
+
+build_script:
+  - cd ./docs
+  - export CURRENT_BRANCH=$APPVEYOR_REPO_BRANCH
+  - export RELEASE=$(git describe --tags --always)
+  - export RELEASE=$(echo $RELEASE | cut -d'-' -f1)
+  - export TODAY=$(date +'%Y-%m-%d')
+  - make html
+  - echo "Done building HTML files for the branch -- $APPVEYOR_REPO_BRANCH"
+  - echo "Start cleaning the directory /docs/$APPVEYOR_REPO_BRANCH"
+  - sudo bash ./clean_directory.sh $FTP_USER $FTP_PASSWORD $FTP_HOST $APPVEYOR_REPO_BRANCH
+  - echo "Start Uploading files to thainlp.org/pythainlp/docs/$APPVEYOR_REPO_BRANCH"
+  - cd ./_build/html
+  - echo "cd to ./_build/html"
+  - find . -type f -name "*" -print -exec curl --ftp-create-dir --ipv4 -T {} ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_HOST}/public_html/pythainlp/docs/$APPVEYOR_REPO_BRANCH/{} \;
+  - echo "Done uploading"
+  - echo "Done uploading files to -- thainlp.org/pythainlp/docs/$APPVEYOR_REPO_BRANCH"
+
+test: off
+
+artifacts:
+  - path: ./docs/_build/html
+    name: document
\ No newline at end of file
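For readers who find the curl one-liners above hard to follow: the clean-up step that build_script invokes (clean_directory.sh, added next in this patch) is essentially a recursive delete over FTP. A minimal Python sketch of the same idea, using only the standard-library ftplib, could look roughly as follows. This is an illustration, not code from the patch; the host, credentials, and branch name are placeholders:

    from ftplib import FTP, error_perm

    def clean_directory(ftp: FTP, path: str) -> None:
        """Delete every file and subfolder below `path`; keep `path` itself."""
        for name in ftp.nlst(path):
            base = name.rsplit("/", 1)[-1]
            if base in (".", ".."):
                continue
            # Some servers return bare names instead of full paths; join if needed.
            entry = name if "/" in name else "{}/{}".format(path, name)
            try:
                ftp.cwd(entry)        # succeeding means `entry` is a directory
                is_dir = True
                ftp.cwd("/")          # step back out before recursing
            except error_perm:
                is_dir = False
            if is_dir:
                clean_directory(ftp, entry)   # empty the subfolder first
                ftp.rmd(entry)                # then remove it
            else:
                ftp.delete(entry)             # plain file: delete it

    ftp = FTP("FTP_HOST")                     # placeholder server and credentials
    ftp.login("FTP_USER", "FTP_PASSWORD")
    clean_directory(ftp, "public_html/pythainlp/docs/dev")
    ftp.quit()

The Bash implementation that the build actually runs follows next; it uses curl -Q "DELE ..." and curl -Q "RMD ..." instead of ftplib, but the overall effect is the same: every file and subfolder under docs/<branch> is removed.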
diff --git a/docs/clean_directory.sh b/docs/clean_directory.sh
new file mode 100644
index 000000000..126ee4774
--- /dev/null
+++ b/docs/clean_directory.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Delete all files and folders in the directory: /pythainlp/docs/
+
+# $1 : FTP_USER
+# $2 : FTP_PASSWORD
+# $3 : FTP_HOST
+# $4 : Branch name
+
+FTP_USER=$1
+FTP_PASSWORD=$2
+FTP_HOST=$3
+BRANCH_NAME=$4
+
+remove_all_files()
+{
+    # DIRECTORY=$1
+    echo "delete files in: $1"
+    for f in `curl --list-only --ftp-create-dirs --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST/$1/`; do
+        if [[ -d "$f" ]] || [[ "$f" = _* ]] || [[ "$f" = .doctree ]] || [[ "$f" != *"."* ]]; then
+            echo "--- deleting files in folder: $1/$f";
+            remove_all_files $1/$f
+        else
+            echo "delete a file: $f"
+            curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "DELE $1/$f"
+        fi
+    done
+}
+
+remove_empty_folders()
+{
+
+    echo "delete empty folders in: $1"
+    for f in `curl --list-only --ftp-create-dirs --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST/$1/`; do
+        if [[ -d "$f" ]] || [[ "$f" = _* ]] || [[ "$f" = fonts ]] || [[ "$f" = pythainlp ]] || [[ "$f" = .doctree ]] || [[ "$f" != *"."* ]]; then
+            echo "--- deleting folders in: $1/$f";
+            remove_empty_folders $1/$f
+            curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "RMD $1/$f"
+        else
+            echo "delete a folder: $f"
+            curl --ipv4 ftp://$FTP_USER:$FTP_PASSWORD@$FTP_HOST -Q "RMD $1/$f"
+        fi
+    done
+}
+
+echo "Start removing all files within 'public_html/pythainlp/docs/$BRANCH_NAME/'";
+
+remove_all_files public_html/pythainlp/docs/$BRANCH_NAME;
+
+echo "Start removing all empty folders within 'public_html/pythainlp/docs/$BRANCH_NAME/'";
+
+remove_empty_folders public_html/pythainlp/docs/$BRANCH_NAME;
+
+echo "Done";
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 9a9e47343..a43b593a0 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -11,6 +11,7 @@
 #
 import os
 import sys
+import traceback
 sys.path.insert(0, os.path.abspath('..'))

 from datetime import datetime
@@ -23,10 +24,24 @@
 curyear = datetime.today().year
 copyright = u'2017-%s, %s (Apache Software License 2.0)' % (curyear, project)

+# -- Get version information and date from Git ----------------------------
+
+try:
+    from subprocess import check_output, STDOUT
+    current_branch = os.environ["CURRENT_BRANCH"] if "CURRENT_BRANCH" in os.environ else check_output(['git', 'symbolic-ref', 'HEAD'], shell=False, stderr=STDOUT).decode().strip().split('/')[-1]
+    release = os.environ["RELEASE"] if "RELEASE" in os.environ else check_output(['git', 'describe', '--tags', '--always'], shell=False, stderr=STDOUT).decode().strip().split('-')[0]
+    today = os.environ["TODAY"] if "TODAY" in os.environ else check_output(['git', 'show', '-s', '--format=%ad', '--date=short'], shell=False, stderr=STDOUT).decode().strip()
+except Exception as e:
+    traceback.print_exc()
+    release = ''
+    today = ''
+    current_branch = ''
+
 # The short X.Y version
-version = '2.0'
+version = '{} ({})
Published date: {}'.format(current_branch, release, today) + # The full version, including alpha/beta/rc tags -release = '2.0.3' +release = release # -- General configuration --------------------------------------------------- @@ -88,7 +103,9 @@ # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'display_version': True, +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, From bb2fc3346b890f23543d0e27483f96d0bb155763 Mon Sep 17 00:00:00 2001 From: Chakri Lowphansirikul Date: Thu, 5 Sep 2019 21:35:06 +0700 Subject: [PATCH 47/73] Update appveyor.docs.yml (build and deploy docs) --- appveyor.docs.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/appveyor.docs.yml b/appveyor.docs.yml index 595f3d06c..12f512c06 100644 --- a/appveyor.docs.yml +++ b/appveyor.docs.yml @@ -51,8 +51,12 @@ build_script: - echo "Done uploading" - echo "Done uploading files to -- thainlp.org/pythainlp/docs/$APPVEYOR_REPO_BRANCH" -test: off - artifacts: - path: ./docs/_build/html - name: document \ No newline at end of file + name: document + +after_build: + - echo "Done build and deploy" + - appveyor exit + +test: off From 74357b6faf8d48d4397074be3c9bc605bf5330ff Mon Sep 17 00:00:00 2001 From: heytitle Date: Sat, 7 Sep 2019 00:03:29 +0200 Subject: [PATCH 48/73] refactor correctly tokenized word counting code --- pythainlp/benchmarks/word_tokenisation.py | 40 +++++------------------ tests/data/sentences.yml | 6 +++- tests/test_benchmarks.py | 9 +++-- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/pythainlp/benchmarks/word_tokenisation.py b/pythainlp/benchmarks/word_tokenisation.py index 75e074b84..d222e42cf 100644 --- a/pythainlp/benchmarks/word_tokenisation.py +++ b/pythainlp/benchmarks/word_tokenisation.py @@ -173,16 +173,9 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: c_f1 = _f1(c_precision, c_recall) # Compute word-level statistics - word_boundaries = _find_word_boudaries(ref_sample) - - correctly_tokenised_words = _count_correctly_tokenised_words( - sample, - word_boundaries - ) - w_precision = correctly_tokenised_words / np.sum(sample) - w_recall = correctly_tokenised_words / np.sum(ref_sample) - w_f1 = _f1(w_precision, w_recall) + # Find correctly tokenized words in the reference sample + word_boundaries = _find_word_boudaries(ref_sample) # Find correctly tokenized words in the sample ss_boundaries = _find_word_boudaries(sample) @@ -191,6 +184,12 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: ss_boundaries ) + correctly_tokenised_words = np.sum(tokenisation_indicators) + + w_precision = correctly_tokenised_words / np.sum(sample) + w_recall = correctly_tokenised_words / np.sum(ref_sample) + w_f1 = _f1(w_precision, w_recall) + tokenisation_indicators = list( map(lambda x: str(x), tokenisation_indicators) ) @@ -267,29 +266,6 @@ def _find_word_boudaries(bin_reps) -> list: return list(zip(start_idx, end_idx)) -def _count_correctly_tokenised_words(bin_reps, word_boundaries) -> list: - """ - Count how many words are tokenized correctly - - :param str bin_reps: binary representation of a text - :param list[tuple(int, int)] word_boundaries: list of when each word starts and ends - - :return: no. 
correctly tokenized words - :rtype: int - """ - count = 0 - for st, end in word_boundaries: - pend = min(end, bin_reps.shape[0]) - if (bin_reps[st] == 1 and np.sum(bin_reps[st+1:pend]) == 0) \ - and ( - (pend == bin_reps.shape[0]) or - (pend != bin_reps.shape[0] and bin_reps[pend] == 1) - ): - count = count + 1 - - return count - - def _find_words_correctly_tokenised( ref_boundaries: list, predicted_boundaries: list diff --git a/tests/data/sentences.yml b/tests/data/sentences.yml index 6d913a38d..3c8a0dcc9 100644 --- a/tests/data/sentences.yml +++ b/tests/data/sentences.yml @@ -39,4 +39,8 @@ binary_sentences: - expected: "10001010" actual: "10101000" - expected_count: 0 \ No newline at end of file + expected_count: 0 + - + expected: "10101001000" # "ฝน|ตก|ที่|ทะเล + actual: "10001001010" # "ฝนตก|ที่|ทะ|เล" + expected_count: 1 \ No newline at end of file diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index aad63fd76..304524223 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -59,10 +59,15 @@ def test_count_correctly_tokenised_words(self): sample = np.array(list(d['actual'])).astype(int) ref_sample = np.array(list(d['expected'])).astype(int) - wb = list(word_tokenisation._find_word_boudaries(ref_sample)) + sb = list(word_tokenisation._find_word_boudaries(sample)) + rb = list(word_tokenisation._find_word_boudaries(ref_sample)) + + # in binary [{0, 1}, ...] + correctly_tokenized_words = word_tokenisation\ + ._find_words_correctly_tokenised(rb, sb) self.assertEqual( - word_tokenisation._count_correctly_tokenised_words(sample, wb), + np.sum(correctly_tokenized_words), d['expected_count'] ) From c91dde6fa2c607d835c8df4cb1fc61a1344c7f8a Mon Sep 17 00:00:00 2001 From: heytitle Date: Sat, 7 Sep 2019 00:03:45 +0200 Subject: [PATCH 49/73] better description for a cli param --- bin/word-tokenization-benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/word-tokenization-benchmark b/bin/word-tokenization-benchmark index 0193926d7..43b4b1ce1 100644 --- a/bin/word-tokenization-benchmark +++ b/bin/word-tokenization-benchmark @@ -21,7 +21,7 @@ parser.add_argument( parser.add_argument( "--test-file", action="store", - help="path to test file" + help="path to test file, i.e. 
ground truth"
 )
 
 parser.add_argument(

From 86b384ed918aadbe262c93ff8cb3059fd2af4b45 Mon Sep 17 00:00:00 2001
From: heytitle
Date: Sat, 7 Sep 2019 00:05:07 +0200
Subject: [PATCH 50/73] update tokenization benchmark figure

---
 docs/images/evaluation.png | Bin 87242 -> 83855 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/docs/images/evaluation.png b/docs/images/evaluation.png
index 170197d773dad7742cc6e6c441bcc9c8fae6d3d2..3ac1cd13e9c7b45142913506c422dacba386126f 100644
GIT binary patch
literal 83855
[base85-encoded binary image data omitted -- docs/images/evaluation.png, the word-tokenization benchmark figure]
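Since the updated benchmark figure cannot be reproduced here as text, a small worked example may help illustrate the word-level scoring that PATCH 48 refactors and that the new case in tests/data/sentences.yml exercises. The sketch below is an illustration only (the helper name find_word_boundaries is made up); it mirrors the boundary-matching idea of _find_word_boudaries and _find_words_correctly_tokenised rather than calling the library:

    # A "1" marks the first character of a word; a word counts as correctly
    # tokenized only when its (start, end) span appears in both the reference
    # and the predicted segmentation.

    def find_word_boundaries(binary: str):
        """Return the (start, end) character spans of the words in a binary string."""
        starts = [i for i, c in enumerate(binary) if c == "1"]
        ends = starts[1:] + [len(binary)]
        return list(zip(starts, ends))

    ref = "10101001000"   # "ฝน|ตก|ที่|ทะเล" -- 4 reference words
    pred = "10001001010"  # "ฝนตก|ที่|ทะ|เล" -- 4 predicted words

    ref_spans = set(find_word_boundaries(ref))    # {(0, 2), (2, 4), (4, 7), (7, 11)}
    pred_spans = set(find_word_boundaries(pred))  # {(0, 4), (4, 7), (7, 9), (9, 11)}

    correct = len(ref_spans & pred_spans)         # only (4, 7), i.e. "ที่" -> 1

    w_precision = correct / len(pred_spans)       # 0.25
    w_recall = correct / len(ref_spans)           # 0.25
    print(correct, w_precision, w_recall)         # 1 0.25 0.25

This matches the expected_count of 1 for the new sentences.yml entry and the way compute_stats() now derives w_precision and w_recall from np.sum(tokenisation_indicators) after the refactor.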
