Merge pull request #886 from bact/dev
Add license info to /tests and README_TH.md
bact committed Dec 10, 2023
2 parents 1f4a39e + 723ba01 commit 75bbc07
Showing 23 changed files with 305 additions and 209 deletions.
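
Most of the 23 files changed here receive the same two-line SPDX/REUSE header, shown file by file in the diffs below. As a compact reference, a test module under `/tests` now opens like this (a sketch; the docstring and placeholder test case are illustrative, not taken from any one file):

```python
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Unit tests for a PyThaiNLP module (illustrative docstring).
"""
import unittest


class TestPlaceholder(unittest.TestCase):
    # Placeholder test so the sketch is a runnable, self-contained module.
    def test_truth(self):
        self.assertTrue(True)
```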
46 changes: 39 additions & 7 deletions README_TH.md
@@ -123,13 +123,11 @@ thainlp help

## การอ้างอิง

ถ้าคุณใช้ `PyThaiNLP` ในโปรเจคหรืองานวิจัยของคุณ คุณสามารถอ้างอิงได้ตามนี้
หากคุณใช้ซอฟต์แวร์ `PyThaiNLP` ในโครงงานหรืองานวิจัยของคุณ คุณสามารถอ้างอิงได้ตามนี้

```
Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. http://doi.org/10.5281/zenodo.3519354
```

หรือ BibTeX entry:
โดยสามารถใช้ BibTeX นี้:

``` bib
@misc{pythainlp,
@@ -143,6 +141,40 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur
}
```

บทความของเราในงานประชุมวิชาการ [NLP-OSS 2023](https://nlposs.github.io/2023/):

Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, Pattarawat Chormai, Peerat Limkonchotiwat, Thanathip Suntorntip, and Can Udomcharoenchaikit. 2023. [PyThaiNLP: Thai Natural Language Processing in Python.](https://aclanthology.org/2023.nlposs-1.4) In Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023), pages 25–36, Singapore, Singapore. Empirical Methods in Natural Language Processing.

โดยสามารถใช้ BibTeX นี้:

```bib
@inproceedings{phatthiyaphaibun-etal-2023-pythainlp,
title = "{P}y{T}hai{NLP}: {T}hai Natural Language Processing in Python",
author = "Phatthiyaphaibun, Wannaphong and
Chaovavanich, Korakot and
Polpanumas, Charin and
Suriyawongkul, Arthit and
Lowphansirikul, Lalita and
Chormai, Pattarawat and
Limkonchotiwat, Peerat and
Suntorntip, Thanathip and
Udomcharoenchaikit, Can",
editor = "Tan, Liling and
Milajevs, Dmitrijs and
Chauhan, Geeticka and
Gwinnup, Jeremy and
Rippeth, Elijah",
booktitle = "Proceedings of the 3rd Workshop for Natural Language Processing Open Source Software (NLP-OSS 2023)",
month = dec,
year = "2023",
address = "Singapore, Singapore",
publisher = "Empirical Methods in Natural Language Processing",
url = "https://aclanthology.org/2023.nlposs-1.4",
pages = "25--36",
abstract = "We present PyThaiNLP, a free and open-source natural language processing (NLP) library for Thai language implemented in Python. It provides a wide range of software, models, and datasets for Thai language. We first provide a brief historical context of tools for Thai language prior to the development of PyThaiNLP. We then outline the functionalities it provided as well as datasets and pre-trained language models. We later summarize its development milestones and discuss our experience during its development. We conclude by demonstrating how industrial and research communities utilize PyThaiNLP in their work. The library is freely available at https://github.com/pythainlp/pythainlp.",
}
```

## ร่วมสนับสนุน PyThaiNLP

- กรุณา fork แล้วพัฒนาต่อ จากนั้นสร้าง pull request กลับมา :)
@@ -157,10 +189,10 @@ Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Sur

| | สัญญาอนุญาต |
|:---|:----|
| PyThaiNLP Source Code and Notebooks | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) |
| Corpora, datasets, and documentations created by PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)|
| ต้นรหัสซอร์สโค้ดและโน๊ตบุ๊กของ PyThaiNLP | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) |
| ฐานข้อมูลภาษา ชุดข้อมูล และเอกสารที่สร้างโดยโครงการ PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)|
| Language models created by PyThaiNLP | [Creative Commons Attribution 4.0 International Public License (CC-by)](https://creativecommons.org/licenses/by/4.0/) |
| Other corpora and models that may included with PyThaiNLP | See [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) |
| สำหรับฐานข้อมูลภาษาและโมเดลอื่นที่อาจมาพร้อมกับซอฟต์แวร์ PyThaiNLP | ดู [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) |


## บัตรโมเดล
33 changes: 19 additions & 14 deletions pythainlp/tag/pos_tag.py
@@ -4,7 +4,6 @@
from typing import List, Tuple



def pos_tag(
words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
@@ -169,10 +168,10 @@ def pos_tag_sents(


def pos_tag_transformers(
sentence: str,
sentence: str,
engine: str = "bert",
corpus: str = "blackboard",
)->List[List[Tuple[str, str]]]:
) -> List[List[Tuple[str, str]]]:
"""
Marks sentences with part-of-speech (POS) tags.
@@ -202,29 +201,33 @@ def pos_tag_transformers(
"""

try:
from transformers import AutoModelForTokenClassification, \
AutoTokenizer, TokenClassificationPipeline
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
TokenClassificationPipeline,
)
except ImportError:
raise ImportError(
"Not found transformers! Please install transformers by pip install transformers")
"Not found transformers! Please install transformers by pip install transformers"
)

if not sentence:
return []

_blackboard_support_engine = {
"bert" : "lunarlist/pos_thai",
"bert": "lunarlist/pos_thai",
}

_pud_support_engine = {
"wangchanberta" : "Pavarissy/wangchanberta-ud-thai-pud-upos",
"mdeberta" : "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
"wangchanberta": "Pavarissy/wangchanberta-ud-thai-pud-upos",
"mdeberta": "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
}

if corpus == 'blackboard' and engine in _blackboard_support_engine.keys():
if corpus == "blackboard" and engine in _blackboard_support_engine.keys():
base_model = _blackboard_support_engine.get(engine)
model = AutoModelForTokenClassification.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)
elif corpus == 'pud' and engine in _pud_support_engine.keys():
elif corpus == "pud" and engine in _pud_support_engine.keys():
base_model = _pud_support_engine.get(engine)
model = AutoModelForTokenClassification.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)
@@ -235,8 +238,10 @@ def pos_tag_transformers(
)
)

pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")
pipeline = TokenClassificationPipeline(
model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)

outputs = pipeline(sentence)
word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
return word_tags
word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]
return word_tags
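
For reference, here is a minimal usage sketch of the function reformatted above. It relies only on the signature, the engine/corpus tables, and the pipeline call visible in this diff; the example sentence is illustrative, and running it requires the optional `transformers` dependency (plus a backend such as PyTorch) and downloads the `lunarlist/pos_thai` model on first use.

```python
# Sketch: tag one Thai sentence with the "bert" engine mapped to the
# "blackboard" corpus in the tables above. Requires `pip install transformers`.
from pythainlp.tag.pos_tag import pos_tag_transformers

sentence = "ผมรักประเทศไทย"  # illustrative input sentence
word_tags = pos_tag_transformers(sentence, engine="bert", corpus="blackboard")
# Return shape follows the annotated type List[List[Tuple[str, str]]]:
# an outer list holding one list of (word, POS tag) pairs.
print(word_tags)
```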
2 changes: 2 additions & 0 deletions tests/__init__.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Unit test.
28 changes: 15 additions & 13 deletions tests/test_ancient.py
@@ -1,20 +1,22 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
import unittest
from pythainlp.ancient import aksonhan_to_current


class TestAncientPackage(unittest.TestCase):
def test_aksonhan_to_current(self):
self.assertEqual(aksonhan_to_current("ก"), 'ก')
self.assertEqual(aksonhan_to_current("กก"), 'กก')
self.assertEqual(aksonhan_to_current("ถนน"), 'ถนน')
self.assertEqual(aksonhan_to_current("จกก"), 'จัก')
self.assertEqual(aksonhan_to_current("ดง่ง"), 'ดั่ง')
self.assertEqual(aksonhan_to_current("นน้น"), 'นั้น')
self.assertEqual(aksonhan_to_current("ขดด"), 'ขัด')
self.assertEqual(aksonhan_to_current("ตรสส"), 'ตรัส')
self.assertEqual(aksonhan_to_current("ขบบ"), 'ขับ')
self.assertEqual(aksonhan_to_current("วนน"), 'วัน')
self.assertEqual(aksonhan_to_current("หลงง"), 'หลัง')
self.assertEqual(aksonhan_to_current("บงงคบบ"), 'บังคับ')
self.assertEqual(aksonhan_to_current("สรรเพชญ"), 'สรรเพชญ')
self.assertEqual(aksonhan_to_current("ก"), "ก")
self.assertEqual(aksonhan_to_current("กก"), "กก")
self.assertEqual(aksonhan_to_current("ถนน"), "ถนน")
self.assertEqual(aksonhan_to_current("จกก"), "จัก")
self.assertEqual(aksonhan_to_current("ดง่ง"), "ดั่ง")
self.assertEqual(aksonhan_to_current("นน้น"), "นั้น")
self.assertEqual(aksonhan_to_current("ขดด"), "ขัด")
self.assertEqual(aksonhan_to_current("ตรสส"), "ตรัส")
self.assertEqual(aksonhan_to_current("ขบบ"), "ขับ")
self.assertEqual(aksonhan_to_current("วนน"), "วัน")
self.assertEqual(aksonhan_to_current("หลงง"), "หลัง")
self.assertEqual(aksonhan_to_current("บงงคบบ"), "บังคับ")
self.assertEqual(aksonhan_to_current("สรรเพชญ"), "สรรเพชญ")
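
The assertions above double as usage documentation for `aksonhan_to_current`, which rewrites Akson Han style spellings into modern Thai orthography. A minimal sketch, using only inputs and expected outputs copied from the test:

```python
# Sketch: normalize two Akson Han spellings; expected results are taken
# verbatim from test_aksonhan_to_current above.
from pythainlp.ancient import aksonhan_to_current

print(aksonhan_to_current("จกก"))     # -> "จัก"
print(aksonhan_to_current("บงงคบบ"))  # -> "บังคับ"
```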
2 changes: 2 additions & 0 deletions tests/test_augment.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest
import nltk
8 changes: 6 additions & 2 deletions tests/test_benchmarks.py
@@ -1,3 +1,7 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

import numpy as np
@@ -63,8 +67,8 @@ def test_count_correctly_tokenised_words(self):
rb = list(word_tokenization._find_word_boundaries(ref_sample))

# in binary [{0, 1}, ...]
correctly_tokenized_words = word_tokenization._find_words_correctly_tokenised(
rb, sb
correctly_tokenized_words = (
word_tokenization._find_words_correctly_tokenised(rb, sb)
)

self.assertEqual(
3 changes: 3 additions & 0 deletions tests/test_classify.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest
from pythainlp.classify import GzipModel

8 changes: 4 additions & 4 deletions tests/test_cli.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest
from argparse import ArgumentError
@@ -41,7 +43,7 @@ def test_cli_benchmark(self):
"./tests/data/input.txt",
"--test-file",
"./tests/data/test.txt",
"--save-details"
"--save-details",
]
)
)
@@ -117,9 +119,7 @@ def test_cli_tokenize(self):
self.assertEqual(ex.exception.code, 2)

self.assertIsNotNone(
cli.tokenize.App(
["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"]
)
cli.tokenize.App(["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"])
)
self.assertIsNotNone(
cli.tokenize.App(
2 changes: 2 additions & 0 deletions tests/test_coref.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest
from pythainlp.coref import coreference_resolution
29 changes: 17 additions & 12 deletions tests/test_corpus.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import os
import unittest
@@ -23,14 +25,15 @@
thai_icu_words,
thai_male_names,
thai_negations,
thai_orst_words,
thai_stopwords,
thai_syllables,
thai_synonym,
thai_synonyms,
thai_volubilis_words,
thai_wikipedia_titles,
thai_words,
tnc,
ttc,
volubilis,
wikipedia_titles,
wordnet,
)
from pythainlp.corpus.util import revise_newmm_default_wordset
@@ -41,24 +44,26 @@ def test_conceptnet(self):
self.assertIsNotNone(conceptnet.edges("รัก"))

def test_corpus(self):
self.assertIsInstance(thai_icu_words(), frozenset)
self.assertGreater(len(thai_icu_words()), 0)
self.assertIsInstance(thai_negations(), frozenset)
self.assertGreater(len(thai_negations()), 0)
self.assertIsInstance(thai_stopwords(), frozenset)
self.assertGreater(len(thai_stopwords()), 0)
self.assertIsInstance(thai_syllables(), frozenset)
self.assertGreater(len(thai_syllables()), 0)
self.assertIsInstance(thai_synonym(), dict)
self.assertGreater(len(thai_synonym()), 0)
self.assertIsInstance(thai_synonyms(), dict)
self.assertGreater(len(thai_synonyms()), 0)

self.assertIsInstance(thai_icu_words(), frozenset)
self.assertGreater(len(thai_icu_words()), 0)
self.assertIsInstance(thai_orst_words(), frozenset)
self.assertGreater(len(thai_orst_words()), 0)
self.assertIsInstance(thai_volubilis_words(), frozenset)
self.assertGreater(len(thai_volubilis_words()), 0)
self.assertIsInstance(thai_wikipedia_titles(), frozenset)
self.assertGreater(len(thai_wikipedia_titles()), 0)
self.assertIsInstance(thai_words(), frozenset)
self.assertGreater(len(thai_words()), 0)

self.assertIsInstance(volubilis(), frozenset)
self.assertGreater(len(volubilis()), 0)
self.assertIsInstance(wikipedia_titles(), frozenset)
self.assertGreater(len(wikipedia_titles()), 0)

self.assertIsInstance(countries(), frozenset)
self.assertGreater(len(countries()), 0)
self.assertIsInstance(provinces(), frozenset)
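
The import list above reflects this release's renamed corpus accessors (`thai_synonym` to `thai_synonyms`, `volubilis` to `thai_volubilis_words`, `wikipedia_titles` to `thai_wikipedia_titles`) plus the newly imported `thai_orst_words`. A minimal sketch of the call pattern the updated assertions check — each word-list accessor returns a non-empty `frozenset`, while `thai_synonyms()` returns a `dict`; corpus data may need to be downloaded on first use:

```python
# Sketch based on the assertions above: word-list accessors return frozensets,
# thai_synonyms() returns a dict (its key/value layout is not shown in this diff).
from pythainlp.corpus import thai_orst_words, thai_synonyms, thai_volubilis_words

orst_words = thai_orst_words()
print(type(orst_words), len(orst_words))   # frozenset, length > 0 per the test
print(type(thai_volubilis_words()))        # frozenset
print(type(thai_synonyms()))               # dict
```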
3 changes: 3 additions & 0 deletions tests/test_el.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest
from pythainlp.el import EntityLinker

2 changes: 2 additions & 0 deletions tests/test_generate.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import unittest

